All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] nfsd: don't hold i_mutex over userspace upcalls
@ 2016-01-07 21:08 J. Bruce Fields
  0 siblings, 0 replies; 10+ messages in thread
From: J. Bruce Fields @ 2016-01-07 21:08 UTC (permalink / raw)
  To: Alexander Viro; +Cc: linux-nfs, linux-fsdevel

From: NeilBrown <neilb@suse.de>

We need information about exports when crossing mountpoints during
lookup or NFSv4 readdir.  If we don't already have that information
cached, we may have to ask (and wait for) rpc.mountd.

In both cases we currently hold the i_mutex on the parent of the
directory we're asking rpc.mountd about.  We've seen situations where
rpc.mountd performs some operation on that directory that tries to take
the i_mutex again, resulting in deadlock.

With some care, we may be able to avoid that in rpc.mountd.  But it
seems better just to avoid holding a mutex while waiting on userspace.

It appears that lookup_one_len is pretty much the only operation that
needs the i_mutex.  So we could just drop the i_mutex elsewhere and do
something like

	mutex_lock()
	lookup_one_len()
	mutex_unlock()

In many cases though the lookup would have been cached and not required
the i_mutex, so it's more efficient to create a lookup_one_len() variant
that only takes the i_mutex when necessary.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/namei.c            | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/nfsd/nfs3xdr.c     |  2 +-
 fs/nfsd/nfs4xdr.c     |  8 +++---
 fs/nfsd/vfs.c         | 23 +++++++---------
 include/linux/namei.h |  1 +
 5 files changed, 89 insertions(+), 19 deletions(-)

Could we get this reviewed for 4.5?

--b.

diff --git a/fs/namei.c b/fs/namei.c
index d84d7c7515fc..174ef4f106cd 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2279,6 +2279,8 @@ EXPORT_SYMBOL(vfs_path_lookup);
  *
  * Note that this routine is purely a helper for filesystem usage and should
  * not be called by generic code.
+ *
+ * The caller must hold base->i_mutex.
  */
 struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 {
@@ -2322,6 +2324,78 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 }
 EXPORT_SYMBOL(lookup_one_len);
 
+/**
+ * lookup_one_len_unlocked - filesystem helper to lookup single pathname component
+ * @name:	pathname component to lookup
+ * @base:	base directory to lookup from
+ * @len:	maximum length @len should be interpreted to
+ *
+ * Note that this routine is purely a helper for filesystem usage and should
+ * not be called by generic code.
+ *
+ * Unlike lookup_one_len, it should be called without the parent
+ * i_mutex held, and will take the i_mutex itself if necessary.
+ */
+struct dentry *lookup_one_len_unlocked(const char *name,
+				       struct dentry *base, int len)
+{
+	struct qstr this;
+	unsigned int c;
+	int err;
+	struct dentry *ret;
+
+	this.name = name;
+	this.len = len;
+	this.hash = full_name_hash(name, len);
+	if (!len)
+		return ERR_PTR(-EACCES);
+
+	if (unlikely(name[0] == '.')) {
+		if (len < 2 || (len == 2 && name[1] == '.'))
+			return ERR_PTR(-EACCES);
+	}
+
+	while (len--) {
+		c = *(const unsigned char *)name++;
+		if (c == '/' || c == '\0')
+			return ERR_PTR(-EACCES);
+	}
+	/*
+	 * See if the low-level filesystem might want
+	 * to use its own hash..
+	 */
+	if (base->d_flags & DCACHE_OP_HASH) {
+		int err = base->d_op->d_hash(base, &this);
+		if (err < 0)
+			return ERR_PTR(err);
+	}
+
+	err = inode_permission(base->d_inode, MAY_EXEC);
+	if (err)
+		return ERR_PTR(err);
+
+	ret = __d_lookup(base, &this);
+	if (ret)
+		return ret;
+	/*
+	 * __d_lookup() is used to try to get a quick answer and avoid the
+	 * mutex.  A false-negative does no harm.
+	 */
+	ret = __d_lookup(base, &this);
+	if (ret && ret->d_flags & DCACHE_OP_REVALIDATE) {
+		dput(ret);
+		ret = NULL;
+	}
+	if (ret)
+		return ret;
+
+	mutex_lock(&base->d_inode->i_mutex);
+	ret =  __lookup_hash(&this, base, 0);
+	mutex_unlock(&base->d_inode->i_mutex);
+	return ret;
+}
+EXPORT_SYMBOL(lookup_one_len_unlocked);
+
 int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
 		 struct path *path, int *empty)
 {
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 00575d776d91..2246454dec76 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -823,7 +823,7 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
 		} else
 			dchild = dget(dparent);
 	} else
-		dchild = lookup_one_len(name, dparent, namlen);
+		dchild = lookup_one_len_unlocked(name, dparent, namlen);
 	if (IS_ERR(dchild))
 		return rv;
 	if (d_mountpoint(dchild))
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 51c9e9ca39a4..325521ce389a 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2838,14 +2838,14 @@ nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd,
 	__be32 nfserr;
 	int ignore_crossmnt = 0;
 
-	dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen);
+	dentry = lookup_one_len_unlocked(name, cd->rd_fhp->fh_dentry, namlen);
 	if (IS_ERR(dentry))
 		return nfserrno(PTR_ERR(dentry));
 	if (d_really_is_negative(dentry)) {
 		/*
-		 * nfsd_buffered_readdir drops the i_mutex between
-		 * readdir and calling this callback, leaving a window
-		 * where this directory entry could have gone away.
+		 * we're not holding the i_mutex here, so there's
+		 * a window where this directory entry could have gone
+		 * away.
 		 */
 		dput(dentry);
 		return nfserr_noent;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 994d66fbb446..4212aaacbb55 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -217,10 +217,16 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		host_err = PTR_ERR(dentry);
 		if (IS_ERR(dentry))
 			goto out_nfserr;
-		/*
-		 * check if we have crossed a mount point ...
-		 */
 		if (nfsd_mountpoint(dentry, exp)) {
+			/*
+			 * We don't need the i_mutex after all.  It's
+			 * still possible we could open this (regular
+			 * files can be mountpoints too), but the
+			 * i_mutex is just there to prevent renames of
+			 * something that we might be about to delegate,
+			 * and a mountpoint won't be renamed:
+			 */
+			fh_unlock(fhp);
 			if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) {
 				dput(dentry);
 				goto out_nfserr;
@@ -1809,7 +1815,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
 	offset = *offsetp;
 
 	while (1) {
-		struct inode *dir_inode = file_inode(file);
 		unsigned int reclen;
 
 		cdp->err = nfserr_eof; /* will be cleared on successful read */
@@ -1828,15 +1833,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
 		if (!size)
 			break;
 
-		/*
-		 * Various filldir functions may end up calling back into
-		 * lookup_one_len() and the file system's ->lookup() method.
-		 * These expect i_mutex to be held, as it would within readdir.
-		 */
-		host_err = mutex_lock_killable(&dir_inode->i_mutex);
-		if (host_err)
-			break;
-
 		de = (struct buffered_dirent *)buf.dirent;
 		while (size > 0) {
 			offset = de->offset;
@@ -1853,7 +1849,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
 			size -= reclen;
 			de = (struct buffered_dirent *)((char *)de + reclen);
 		}
-		mutex_unlock(&dir_inode->i_mutex);
 		if (size > 0) /* We bailed out early */
 			break;
 
diff --git a/include/linux/namei.h b/include/linux/namei.h
index d8c6334cd150..d0f25d81b46a 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -77,6 +77,7 @@ extern struct dentry *kern_path_locked(const char *, struct path *);
 extern int kern_path_mountpoint(int, const char *, struct path *, unsigned int);
 
 extern struct dentry *lookup_one_len(const char *, struct dentry *, int);
+extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int);
 
 extern int follow_down_one(struct path *);
 extern int follow_down(struct path *);
-- 
2.5.0


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH] nfsd: don't hold i_mutex over userspace upcalls
  2015-08-18 19:10                           ` J. Bruce Fields
@ 2015-11-12 21:22                               ` J. Bruce Fields
  -1 siblings, 0 replies; 10+ messages in thread
From: J. Bruce Fields @ 2015-11-12 21:22 UTC (permalink / raw)
  To: Al Viro
  Cc: linux-fsdevel-u79uwXL29TY76Z2rM5mHXA, NeilBrown, Kinglong Mee,
	linux-nfs-u79uwXL29TY76Z2rM5mHXA

From: NeilBrown <neilb-l3A5Bk7waGM@public.gmane.org>
Subject: [PATCH] nfsd: don't hold i_mutex over userspace upcalls

We need information about exports when crossing mountpoints during
lookup or NFSv4 readdir.  If we don't already have that information
cached, we may have to ask (and wait for) rpc.mountd.

In both cases we currently hold the i_mutex on the parent of the
directory we're asking rpc.mountd about.  We've seen situations where
rpc.mountd performs some operation on that directory that tries to take
the i_mutex again, resulting in deadlock.

With some care, we may be able to avoid that in rpc.mountd.  But it
seems better just to avoid holding a mutex while waiting on userspace.

It appears that lookup_one_len is pretty much the only operation that
needs the i_mutex.  So we could just drop the i_mutex elsewhere and do
something like

	mutex_lock()
	lookup_one_len()
	mutex_unlock()

In many cases though the lookup would have been cached and not required
the i_mutex, so it's more efficient to create a lookup_one_len() variant
that only takes the i_mutex when necessary.

Signed-off-by: NeilBrown <neilb-l3A5Bk7waGM@public.gmane.org>
Signed-off-by: J. Bruce Fields <bfields-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
---
 fs/namei.c            | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/nfsd/nfs3xdr.c     |  2 +-
 fs/nfsd/nfs4xdr.c     |  8 +++---
 fs/nfsd/vfs.c         | 23 +++++++---------
 include/linux/namei.h |  1 +
 5 files changed, 89 insertions(+), 19 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 726d211db484..d68c21fe4598 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2278,6 +2278,8 @@ EXPORT_SYMBOL(vfs_path_lookup);
  *
  * Note that this routine is purely a helper for filesystem usage and should
  * not be called by generic code.
+ *
+ * The caller must hold base->i_mutex.
  */
 struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 {
@@ -2321,6 +2323,78 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 }
 EXPORT_SYMBOL(lookup_one_len);
 
+/**
+ * lookup_one_len_unlocked - filesystem helper to lookup single pathname component
+ * @name:	pathname component to lookup
+ * @base:	base directory to lookup from
+ * @len:	maximum length @len should be interpreted to
+ *
+ * Note that this routine is purely a helper for filesystem usage and should
+ * not be called by generic code.
+ *
+ * Unlike lookup_one_len, it should be called without the parent
+ * i_mutex held, and will take the i_mutex itself if necessary.
+ */
+struct dentry *lookup_one_len_unlocked(const char *name,
+				       struct dentry *base, int len)
+{
+	struct qstr this;
+	unsigned int c;
+	int err;
+	struct dentry *ret;
+
+	this.name = name;
+	this.len = len;
+	this.hash = full_name_hash(name, len);
+	if (!len)
+		return ERR_PTR(-EACCES);
+
+	if (unlikely(name[0] == '.')) {
+		if (len < 2 || (len == 2 && name[1] == '.'))
+			return ERR_PTR(-EACCES);
+	}
+
+	while (len--) {
+		c = *(const unsigned char *)name++;
+		if (c == '/' || c == '\0')
+			return ERR_PTR(-EACCES);
+	}
+	/*
+	 * See if the low-level filesystem might want
+	 * to use its own hash..
+	 */
+	if (base->d_flags & DCACHE_OP_HASH) {
+		int err = base->d_op->d_hash(base, &this);
+		if (err < 0)
+			return ERR_PTR(err);
+	}
+
+	err = inode_permission(base->d_inode, MAY_EXEC);
+	if (err)
+		return ERR_PTR(err);
+
+	ret = __d_lookup(base, &this);
+	if (ret)
+		return ret;
+	/*
+	 * __d_lookup() is used to try to get a quick answer and avoid the
+	 * mutex.  A false-negative does no harm.
+	 */
+	ret = __d_lookup(base, &this);
+	if (ret && ret->d_flags & DCACHE_OP_REVALIDATE) {
+		dput(ret);
+		ret = NULL;
+	}
+	if (ret)
+		return ret;
+
+	mutex_lock(&base->d_inode->i_mutex);
+	ret =  __lookup_hash(&this, base, 0);
+	mutex_unlock(&base->d_inode->i_mutex);
+	return ret;
+}
+EXPORT_SYMBOL(lookup_one_len_unlocked);
+
 int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
 		 struct path *path, int *empty)
 {
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 00575d776d91..2246454dec76 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -823,7 +823,7 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
 		} else
 			dchild = dget(dparent);
 	} else
-		dchild = lookup_one_len(name, dparent, namlen);
+		dchild = lookup_one_len_unlocked(name, dparent, namlen);
 	if (IS_ERR(dchild))
 		return rv;
 	if (d_mountpoint(dchild))
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 51c9e9ca39a4..325521ce389a 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2838,14 +2838,14 @@ nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd,
 	__be32 nfserr;
 	int ignore_crossmnt = 0;
 
-	dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen);
+	dentry = lookup_one_len_unlocked(name, cd->rd_fhp->fh_dentry, namlen);
 	if (IS_ERR(dentry))
 		return nfserrno(PTR_ERR(dentry));
 	if (d_really_is_negative(dentry)) {
 		/*
-		 * nfsd_buffered_readdir drops the i_mutex between
-		 * readdir and calling this callback, leaving a window
-		 * where this directory entry could have gone away.
+		 * we're not holding the i_mutex here, so there's
+		 * a window where this directory entry could have gone
+		 * away.
 		 */
 		dput(dentry);
 		return nfserr_noent;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 994d66fbb446..4212aaacbb55 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -217,10 +217,16 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		host_err = PTR_ERR(dentry);
 		if (IS_ERR(dentry))
 			goto out_nfserr;
-		/*
-		 * check if we have crossed a mount point ...
-		 */
 		if (nfsd_mountpoint(dentry, exp)) {
+			/*
+			 * We don't need the i_mutex after all.  It's
+			 * still possible we could open this (regular
+			 * files can be mountpoints too), but the
+			 * i_mutex is just there to prevent renames of
+			 * something that we might be about to delegate,
+			 * and a mountpoint won't be renamed:
+			 */
+			fh_unlock(fhp);
 			if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) {
 				dput(dentry);
 				goto out_nfserr;
@@ -1809,7 +1815,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
 	offset = *offsetp;
 
 	while (1) {
-		struct inode *dir_inode = file_inode(file);
 		unsigned int reclen;
 
 		cdp->err = nfserr_eof; /* will be cleared on successful read */
@@ -1828,15 +1833,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
 		if (!size)
 			break;
 
-		/*
-		 * Various filldir functions may end up calling back into
-		 * lookup_one_len() and the file system's ->lookup() method.
-		 * These expect i_mutex to be held, as it would within readdir.
-		 */
-		host_err = mutex_lock_killable(&dir_inode->i_mutex);
-		if (host_err)
-			break;
-
 		de = (struct buffered_dirent *)buf.dirent;
 		while (size > 0) {
 			offset = de->offset;
@@ -1853,7 +1849,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
 			size -= reclen;
 			de = (struct buffered_dirent *)((char *)de + reclen);
 		}
-		mutex_unlock(&dir_inode->i_mutex);
 		if (size > 0) /* We bailed out early */
 			break;
 
diff --git a/include/linux/namei.h b/include/linux/namei.h
index d8c6334cd150..d0f25d81b46a 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -77,6 +77,7 @@ extern struct dentry *kern_path_locked(const char *, struct path *);
 extern int kern_path_mountpoint(int, const char *, struct path *, unsigned int);
 
 extern struct dentry *lookup_one_len(const char *, struct dentry *, int);
+extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int);
 
 extern int follow_down_one(struct path *);
 extern int follow_down(struct path *);
-- 
2.5.0

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH] nfsd: don't hold i_mutex over userspace upcalls
@ 2015-11-12 21:22                               ` J. Bruce Fields
  0 siblings, 0 replies; 10+ messages in thread
From: J. Bruce Fields @ 2015-11-12 21:22 UTC (permalink / raw)
  To: Al Viro; +Cc: linux-fsdevel, NeilBrown, Kinglong Mee, linux-nfs

From: NeilBrown <neilb@suse.de>
Subject: [PATCH] nfsd: don't hold i_mutex over userspace upcalls

We need information about exports when crossing mountpoints during
lookup or NFSv4 readdir.  If we don't already have that information
cached, we may have to ask (and wait for) rpc.mountd.

In both cases we currently hold the i_mutex on the parent of the
directory we're asking rpc.mountd about.  We've seen situations where
rpc.mountd performs some operation on that directory that tries to take
the i_mutex again, resulting in deadlock.

With some care, we may be able to avoid that in rpc.mountd.  But it
seems better just to avoid holding a mutex while waiting on userspace.

It appears that lookup_one_len is pretty much the only operation that
needs the i_mutex.  So we could just drop the i_mutex elsewhere and do
something like

	mutex_lock()
	lookup_one_len()
	mutex_unlock()

In many cases though the lookup would have been cached and not required
the i_mutex, so it's more efficient to create a lookup_one_len() variant
that only takes the i_mutex when necessary.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/namei.c            | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/nfsd/nfs3xdr.c     |  2 +-
 fs/nfsd/nfs4xdr.c     |  8 +++---
 fs/nfsd/vfs.c         | 23 +++++++---------
 include/linux/namei.h |  1 +
 5 files changed, 89 insertions(+), 19 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 726d211db484..d68c21fe4598 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2278,6 +2278,8 @@ EXPORT_SYMBOL(vfs_path_lookup);
  *
  * Note that this routine is purely a helper for filesystem usage and should
  * not be called by generic code.
+ *
+ * The caller must hold base->i_mutex.
  */
 struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 {
@@ -2321,6 +2323,78 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 }
 EXPORT_SYMBOL(lookup_one_len);
 
+/**
+ * lookup_one_len_unlocked - filesystem helper to lookup single pathname component
+ * @name:	pathname component to lookup
+ * @base:	base directory to lookup from
+ * @len:	maximum length @len should be interpreted to
+ *
+ * Note that this routine is purely a helper for filesystem usage and should
+ * not be called by generic code.
+ *
+ * Unlike lookup_one_len, it should be called without the parent
+ * i_mutex held, and will take the i_mutex itself if necessary.
+ */
+struct dentry *lookup_one_len_unlocked(const char *name,
+				       struct dentry *base, int len)
+{
+	struct qstr this;
+	unsigned int c;
+	int err;
+	struct dentry *ret;
+
+	this.name = name;
+	this.len = len;
+	this.hash = full_name_hash(name, len);
+	if (!len)
+		return ERR_PTR(-EACCES);
+
+	if (unlikely(name[0] == '.')) {
+		if (len < 2 || (len == 2 && name[1] == '.'))
+			return ERR_PTR(-EACCES);
+	}
+
+	while (len--) {
+		c = *(const unsigned char *)name++;
+		if (c == '/' || c == '\0')
+			return ERR_PTR(-EACCES);
+	}
+	/*
+	 * See if the low-level filesystem might want
+	 * to use its own hash..
+	 */
+	if (base->d_flags & DCACHE_OP_HASH) {
+		int err = base->d_op->d_hash(base, &this);
+		if (err < 0)
+			return ERR_PTR(err);
+	}
+
+	err = inode_permission(base->d_inode, MAY_EXEC);
+	if (err)
+		return ERR_PTR(err);
+
+	ret = __d_lookup(base, &this);
+	if (ret)
+		return ret;
+	/*
+	 * __d_lookup() is used to try to get a quick answer and avoid the
+	 * mutex.  A false-negative does no harm.
+	 */
+	ret = __d_lookup(base, &this);
+	if (ret && ret->d_flags & DCACHE_OP_REVALIDATE) {
+		dput(ret);
+		ret = NULL;
+	}
+	if (ret)
+		return ret;
+
+	mutex_lock(&base->d_inode->i_mutex);
+	ret =  __lookup_hash(&this, base, 0);
+	mutex_unlock(&base->d_inode->i_mutex);
+	return ret;
+}
+EXPORT_SYMBOL(lookup_one_len_unlocked);
+
 int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
 		 struct path *path, int *empty)
 {
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 00575d776d91..2246454dec76 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -823,7 +823,7 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
 		} else
 			dchild = dget(dparent);
 	} else
-		dchild = lookup_one_len(name, dparent, namlen);
+		dchild = lookup_one_len_unlocked(name, dparent, namlen);
 	if (IS_ERR(dchild))
 		return rv;
 	if (d_mountpoint(dchild))
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 51c9e9ca39a4..325521ce389a 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2838,14 +2838,14 @@ nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd,
 	__be32 nfserr;
 	int ignore_crossmnt = 0;
 
-	dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen);
+	dentry = lookup_one_len_unlocked(name, cd->rd_fhp->fh_dentry, namlen);
 	if (IS_ERR(dentry))
 		return nfserrno(PTR_ERR(dentry));
 	if (d_really_is_negative(dentry)) {
 		/*
-		 * nfsd_buffered_readdir drops the i_mutex between
-		 * readdir and calling this callback, leaving a window
-		 * where this directory entry could have gone away.
+		 * we're not holding the i_mutex here, so there's
+		 * a window where this directory entry could have gone
+		 * away.
 		 */
 		dput(dentry);
 		return nfserr_noent;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 994d66fbb446..4212aaacbb55 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -217,10 +217,16 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		host_err = PTR_ERR(dentry);
 		if (IS_ERR(dentry))
 			goto out_nfserr;
-		/*
-		 * check if we have crossed a mount point ...
-		 */
 		if (nfsd_mountpoint(dentry, exp)) {
+			/*
+			 * We don't need the i_mutex after all.  It's
+			 * still possible we could open this (regular
+			 * files can be mountpoints too), but the
+			 * i_mutex is just there to prevent renames of
+			 * something that we might be about to delegate,
+			 * and a mountpoint won't be renamed:
+			 */
+			fh_unlock(fhp);
 			if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) {
 				dput(dentry);
 				goto out_nfserr;
@@ -1809,7 +1815,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
 	offset = *offsetp;
 
 	while (1) {
-		struct inode *dir_inode = file_inode(file);
 		unsigned int reclen;
 
 		cdp->err = nfserr_eof; /* will be cleared on successful read */
@@ -1828,15 +1833,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
 		if (!size)
 			break;
 
-		/*
-		 * Various filldir functions may end up calling back into
-		 * lookup_one_len() and the file system's ->lookup() method.
-		 * These expect i_mutex to be held, as it would within readdir.
-		 */
-		host_err = mutex_lock_killable(&dir_inode->i_mutex);
-		if (host_err)
-			break;
-
 		de = (struct buffered_dirent *)buf.dirent;
 		while (size > 0) {
 			offset = de->offset;
@@ -1853,7 +1849,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
 			size -= reclen;
 			de = (struct buffered_dirent *)((char *)de + reclen);
 		}
-		mutex_unlock(&dir_inode->i_mutex);
 		if (size > 0) /* We bailed out early */
 			break;
 
diff --git a/include/linux/namei.h b/include/linux/namei.h
index d8c6334cd150..d0f25d81b46a 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -77,6 +77,7 @@ extern struct dentry *kern_path_locked(const char *, struct path *);
 extern int kern_path_mountpoint(int, const char *, struct path *, unsigned int);
 
 extern struct dentry *lookup_one_len(const char *, struct dentry *, int);
+extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int);
 
 extern int follow_down_one(struct path *);
 extern int follow_down(struct path *);
-- 
2.5.0


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH] nfsd: don't hold i_mutex over userspace upcalls
  2015-06-03 15:18                     ` J. Bruce Fields
@ 2015-08-18 19:10                           ` J. Bruce Fields
  0 siblings, 0 replies; 10+ messages in thread
From: J. Bruce Fields @ 2015-08-18 19:10 UTC (permalink / raw)
  To: NeilBrown
  Cc: Kinglong Mee, linux-nfs-u79uwXL29TY76Z2rM5mHXA, Al Viro,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA

Al, can I get an ACK/NACK for this?

--b.

commit 5494e10316e3
Author: NeilBrown <neilb-l3A5Bk7waGM@public.gmane.org>
Date:   Fri May 1 11:53:26 2015 +1000

    nfsd: don't hold i_mutex over userspace upcalls
    
    We need information about exports when crossing mountpoints during
    lookup or NFSv4 readdir.  If we don't already have that information
    cached, we may have to ask (and wait for) rpc.mountd.
    
    In both cases we currently hold the i_mutex on the parent of the
    directory we're asking rpc.mountd about.  We've seen situations where
    rpc.mountd performs some operation on that directory that tries to take
    the i_mutex again, resulting in deadlock.
    
    With some care, we may be able to avoid that in rpc.mountd.  But it
    seems better just to avoid holding a mutex while waiting on userspace.
    
    It appears that lookup_one_len is pretty much the only operation that
    needs the i_mutex.  So we could just drop the i_mutex elsewhere and do
    something like
    
    	mutex_lock()
    	lookup_one_len()
    	mutex_unlock()
    
    In many cases though the lookup would have been cached and not required
    the i_mutex, so it's more efficient to create a lookup_one_len() variant
    that only takes the i_mutex when necessary.
    
    Signed-off-by: NeilBrown <neilb-l3A5Bk7waGM@public.gmane.org>
    Signed-off-by: J. Bruce Fields <bfields-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>

diff --git a/fs/namei.c b/fs/namei.c
index ae4e4c18b2ac..1627d0302a59 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2250,6 +2250,8 @@ EXPORT_SYMBOL(vfs_path_lookup);
  *
  * Note that this routine is purely a helper for filesystem usage and should
  * not be called by generic code.
+ *
+ * The caller must hold base->i_mutex.
  */
 struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 {
@@ -2293,6 +2295,78 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 }
 EXPORT_SYMBOL(lookup_one_len);
 
+/**
+ * lookup_one_len_unlocked - filesystem helper to lookup single pathname component
+ * @name:	pathname component to lookup
+ * @base:	base directory to lookup from
+ * @len:	maximum length @len should be interpreted to
+ *
+ * Note that this routine is purely a helper for filesystem usage and should
+ * not be called by generic code.
+ *
+ * Unlike lookup_one_len, it should be called without the parent
+ * i_mutex held, and will take the i_mutex itself if necessary.
+ */
+struct dentry *lookup_one_len_unlocked(const char *name,
+				       struct dentry *base, int len)
+{
+	struct qstr this;
+	unsigned int c;
+	int err;
+	struct dentry *ret;
+
+	this.name = name;
+	this.len = len;
+	this.hash = full_name_hash(name, len);
+	if (!len)
+		return ERR_PTR(-EACCES);
+
+	if (unlikely(name[0] == '.')) {
+		if (len < 2 || (len == 2 && name[1] == '.'))
+			return ERR_PTR(-EACCES);
+	}
+
+	while (len--) {
+		c = *(const unsigned char *)name++;
+		if (c == '/' || c == '\0')
+			return ERR_PTR(-EACCES);
+	}
+	/*
+	 * See if the low-level filesystem might want
+	 * to use its own hash..
+	 */
+	if (base->d_flags & DCACHE_OP_HASH) {
+		int err = base->d_op->d_hash(base, &this);
+		if (err < 0)
+			return ERR_PTR(err);
+	}
+
+	err = inode_permission(base->d_inode, MAY_EXEC);
+	if (err)
+		return ERR_PTR(err);
+
+	ret = __d_lookup(base, &this);
+	if (ret)
+		return ret;
+	/*
+	 * __d_lookup() is used to try to get a quick answer and avoid the
+	 * mutex.  A false-negative does no harm.
+	 */
+	ret = __d_lookup(base, &this);
+	if (ret && ret->d_flags & DCACHE_OP_REVALIDATE) {
+		dput(ret);
+		ret = NULL;
+	}
+	if (ret)
+		return ret;
+
+	mutex_lock(&base->d_inode->i_mutex);
+	ret =  __lookup_hash(&this, base, 0);
+	mutex_unlock(&base->d_inode->i_mutex);
+	return ret;
+}
+EXPORT_SYMBOL(lookup_one_len_unlocked);
+
 int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
 		 struct path *path, int *empty)
 {
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index f6e7cbabac5a..01dcd494f781 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -823,7 +823,7 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
 		} else
 			dchild = dget(dparent);
 	} else
-		dchild = lookup_one_len(name, dparent, namlen);
+		dchild = lookup_one_len_unlocked(name, dparent, namlen);
 	if (IS_ERR(dchild))
 		return rv;
 	if (d_mountpoint(dchild))
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 51c9e9ca39a4..325521ce389a 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2838,14 +2838,14 @@ nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd,
 	__be32 nfserr;
 	int ignore_crossmnt = 0;
 
-	dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen);
+	dentry = lookup_one_len_unlocked(name, cd->rd_fhp->fh_dentry, namlen);
 	if (IS_ERR(dentry))
 		return nfserrno(PTR_ERR(dentry));
 	if (d_really_is_negative(dentry)) {
 		/*
-		 * nfsd_buffered_readdir drops the i_mutex between
-		 * readdir and calling this callback, leaving a window
-		 * where this directory entry could have gone away.
+		 * we're not holding the i_mutex here, so there's
+		 * a window where this directory entry could have gone
+		 * away.
 		 */
 		dput(dentry);
 		return nfserr_noent;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 45c04979e7b3..2ea6c6a37364 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -217,10 +217,16 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		host_err = PTR_ERR(dentry);
 		if (IS_ERR(dentry))
 			goto out_nfserr;
-		/*
-		 * check if we have crossed a mount point ...
-		 */
 		if (nfsd_mountpoint(dentry, exp)) {
+			/*
+			 * We don't need the i_mutex after all.  It's
+			 * still possible we could open this (regular
+			 * files can be mountpoints too), but the
+			 * i_mutex is just there to prevent renames of
+			 * something that we might be about to delegate,
+			 * and a mountpoint won't be renamed:
+			 */
+			fh_unlock(fhp);
 			if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) {
 				dput(dentry);
 				goto out_nfserr;
@@ -1809,7 +1815,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
 	offset = *offsetp;
 
 	while (1) {
-		struct inode *dir_inode = file_inode(file);
 		unsigned int reclen;
 
 		cdp->err = nfserr_eof; /* will be cleared on successful read */
@@ -1828,15 +1833,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
 		if (!size)
 			break;
 
-		/*
-		 * Various filldir functions may end up calling back into
-		 * lookup_one_len() and the file system's ->lookup() method.
-		 * These expect i_mutex to be held, as it would within readdir.
-		 */
-		host_err = mutex_lock_killable(&dir_inode->i_mutex);
-		if (host_err)
-			break;
-
 		de = (struct buffered_dirent *)buf.dirent;
 		while (size > 0) {
 			offset = de->offset;
@@ -1853,7 +1849,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
 			size -= reclen;
 			de = (struct buffered_dirent *)((char *)de + reclen);
 		}
-		mutex_unlock(&dir_inode->i_mutex);
 		if (size > 0) /* We bailed out early */
 			break;
 
diff --git a/include/linux/namei.h b/include/linux/namei.h
index d8c6334cd150..d0f25d81b46a 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -77,6 +77,7 @@ extern struct dentry *kern_path_locked(const char *, struct path *);
 extern int kern_path_mountpoint(int, const char *, struct path *, unsigned int);
 
 extern struct dentry *lookup_one_len(const char *, struct dentry *, int);
+extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int);
 
 extern int follow_down_one(struct path *);
 extern int follow_down(struct path *);
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH] nfsd: don't hold i_mutex over userspace upcalls
@ 2015-08-18 19:10                           ` J. Bruce Fields
  0 siblings, 0 replies; 10+ messages in thread
From: J. Bruce Fields @ 2015-08-18 19:10 UTC (permalink / raw)
  To: NeilBrown; +Cc: Kinglong Mee, linux-nfs, Al Viro, linux-fsdevel

Al, can I get an ACK/NACK for this?

--b.

commit 5494e10316e3
Author: NeilBrown <neilb@suse.de>
Date:   Fri May 1 11:53:26 2015 +1000

    nfsd: don't hold i_mutex over userspace upcalls
    
    We need information about exports when crossing mountpoints during
    lookup or NFSv4 readdir.  If we don't already have that information
    cached, we may have to ask (and wait for) rpc.mountd.
    
    In both cases we currently hold the i_mutex on the parent of the
    directory we're asking rpc.mountd about.  We've seen situations where
    rpc.mountd performs some operation on that directory that tries to take
    the i_mutex again, resulting in deadlock.
    
    With some care, we may be able to avoid that in rpc.mountd.  But it
    seems better just to avoid holding a mutex while waiting on userspace.
    
    It appears that lookup_one_len is pretty much the only operation that
    needs the i_mutex.  So we could just drop the i_mutex elsewhere and do
    something like
    
    	mutex_lock()
    	lookup_one_len()
    	mutex_unlock()
    
    In many cases though the lookup would have been cached and not required
    the i_mutex, so it's more efficient to create a lookup_one_len() variant
    that only takes the i_mutex when necessary.
    
    Signed-off-by: NeilBrown <neilb@suse.de>
    Signed-off-by: J. Bruce Fields <bfields@redhat.com>

diff --git a/fs/namei.c b/fs/namei.c
index ae4e4c18b2ac..1627d0302a59 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2250,6 +2250,8 @@ EXPORT_SYMBOL(vfs_path_lookup);
  *
  * Note that this routine is purely a helper for filesystem usage and should
  * not be called by generic code.
+ *
+ * The caller must hold base->i_mutex.
  */
 struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 {
@@ -2293,6 +2295,78 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 }
 EXPORT_SYMBOL(lookup_one_len);
 
+/**
+ * lookup_one_len_unlocked - filesystem helper to lookup single pathname component
+ * @name:	pathname component to lookup
+ * @base:	base directory to lookup from
+ * @len:	maximum length @len should be interpreted to
+ *
+ * Note that this routine is purely a helper for filesystem usage and should
+ * not be called by generic code.
+ *
+ * Unlike lookup_one_len, it should be called without the parent
+ * i_mutex held, and will take the i_mutex itself if necessary.
+ */
+struct dentry *lookup_one_len_unlocked(const char *name,
+				       struct dentry *base, int len)
+{
+	struct qstr this;
+	unsigned int c;
+	int err;
+	struct dentry *ret;
+
+	this.name = name;
+	this.len = len;
+	this.hash = full_name_hash(name, len);
+	if (!len)
+		return ERR_PTR(-EACCES);
+
+	if (unlikely(name[0] == '.')) {
+		if (len < 2 || (len == 2 && name[1] == '.'))
+			return ERR_PTR(-EACCES);
+	}
+
+	while (len--) {
+		c = *(const unsigned char *)name++;
+		if (c == '/' || c == '\0')
+			return ERR_PTR(-EACCES);
+	}
+	/*
+	 * See if the low-level filesystem might want
+	 * to use its own hash..
+	 */
+	if (base->d_flags & DCACHE_OP_HASH) {
+		int err = base->d_op->d_hash(base, &this);
+		if (err < 0)
+			return ERR_PTR(err);
+	}
+
+	err = inode_permission(base->d_inode, MAY_EXEC);
+	if (err)
+		return ERR_PTR(err);
+
+	ret = __d_lookup(base, &this);
+	if (ret)
+		return ret;
+	/*
+	 * __d_lookup() is used to try to get a quick answer and avoid the
+	 * mutex.  A false-negative does no harm.
+	 */
+	ret = __d_lookup(base, &this);
+	if (ret && ret->d_flags & DCACHE_OP_REVALIDATE) {
+		dput(ret);
+		ret = NULL;
+	}
+	if (ret)
+		return ret;
+
+	mutex_lock(&base->d_inode->i_mutex);
+	ret =  __lookup_hash(&this, base, 0);
+	mutex_unlock(&base->d_inode->i_mutex);
+	return ret;
+}
+EXPORT_SYMBOL(lookup_one_len_unlocked);
+
 int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
 		 struct path *path, int *empty)
 {
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index f6e7cbabac5a..01dcd494f781 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -823,7 +823,7 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
 		} else
 			dchild = dget(dparent);
 	} else
-		dchild = lookup_one_len(name, dparent, namlen);
+		dchild = lookup_one_len_unlocked(name, dparent, namlen);
 	if (IS_ERR(dchild))
 		return rv;
 	if (d_mountpoint(dchild))
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 51c9e9ca39a4..325521ce389a 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2838,14 +2838,14 @@ nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd,
 	__be32 nfserr;
 	int ignore_crossmnt = 0;
 
-	dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen);
+	dentry = lookup_one_len_unlocked(name, cd->rd_fhp->fh_dentry, namlen);
 	if (IS_ERR(dentry))
 		return nfserrno(PTR_ERR(dentry));
 	if (d_really_is_negative(dentry)) {
 		/*
-		 * nfsd_buffered_readdir drops the i_mutex between
-		 * readdir and calling this callback, leaving a window
-		 * where this directory entry could have gone away.
+		 * we're not holding the i_mutex here, so there's
+		 * a window where this directory entry could have gone
+		 * away.
 		 */
 		dput(dentry);
 		return nfserr_noent;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 45c04979e7b3..2ea6c6a37364 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -217,10 +217,16 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		host_err = PTR_ERR(dentry);
 		if (IS_ERR(dentry))
 			goto out_nfserr;
-		/*
-		 * check if we have crossed a mount point ...
-		 */
 		if (nfsd_mountpoint(dentry, exp)) {
+			/*
+			 * We don't need the i_mutex after all.  It's
+			 * still possible we could open this (regular
+			 * files can be mountpoints too), but the
+			 * i_mutex is just there to prevent renames of
+			 * something that we might be about to delegate,
+			 * and a mountpoint won't be renamed:
+			 */
+			fh_unlock(fhp);
 			if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) {
 				dput(dentry);
 				goto out_nfserr;
@@ -1809,7 +1815,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
 	offset = *offsetp;
 
 	while (1) {
-		struct inode *dir_inode = file_inode(file);
 		unsigned int reclen;
 
 		cdp->err = nfserr_eof; /* will be cleared on successful read */
@@ -1828,15 +1833,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
 		if (!size)
 			break;
 
-		/*
-		 * Various filldir functions may end up calling back into
-		 * lookup_one_len() and the file system's ->lookup() method.
-		 * These expect i_mutex to be held, as it would within readdir.
-		 */
-		host_err = mutex_lock_killable(&dir_inode->i_mutex);
-		if (host_err)
-			break;
-
 		de = (struct buffered_dirent *)buf.dirent;
 		while (size > 0) {
 			offset = de->offset;
@@ -1853,7 +1849,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
 			size -= reclen;
 			de = (struct buffered_dirent *)((char *)de + reclen);
 		}
-		mutex_unlock(&dir_inode->i_mutex);
 		if (size > 0) /* We bailed out early */
 			break;
 
diff --git a/include/linux/namei.h b/include/linux/namei.h
index d8c6334cd150..d0f25d81b46a 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -77,6 +77,7 @@ extern struct dentry *kern_path_locked(const char *, struct path *);
 extern int kern_path_mountpoint(int, const char *, struct path *, unsigned int);
 
 extern struct dentry *lookup_one_len(const char *, struct dentry *, int);
+extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int);
 
 extern int follow_down_one(struct path *);
 extern int follow_down(struct path *);

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH] nfsd: don't hold i_mutex over userspace upcalls
  2015-07-05 11:27                           ` Kinglong Mee
  (?)
@ 2015-07-06 18:22                           ` J. Bruce Fields
  -1 siblings, 0 replies; 10+ messages in thread
From: J. Bruce Fields @ 2015-07-06 18:22 UTC (permalink / raw)
  To: Kinglong Mee; +Cc: NeilBrown, Al Viro, linux-nfs, linux-fsdevel

On Sun, Jul 05, 2015 at 07:27:25PM +0800, Kinglong Mee wrote:
> Ping...
> 
> What's the state of this patch ?

I think I still need an ACK from Al for the addition of
lookup_one_len_unlocked.

--b.

> On 6/3/2015 23:18, J. Bruce Fields wrote:
> > This passes my review, but it needs an ACK from Al or someone for the
> > addition of the new lookup_one_len_unlocked (which is the same as
> > lookup_one_len except that it takes the i_mutex itself when required
> > instead of requiring the caller to).
> > 
> > --b.
> > 
> > On Fri, May 08, 2015 at 04:01:33PM -0400, J. Bruce Fields wrote:
> >> From: NeilBrown <neilb@suse.de>
> >>
> >> We need information about exports when crossing mountpoints during
> >> lookup or NFSv4 readdir.  If we don't already have that information
> >> cached, we may have to ask (and wait for) rpc.mountd.
> >>
> >> In both cases we currently hold the i_mutex on the parent of the
> >> directory we're asking rpc.mountd about.  We've seen situations where
> >> rpc.mountd performs some operation on that directory that tries to take
> >> the i_mutex again, resulting in deadlock.
> >>
> >> With some care, we may be able to avoid that in rpc.mountd.  But it
> >> seems better just to avoid holding a mutex while waiting on userspace.
> >>
> >> It appears that lookup_one_len is pretty much the only operation that
> >> needs the i_mutex.  So we could just drop the i_mutex elsewhere and do
> >> something like
> >>
> >> 	mutex_lock()
> >> 	lookup_one_len()
> >> 	mutex_unlock()
> >>
> >> In many cases though the lookup would have been cached and not required
> >> the i_mutex, so it's more efficient to create a lookup_one_len() variant
> >> that only takes the i_mutex when necessary.
> >>
> >> Signed-off-by: J. Bruce Fields <bfields@redhat.com>
> >> ---
> >>  fs/namei.c            | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++
> >>  fs/nfsd/nfs3xdr.c     |  2 +-
> >>  fs/nfsd/nfs4xdr.c     |  8 +++---
> >>  fs/nfsd/vfs.c         | 23 +++++++---------
> >>  include/linux/namei.h |  1 +
> >>  5 files changed, 89 insertions(+), 19 deletions(-)
> >>
> >> Here's an updated patch.
> >>
> >> diff --git a/fs/namei.c b/fs/namei.c
> >> index 4a8d998b7274..8b866d79c5b7 100644
> >> --- a/fs/namei.c
> >> +++ b/fs/namei.c
> >> @@ -2139,6 +2139,8 @@ static struct dentry *lookup_hash(struct nameidata *nd)
> >>   *
> >>   * Note that this routine is purely a helper for filesystem usage and should
> >>   * not be called by generic code.
> >> + *
> >> + * The caller must hold base->i_mutex.
> >>   */
> >>  struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
> >>  {
> >> @@ -2182,6 +2184,78 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
> >>  }
> >>  EXPORT_SYMBOL(lookup_one_len);
> >>  
> >> +/**
> >> + * lookup_one_len_unlocked - filesystem helper to lookup single pathname component
> >> + * @name:	pathname component to lookup
> >> + * @base:	base directory to lookup from
> >> + * @len:	maximum length @len should be interpreted to
> >> + *
> >> + * Note that this routine is purely a helper for filesystem usage and should
> >> + * not be called by generic code.
> >> + *
> >> + * Unlike lookup_one_len, it should be called without the parent
> >> + * i_mutex held, and will take the i_mutex itself if necessary.
> >> + */
> >> +struct dentry *lookup_one_len_unlocked(const char *name,
> >> +				       struct dentry *base, int len)
> >> +{
> >> +	struct qstr this;
> >> +	unsigned int c;
> >> +	int err;
> >> +	struct dentry *ret;
> >> +
> >> +	this.name = name;
> >> +	this.len = len;
> >> +	this.hash = full_name_hash(name, len);
> >> +	if (!len)
> >> +		return ERR_PTR(-EACCES);
> >> +
> >> +	if (unlikely(name[0] == '.')) {
> >> +		if (len < 2 || (len == 2 && name[1] == '.'))
> >> +			return ERR_PTR(-EACCES);
> >> +	}
> >> +
> >> +	while (len--) {
> >> +		c = *(const unsigned char *)name++;
> >> +		if (c == '/' || c == '\0')
> >> +			return ERR_PTR(-EACCES);
> >> +	}
> >> +	/*
> >> +	 * See if the low-level filesystem might want
> >> +	 * to use its own hash..
> >> +	 */
> >> +	if (base->d_flags & DCACHE_OP_HASH) {
> >> +		int err = base->d_op->d_hash(base, &this);
> >> +		if (err < 0)
> >> +			return ERR_PTR(err);
> >> +	}
> >> +
> >> +	err = inode_permission(base->d_inode, MAY_EXEC);
> >> +	if (err)
> >> +		return ERR_PTR(err);
> >> +
> >> +	ret = __d_lookup(base, &this);
> >> +	if (ret)
> >> +		return ret;
> >> +	/*
> >> +	 * __d_lookup() is used to try to get a quick answer and avoid the
> >> +	 * mutex.  A false-negative does no harm.
> >> +	 */
> >> +	ret = __d_lookup(base, &this);
> >> +	if (ret && ret->d_flags & DCACHE_OP_REVALIDATE) {
> >> +		dput(ret);
> >> +		ret = NULL;
> >> +	}
> >> +	if (ret)
> >> +		return ret;
> >> +
> >> +	mutex_lock(&base->d_inode->i_mutex);
> >> +	ret =  __lookup_hash(&this, base, 0);
> >> +	mutex_unlock(&base->d_inode->i_mutex);
> >> +	return ret;
> >> +}
> >> +EXPORT_SYMBOL(lookup_one_len_unlocked);
> >> +
> >>  int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
> >>  		 struct path *path, int *empty)
> >>  {
> >> diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
> >> index f6e7cbabac5a..01dcd494f781 100644
> >> --- a/fs/nfsd/nfs3xdr.c
> >> +++ b/fs/nfsd/nfs3xdr.c
> >> @@ -823,7 +823,7 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
> >>  		} else
> >>  			dchild = dget(dparent);
> >>  	} else
> >> -		dchild = lookup_one_len(name, dparent, namlen);
> >> +		dchild = lookup_one_len_unlocked(name, dparent, namlen);
> >>  	if (IS_ERR(dchild))
> >>  		return rv;
> >>  	if (d_mountpoint(dchild))
> >> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> >> index 158badf945df..2c1adaa0bd2f 100644
> >> --- a/fs/nfsd/nfs4xdr.c
> >> +++ b/fs/nfsd/nfs4xdr.c
> >> @@ -2804,14 +2804,14 @@ nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd,
> >>  	__be32 nfserr;
> >>  	int ignore_crossmnt = 0;
> >>  
> >> -	dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen);
> >> +	dentry = lookup_one_len_unlocked(name, cd->rd_fhp->fh_dentry, namlen);
> >>  	if (IS_ERR(dentry))
> >>  		return nfserrno(PTR_ERR(dentry));
> >>  	if (d_really_is_negative(dentry)) {
> >>  		/*
> >> -		 * nfsd_buffered_readdir drops the i_mutex between
> >> -		 * readdir and calling this callback, leaving a window
> >> -		 * where this directory entry could have gone away.
> >> +		 * we're not holding the i_mutex here, so there's
> >> +		 * a window where this directory entry could have gone
> >> +		 * away.
> >>  		 */
> >>  		dput(dentry);
> >>  		return nfserr_noent;
> >> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> >> index a30e79900086..6d5b33458e91 100644
> >> --- a/fs/nfsd/vfs.c
> >> +++ b/fs/nfsd/vfs.c
> >> @@ -217,10 +217,16 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
> >>  		host_err = PTR_ERR(dentry);
> >>  		if (IS_ERR(dentry))
> >>  			goto out_nfserr;
> >> -		/*
> >> -		 * check if we have crossed a mount point ...
> >> -		 */
> >>  		if (nfsd_mountpoint(dentry, exp)) {
> >> +			/*
> >> +			 * We don't need the i_mutex after all.  It's
> >> +			 * still possible we could open this (regular
> >> +			 * files can be mountpoints too), but the
> >> +			 * i_mutex is just there to prevent renames of
> >> +			 * something that we might be about to delegate,
> >> +			 * and a mountpoint won't be renamed:
> >> +			 */
> >> +			fh_unlock(fhp);
> >>  			if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) {
> >>  				dput(dentry);
> >>  				goto out_nfserr;
> >> @@ -1876,7 +1882,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
> >>  	offset = *offsetp;
> >>  
> >>  	while (1) {
> >> -		struct inode *dir_inode = file_inode(file);
> >>  		unsigned int reclen;
> >>  
> >>  		cdp->err = nfserr_eof; /* will be cleared on successful read */
> >> @@ -1895,15 +1900,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
> >>  		if (!size)
> >>  			break;
> >>  
> >> -		/*
> >> -		 * Various filldir functions may end up calling back into
> >> -		 * lookup_one_len() and the file system's ->lookup() method.
> >> -		 * These expect i_mutex to be held, as it would within readdir.
> >> -		 */
> >> -		host_err = mutex_lock_killable(&dir_inode->i_mutex);
> >> -		if (host_err)
> >> -			break;
> >> -
> >>  		de = (struct buffered_dirent *)buf.dirent;
> >>  		while (size > 0) {
> >>  			offset = de->offset;
> >> @@ -1920,7 +1916,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
> >>  			size -= reclen;
> >>  			de = (struct buffered_dirent *)((char *)de + reclen);
> >>  		}
> >> -		mutex_unlock(&dir_inode->i_mutex);
> >>  		if (size > 0) /* We bailed out early */
> >>  			break;
> >>  
> >> diff --git a/include/linux/namei.h b/include/linux/namei.h
> >> index c8990779f0c3..bb3a2f7cca67 100644
> >> --- a/include/linux/namei.h
> >> +++ b/include/linux/namei.h
> >> @@ -62,6 +62,7 @@ extern struct dentry *kern_path_locked(const char *, struct path *);
> >>  extern int kern_path_mountpoint(int, const char *, struct path *, unsigned int);
> >>  
> >>  extern struct dentry *lookup_one_len(const char *, struct dentry *, int);
> >> +extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int);
> >>  
> >>  extern int follow_down_one(struct path *);
> >>  extern int follow_down(struct path *);
> >> -- 
> >> 1.9.3
> >>
> > 

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] nfsd: don't hold i_mutex over userspace upcalls
  2015-06-03 15:18                     ` J. Bruce Fields
@ 2015-07-05 11:27                           ` Kinglong Mee
  0 siblings, 0 replies; 10+ messages in thread
From: Kinglong Mee @ 2015-07-05 11:27 UTC (permalink / raw)
  To: J. Bruce Fields, NeilBrown, Al Viro
  Cc: linux-nfs-u79uwXL29TY76Z2rM5mHXA,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	kinglongmee-Re5JQEeQqe8AvxtiuMwx3w

Ping...

What's the state of this patch ?
Without modify nfs-utils, this one is make sense.

thanks,
Kinglong Mee
On 6/3/2015 23:18, J. Bruce Fields wrote:
> This passes my review, but it needs an ACK from Al or someone for the
> addition of the new lookup_one_len_unlocked (which is the same as
> lookup_one_len except that it takes the i_mutex itself when required
> instead of requiring the caller to).
> 
> --b.
> 
> On Fri, May 08, 2015 at 04:01:33PM -0400, J. Bruce Fields wrote:
>> From: NeilBrown <neilb-l3A5Bk7waGM@public.gmane.org>
>>
>> We need information about exports when crossing mountpoints during
>> lookup or NFSv4 readdir.  If we don't already have that information
>> cached, we may have to ask (and wait for) rpc.mountd.
>>
>> In both cases we currently hold the i_mutex on the parent of the
>> directory we're asking rpc.mountd about.  We've seen situations where
>> rpc.mountd performs some operation on that directory that tries to take
>> the i_mutex again, resulting in deadlock.
>>
>> With some care, we may be able to avoid that in rpc.mountd.  But it
>> seems better just to avoid holding a mutex while waiting on userspace.
>>
>> It appears that lookup_one_len is pretty much the only operation that
>> needs the i_mutex.  So we could just drop the i_mutex elsewhere and do
>> something like
>>
>> 	mutex_lock()
>> 	lookup_one_len()
>> 	mutex_unlock()
>>
>> In many cases though the lookup would have been cached and not required
>> the i_mutex, so it's more efficient to create a lookup_one_len() variant
>> that only takes the i_mutex when necessary.
>>
>> Signed-off-by: J. Bruce Fields <bfields-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
>> ---
>>  fs/namei.c            | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++
>>  fs/nfsd/nfs3xdr.c     |  2 +-
>>  fs/nfsd/nfs4xdr.c     |  8 +++---
>>  fs/nfsd/vfs.c         | 23 +++++++---------
>>  include/linux/namei.h |  1 +
>>  5 files changed, 89 insertions(+), 19 deletions(-)
>>
>> Here's an updated patch.
>>
>> diff --git a/fs/namei.c b/fs/namei.c
>> index 4a8d998b7274..8b866d79c5b7 100644
>> --- a/fs/namei.c
>> +++ b/fs/namei.c
>> @@ -2139,6 +2139,8 @@ static struct dentry *lookup_hash(struct nameidata *nd)
>>   *
>>   * Note that this routine is purely a helper for filesystem usage and should
>>   * not be called by generic code.
>> + *
>> + * The caller must hold base->i_mutex.
>>   */
>>  struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
>>  {
>> @@ -2182,6 +2184,78 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
>>  }
>>  EXPORT_SYMBOL(lookup_one_len);
>>  
>> +/**
>> + * lookup_one_len_unlocked - filesystem helper to lookup single pathname component
>> + * @name:	pathname component to lookup
>> + * @base:	base directory to lookup from
>> + * @len:	maximum length @len should be interpreted to
>> + *
>> + * Note that this routine is purely a helper for filesystem usage and should
>> + * not be called by generic code.
>> + *
>> + * Unlike lookup_one_len, it should be called without the parent
>> + * i_mutex held, and will take the i_mutex itself if necessary.
>> + */
>> +struct dentry *lookup_one_len_unlocked(const char *name,
>> +				       struct dentry *base, int len)
>> +{
>> +	struct qstr this;
>> +	unsigned int c;
>> +	int err;
>> +	struct dentry *ret;
>> +
>> +	this.name = name;
>> +	this.len = len;
>> +	this.hash = full_name_hash(name, len);
>> +	if (!len)
>> +		return ERR_PTR(-EACCES);
>> +
>> +	if (unlikely(name[0] == '.')) {
>> +		if (len < 2 || (len == 2 && name[1] == '.'))
>> +			return ERR_PTR(-EACCES);
>> +	}
>> +
>> +	while (len--) {
>> +		c = *(const unsigned char *)name++;
>> +		if (c == '/' || c == '\0')
>> +			return ERR_PTR(-EACCES);
>> +	}
>> +	/*
>> +	 * See if the low-level filesystem might want
>> +	 * to use its own hash..
>> +	 */
>> +	if (base->d_flags & DCACHE_OP_HASH) {
>> +		int err = base->d_op->d_hash(base, &this);
>> +		if (err < 0)
>> +			return ERR_PTR(err);
>> +	}
>> +
>> +	err = inode_permission(base->d_inode, MAY_EXEC);
>> +	if (err)
>> +		return ERR_PTR(err);
>> +
>> +	ret = __d_lookup(base, &this);
>> +	if (ret)
>> +		return ret;
>> +	/*
>> +	 * __d_lookup() is used to try to get a quick answer and avoid the
>> +	 * mutex.  A false-negative does no harm.
>> +	 */
>> +	ret = __d_lookup(base, &this);
>> +	if (ret && ret->d_flags & DCACHE_OP_REVALIDATE) {
>> +		dput(ret);
>> +		ret = NULL;
>> +	}
>> +	if (ret)
>> +		return ret;
>> +
>> +	mutex_lock(&base->d_inode->i_mutex);
>> +	ret =  __lookup_hash(&this, base, 0);
>> +	mutex_unlock(&base->d_inode->i_mutex);
>> +	return ret;
>> +}
>> +EXPORT_SYMBOL(lookup_one_len_unlocked);
>> +
>>  int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
>>  		 struct path *path, int *empty)
>>  {
>> diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
>> index f6e7cbabac5a..01dcd494f781 100644
>> --- a/fs/nfsd/nfs3xdr.c
>> +++ b/fs/nfsd/nfs3xdr.c
>> @@ -823,7 +823,7 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
>>  		} else
>>  			dchild = dget(dparent);
>>  	} else
>> -		dchild = lookup_one_len(name, dparent, namlen);
>> +		dchild = lookup_one_len_unlocked(name, dparent, namlen);
>>  	if (IS_ERR(dchild))
>>  		return rv;
>>  	if (d_mountpoint(dchild))
>> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
>> index 158badf945df..2c1adaa0bd2f 100644
>> --- a/fs/nfsd/nfs4xdr.c
>> +++ b/fs/nfsd/nfs4xdr.c
>> @@ -2804,14 +2804,14 @@ nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd,
>>  	__be32 nfserr;
>>  	int ignore_crossmnt = 0;
>>  
>> -	dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen);
>> +	dentry = lookup_one_len_unlocked(name, cd->rd_fhp->fh_dentry, namlen);
>>  	if (IS_ERR(dentry))
>>  		return nfserrno(PTR_ERR(dentry));
>>  	if (d_really_is_negative(dentry)) {
>>  		/*
>> -		 * nfsd_buffered_readdir drops the i_mutex between
>> -		 * readdir and calling this callback, leaving a window
>> -		 * where this directory entry could have gone away.
>> +		 * we're not holding the i_mutex here, so there's
>> +		 * a window where this directory entry could have gone
>> +		 * away.
>>  		 */
>>  		dput(dentry);
>>  		return nfserr_noent;
>> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
>> index a30e79900086..6d5b33458e91 100644
>> --- a/fs/nfsd/vfs.c
>> +++ b/fs/nfsd/vfs.c
>> @@ -217,10 +217,16 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
>>  		host_err = PTR_ERR(dentry);
>>  		if (IS_ERR(dentry))
>>  			goto out_nfserr;
>> -		/*
>> -		 * check if we have crossed a mount point ...
>> -		 */
>>  		if (nfsd_mountpoint(dentry, exp)) {
>> +			/*
>> +			 * We don't need the i_mutex after all.  It's
>> +			 * still possible we could open this (regular
>> +			 * files can be mountpoints too), but the
>> +			 * i_mutex is just there to prevent renames of
>> +			 * something that we might be about to delegate,
>> +			 * and a mountpoint won't be renamed:
>> +			 */
>> +			fh_unlock(fhp);
>>  			if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) {
>>  				dput(dentry);
>>  				goto out_nfserr;
>> @@ -1876,7 +1882,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
>>  	offset = *offsetp;
>>  
>>  	while (1) {
>> -		struct inode *dir_inode = file_inode(file);
>>  		unsigned int reclen;
>>  
>>  		cdp->err = nfserr_eof; /* will be cleared on successful read */
>> @@ -1895,15 +1900,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
>>  		if (!size)
>>  			break;
>>  
>> -		/*
>> -		 * Various filldir functions may end up calling back into
>> -		 * lookup_one_len() and the file system's ->lookup() method.
>> -		 * These expect i_mutex to be held, as it would within readdir.
>> -		 */
>> -		host_err = mutex_lock_killable(&dir_inode->i_mutex);
>> -		if (host_err)
>> -			break;
>> -
>>  		de = (struct buffered_dirent *)buf.dirent;
>>  		while (size > 0) {
>>  			offset = de->offset;
>> @@ -1920,7 +1916,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
>>  			size -= reclen;
>>  			de = (struct buffered_dirent *)((char *)de + reclen);
>>  		}
>> -		mutex_unlock(&dir_inode->i_mutex);
>>  		if (size > 0) /* We bailed out early */
>>  			break;
>>  
>> diff --git a/include/linux/namei.h b/include/linux/namei.h
>> index c8990779f0c3..bb3a2f7cca67 100644
>> --- a/include/linux/namei.h
>> +++ b/include/linux/namei.h
>> @@ -62,6 +62,7 @@ extern struct dentry *kern_path_locked(const char *, struct path *);
>>  extern int kern_path_mountpoint(int, const char *, struct path *, unsigned int);
>>  
>>  extern struct dentry *lookup_one_len(const char *, struct dentry *, int);
>> +extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int);
>>  
>>  extern int follow_down_one(struct path *);
>>  extern int follow_down(struct path *);
>> -- 
>> 1.9.3
>>
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] nfsd: don't hold i_mutex over userspace upcalls
@ 2015-07-05 11:27                           ` Kinglong Mee
  0 siblings, 0 replies; 10+ messages in thread
From: Kinglong Mee @ 2015-07-05 11:27 UTC (permalink / raw)
  To: J. Bruce Fields, NeilBrown, Al Viro; +Cc: linux-nfs, linux-fsdevel, kinglongmee

Ping...

What's the state of this patch ?
Without modify nfs-utils, this one is make sense.

thanks,
Kinglong Mee
On 6/3/2015 23:18, J. Bruce Fields wrote:
> This passes my review, but it needs an ACK from Al or someone for the
> addition of the new lookup_one_len_unlocked (which is the same as
> lookup_one_len except that it takes the i_mutex itself when required
> instead of requiring the caller to).
> 
> --b.
> 
> On Fri, May 08, 2015 at 04:01:33PM -0400, J. Bruce Fields wrote:
>> From: NeilBrown <neilb@suse.de>
>>
>> We need information about exports when crossing mountpoints during
>> lookup or NFSv4 readdir.  If we don't already have that information
>> cached, we may have to ask (and wait for) rpc.mountd.
>>
>> In both cases we currently hold the i_mutex on the parent of the
>> directory we're asking rpc.mountd about.  We've seen situations where
>> rpc.mountd performs some operation on that directory that tries to take
>> the i_mutex again, resulting in deadlock.
>>
>> With some care, we may be able to avoid that in rpc.mountd.  But it
>> seems better just to avoid holding a mutex while waiting on userspace.
>>
>> It appears that lookup_one_len is pretty much the only operation that
>> needs the i_mutex.  So we could just drop the i_mutex elsewhere and do
>> something like
>>
>> 	mutex_lock()
>> 	lookup_one_len()
>> 	mutex_unlock()
>>
>> In many cases though the lookup would have been cached and not required
>> the i_mutex, so it's more efficient to create a lookup_one_len() variant
>> that only takes the i_mutex when necessary.
>>
>> Signed-off-by: J. Bruce Fields <bfields@redhat.com>
>> ---
>>  fs/namei.c            | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++
>>  fs/nfsd/nfs3xdr.c     |  2 +-
>>  fs/nfsd/nfs4xdr.c     |  8 +++---
>>  fs/nfsd/vfs.c         | 23 +++++++---------
>>  include/linux/namei.h |  1 +
>>  5 files changed, 89 insertions(+), 19 deletions(-)
>>
>> Here's an updated patch.
>>
>> diff --git a/fs/namei.c b/fs/namei.c
>> index 4a8d998b7274..8b866d79c5b7 100644
>> --- a/fs/namei.c
>> +++ b/fs/namei.c
>> @@ -2139,6 +2139,8 @@ static struct dentry *lookup_hash(struct nameidata *nd)
>>   *
>>   * Note that this routine is purely a helper for filesystem usage and should
>>   * not be called by generic code.
>> + *
>> + * The caller must hold base->i_mutex.
>>   */
>>  struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
>>  {
>> @@ -2182,6 +2184,78 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
>>  }
>>  EXPORT_SYMBOL(lookup_one_len);
>>  
>> +/**
>> + * lookup_one_len_unlocked - filesystem helper to lookup single pathname component
>> + * @name:	pathname component to lookup
>> + * @base:	base directory to lookup from
>> + * @len:	maximum length @len should be interpreted to
>> + *
>> + * Note that this routine is purely a helper for filesystem usage and should
>> + * not be called by generic code.
>> + *
>> + * Unlike lookup_one_len, it should be called without the parent
>> + * i_mutex held, and will take the i_mutex itself if necessary.
>> + */
>> +struct dentry *lookup_one_len_unlocked(const char *name,
>> +				       struct dentry *base, int len)
>> +{
>> +	struct qstr this;
>> +	unsigned int c;
>> +	int err;
>> +	struct dentry *ret;
>> +
>> +	this.name = name;
>> +	this.len = len;
>> +	this.hash = full_name_hash(name, len);
>> +	if (!len)
>> +		return ERR_PTR(-EACCES);
>> +
>> +	if (unlikely(name[0] == '.')) {
>> +		if (len < 2 || (len == 2 && name[1] == '.'))
>> +			return ERR_PTR(-EACCES);
>> +	}
>> +
>> +	while (len--) {
>> +		c = *(const unsigned char *)name++;
>> +		if (c == '/' || c == '\0')
>> +			return ERR_PTR(-EACCES);
>> +	}
>> +	/*
>> +	 * See if the low-level filesystem might want
>> +	 * to use its own hash..
>> +	 */
>> +	if (base->d_flags & DCACHE_OP_HASH) {
>> +		int err = base->d_op->d_hash(base, &this);
>> +		if (err < 0)
>> +			return ERR_PTR(err);
>> +	}
>> +
>> +	err = inode_permission(base->d_inode, MAY_EXEC);
>> +	if (err)
>> +		return ERR_PTR(err);
>> +
>> +	ret = __d_lookup(base, &this);
>> +	if (ret)
>> +		return ret;
>> +	/*
>> +	 * __d_lookup() is used to try to get a quick answer and avoid the
>> +	 * mutex.  A false-negative does no harm.
>> +	 */
>> +	ret = __d_lookup(base, &this);
>> +	if (ret && ret->d_flags & DCACHE_OP_REVALIDATE) {
>> +		dput(ret);
>> +		ret = NULL;
>> +	}
>> +	if (ret)
>> +		return ret;
>> +
>> +	mutex_lock(&base->d_inode->i_mutex);
>> +	ret =  __lookup_hash(&this, base, 0);
>> +	mutex_unlock(&base->d_inode->i_mutex);
>> +	return ret;
>> +}
>> +EXPORT_SYMBOL(lookup_one_len_unlocked);
>> +
>>  int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
>>  		 struct path *path, int *empty)
>>  {
>> diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
>> index f6e7cbabac5a..01dcd494f781 100644
>> --- a/fs/nfsd/nfs3xdr.c
>> +++ b/fs/nfsd/nfs3xdr.c
>> @@ -823,7 +823,7 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
>>  		} else
>>  			dchild = dget(dparent);
>>  	} else
>> -		dchild = lookup_one_len(name, dparent, namlen);
>> +		dchild = lookup_one_len_unlocked(name, dparent, namlen);
>>  	if (IS_ERR(dchild))
>>  		return rv;
>>  	if (d_mountpoint(dchild))
>> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
>> index 158badf945df..2c1adaa0bd2f 100644
>> --- a/fs/nfsd/nfs4xdr.c
>> +++ b/fs/nfsd/nfs4xdr.c
>> @@ -2804,14 +2804,14 @@ nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd,
>>  	__be32 nfserr;
>>  	int ignore_crossmnt = 0;
>>  
>> -	dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen);
>> +	dentry = lookup_one_len_unlocked(name, cd->rd_fhp->fh_dentry, namlen);
>>  	if (IS_ERR(dentry))
>>  		return nfserrno(PTR_ERR(dentry));
>>  	if (d_really_is_negative(dentry)) {
>>  		/*
>> -		 * nfsd_buffered_readdir drops the i_mutex between
>> -		 * readdir and calling this callback, leaving a window
>> -		 * where this directory entry could have gone away.
>> +		 * we're not holding the i_mutex here, so there's
>> +		 * a window where this directory entry could have gone
>> +		 * away.
>>  		 */
>>  		dput(dentry);
>>  		return nfserr_noent;
>> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
>> index a30e79900086..6d5b33458e91 100644
>> --- a/fs/nfsd/vfs.c
>> +++ b/fs/nfsd/vfs.c
>> @@ -217,10 +217,16 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
>>  		host_err = PTR_ERR(dentry);
>>  		if (IS_ERR(dentry))
>>  			goto out_nfserr;
>> -		/*
>> -		 * check if we have crossed a mount point ...
>> -		 */
>>  		if (nfsd_mountpoint(dentry, exp)) {
>> +			/*
>> +			 * We don't need the i_mutex after all.  It's
>> +			 * still possible we could open this (regular
>> +			 * files can be mountpoints too), but the
>> +			 * i_mutex is just there to prevent renames of
>> +			 * something that we might be about to delegate,
>> +			 * and a mountpoint won't be renamed:
>> +			 */
>> +			fh_unlock(fhp);
>>  			if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) {
>>  				dput(dentry);
>>  				goto out_nfserr;
>> @@ -1876,7 +1882,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
>>  	offset = *offsetp;
>>  
>>  	while (1) {
>> -		struct inode *dir_inode = file_inode(file);
>>  		unsigned int reclen;
>>  
>>  		cdp->err = nfserr_eof; /* will be cleared on successful read */
>> @@ -1895,15 +1900,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
>>  		if (!size)
>>  			break;
>>  
>> -		/*
>> -		 * Various filldir functions may end up calling back into
>> -		 * lookup_one_len() and the file system's ->lookup() method.
>> -		 * These expect i_mutex to be held, as it would within readdir.
>> -		 */
>> -		host_err = mutex_lock_killable(&dir_inode->i_mutex);
>> -		if (host_err)
>> -			break;
>> -
>>  		de = (struct buffered_dirent *)buf.dirent;
>>  		while (size > 0) {
>>  			offset = de->offset;
>> @@ -1920,7 +1916,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
>>  			size -= reclen;
>>  			de = (struct buffered_dirent *)((char *)de + reclen);
>>  		}
>> -		mutex_unlock(&dir_inode->i_mutex);
>>  		if (size > 0) /* We bailed out early */
>>  			break;
>>  
>> diff --git a/include/linux/namei.h b/include/linux/namei.h
>> index c8990779f0c3..bb3a2f7cca67 100644
>> --- a/include/linux/namei.h
>> +++ b/include/linux/namei.h
>> @@ -62,6 +62,7 @@ extern struct dentry *kern_path_locked(const char *, struct path *);
>>  extern int kern_path_mountpoint(int, const char *, struct path *, unsigned int);
>>  
>>  extern struct dentry *lookup_one_len(const char *, struct dentry *, int);
>> +extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int);
>>  
>>  extern int follow_down_one(struct path *);
>>  extern int follow_down(struct path *);
>> -- 
>> 1.9.3
>>
> 

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] nfsd: don't hold i_mutex over userspace upcalls
  2015-05-08 20:01                   ` [PATCH] nfsd: don't hold i_mutex over userspace upcalls J. Bruce Fields
@ 2015-06-03 15:18                     ` J. Bruce Fields
       [not found]                       ` <20150603151819.GA8441-uC3wQj2KruNg9hUCZPvPmw@public.gmane.org>
  0 siblings, 1 reply; 10+ messages in thread
From: J. Bruce Fields @ 2015-06-03 15:18 UTC (permalink / raw)
  To: NeilBrown; +Cc: Kinglong Mee, linux-nfs, Al Viro, linux-fsdevel

This passes my review, but it needs an ACK from Al or someone for the
addition of the new lookup_one_len_unlocked (which is the same as
lookup_one_len except that it takes the i_mutex itself when required
instead of requiring the caller to).

--b.

On Fri, May 08, 2015 at 04:01:33PM -0400, J. Bruce Fields wrote:
> From: NeilBrown <neilb@suse.de>
> 
> We need information about exports when crossing mountpoints during
> lookup or NFSv4 readdir.  If we don't already have that information
> cached, we may have to ask (and wait for) rpc.mountd.
> 
> In both cases we currently hold the i_mutex on the parent of the
> directory we're asking rpc.mountd about.  We've seen situations where
> rpc.mountd performs some operation on that directory that tries to take
> the i_mutex again, resulting in deadlock.
> 
> With some care, we may be able to avoid that in rpc.mountd.  But it
> seems better just to avoid holding a mutex while waiting on userspace.
> 
> It appears that lookup_one_len is pretty much the only operation that
> needs the i_mutex.  So we could just drop the i_mutex elsewhere and do
> something like
> 
> 	mutex_lock()
> 	lookup_one_len()
> 	mutex_unlock()
> 
> In many cases though the lookup would have been cached and not required
> the i_mutex, so it's more efficient to create a lookup_one_len() variant
> that only takes the i_mutex when necessary.
> 
> Signed-off-by: J. Bruce Fields <bfields@redhat.com>
> ---
>  fs/namei.c            | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  fs/nfsd/nfs3xdr.c     |  2 +-
>  fs/nfsd/nfs4xdr.c     |  8 +++---
>  fs/nfsd/vfs.c         | 23 +++++++---------
>  include/linux/namei.h |  1 +
>  5 files changed, 89 insertions(+), 19 deletions(-)
> 
> Here's an updated patch.
> 
> diff --git a/fs/namei.c b/fs/namei.c
> index 4a8d998b7274..8b866d79c5b7 100644
> --- a/fs/namei.c
> +++ b/fs/namei.c
> @@ -2139,6 +2139,8 @@ static struct dentry *lookup_hash(struct nameidata *nd)
>   *
>   * Note that this routine is purely a helper for filesystem usage and should
>   * not be called by generic code.
> + *
> + * The caller must hold base->i_mutex.
>   */
>  struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
>  {
> @@ -2182,6 +2184,78 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
>  }
>  EXPORT_SYMBOL(lookup_one_len);
>  
> +/**
> + * lookup_one_len_unlocked - filesystem helper to lookup single pathname component
> + * @name:	pathname component to lookup
> + * @base:	base directory to lookup from
> + * @len:	maximum length @len should be interpreted to
> + *
> + * Note that this routine is purely a helper for filesystem usage and should
> + * not be called by generic code.
> + *
> + * Unlike lookup_one_len, it should be called without the parent
> + * i_mutex held, and will take the i_mutex itself if necessary.
> + */
> +struct dentry *lookup_one_len_unlocked(const char *name,
> +				       struct dentry *base, int len)
> +{
> +	struct qstr this;
> +	unsigned int c;
> +	int err;
> +	struct dentry *ret;
> +
> +	this.name = name;
> +	this.len = len;
> +	this.hash = full_name_hash(name, len);
> +	if (!len)
> +		return ERR_PTR(-EACCES);
> +
> +	if (unlikely(name[0] == '.')) {
> +		if (len < 2 || (len == 2 && name[1] == '.'))
> +			return ERR_PTR(-EACCES);
> +	}
> +
> +	while (len--) {
> +		c = *(const unsigned char *)name++;
> +		if (c == '/' || c == '\0')
> +			return ERR_PTR(-EACCES);
> +	}
> +	/*
> +	 * See if the low-level filesystem might want
> +	 * to use its own hash..
> +	 */
> +	if (base->d_flags & DCACHE_OP_HASH) {
> +		int err = base->d_op->d_hash(base, &this);
> +		if (err < 0)
> +			return ERR_PTR(err);
> +	}
> +
> +	err = inode_permission(base->d_inode, MAY_EXEC);
> +	if (err)
> +		return ERR_PTR(err);
> +
> +	ret = __d_lookup(base, &this);
> +	if (ret)
> +		return ret;
> +	/*
> +	 * __d_lookup() is used to try to get a quick answer and avoid the
> +	 * mutex.  A false-negative does no harm.
> +	 */
> +	ret = __d_lookup(base, &this);
> +	if (ret && ret->d_flags & DCACHE_OP_REVALIDATE) {
> +		dput(ret);
> +		ret = NULL;
> +	}
> +	if (ret)
> +		return ret;
> +
> +	mutex_lock(&base->d_inode->i_mutex);
> +	ret =  __lookup_hash(&this, base, 0);
> +	mutex_unlock(&base->d_inode->i_mutex);
> +	return ret;
> +}
> +EXPORT_SYMBOL(lookup_one_len_unlocked);
> +
>  int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
>  		 struct path *path, int *empty)
>  {
> diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
> index f6e7cbabac5a..01dcd494f781 100644
> --- a/fs/nfsd/nfs3xdr.c
> +++ b/fs/nfsd/nfs3xdr.c
> @@ -823,7 +823,7 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
>  		} else
>  			dchild = dget(dparent);
>  	} else
> -		dchild = lookup_one_len(name, dparent, namlen);
> +		dchild = lookup_one_len_unlocked(name, dparent, namlen);
>  	if (IS_ERR(dchild))
>  		return rv;
>  	if (d_mountpoint(dchild))
> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> index 158badf945df..2c1adaa0bd2f 100644
> --- a/fs/nfsd/nfs4xdr.c
> +++ b/fs/nfsd/nfs4xdr.c
> @@ -2804,14 +2804,14 @@ nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd,
>  	__be32 nfserr;
>  	int ignore_crossmnt = 0;
>  
> -	dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen);
> +	dentry = lookup_one_len_unlocked(name, cd->rd_fhp->fh_dentry, namlen);
>  	if (IS_ERR(dentry))
>  		return nfserrno(PTR_ERR(dentry));
>  	if (d_really_is_negative(dentry)) {
>  		/*
> -		 * nfsd_buffered_readdir drops the i_mutex between
> -		 * readdir and calling this callback, leaving a window
> -		 * where this directory entry could have gone away.
> +		 * we're not holding the i_mutex here, so there's
> +		 * a window where this directory entry could have gone
> +		 * away.
>  		 */
>  		dput(dentry);
>  		return nfserr_noent;
> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> index a30e79900086..6d5b33458e91 100644
> --- a/fs/nfsd/vfs.c
> +++ b/fs/nfsd/vfs.c
> @@ -217,10 +217,16 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
>  		host_err = PTR_ERR(dentry);
>  		if (IS_ERR(dentry))
>  			goto out_nfserr;
> -		/*
> -		 * check if we have crossed a mount point ...
> -		 */
>  		if (nfsd_mountpoint(dentry, exp)) {
> +			/*
> +			 * We don't need the i_mutex after all.  It's
> +			 * still possible we could open this (regular
> +			 * files can be mountpoints too), but the
> +			 * i_mutex is just there to prevent renames of
> +			 * something that we might be about to delegate,
> +			 * and a mountpoint won't be renamed:
> +			 */
> +			fh_unlock(fhp);
>  			if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) {
>  				dput(dentry);
>  				goto out_nfserr;
> @@ -1876,7 +1882,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
>  	offset = *offsetp;
>  
>  	while (1) {
> -		struct inode *dir_inode = file_inode(file);
>  		unsigned int reclen;
>  
>  		cdp->err = nfserr_eof; /* will be cleared on successful read */
> @@ -1895,15 +1900,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
>  		if (!size)
>  			break;
>  
> -		/*
> -		 * Various filldir functions may end up calling back into
> -		 * lookup_one_len() and the file system's ->lookup() method.
> -		 * These expect i_mutex to be held, as it would within readdir.
> -		 */
> -		host_err = mutex_lock_killable(&dir_inode->i_mutex);
> -		if (host_err)
> -			break;
> -
>  		de = (struct buffered_dirent *)buf.dirent;
>  		while (size > 0) {
>  			offset = de->offset;
> @@ -1920,7 +1916,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
>  			size -= reclen;
>  			de = (struct buffered_dirent *)((char *)de + reclen);
>  		}
> -		mutex_unlock(&dir_inode->i_mutex);
>  		if (size > 0) /* We bailed out early */
>  			break;
>  
> diff --git a/include/linux/namei.h b/include/linux/namei.h
> index c8990779f0c3..bb3a2f7cca67 100644
> --- a/include/linux/namei.h
> +++ b/include/linux/namei.h
> @@ -62,6 +62,7 @@ extern struct dentry *kern_path_locked(const char *, struct path *);
>  extern int kern_path_mountpoint(int, const char *, struct path *, unsigned int);
>  
>  extern struct dentry *lookup_one_len(const char *, struct dentry *, int);
> +extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int);
>  
>  extern int follow_down_one(struct path *);
>  extern int follow_down(struct path *);
> -- 
> 1.9.3
> 

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH] nfsd: don't hold i_mutex over userspace upcalls
  2015-05-08 16:15                 ` J. Bruce Fields
@ 2015-05-08 20:01                   ` J. Bruce Fields
  2015-06-03 15:18                     ` J. Bruce Fields
  0 siblings, 1 reply; 10+ messages in thread
From: J. Bruce Fields @ 2015-05-08 20:01 UTC (permalink / raw)
  To: NeilBrown; +Cc: Kinglong Mee, linux-nfs

From: NeilBrown <neilb@suse.de>

We need information about exports when crossing mountpoints during
lookup or NFSv4 readdir.  If we don't already have that information
cached, we may have to ask (and wait for) rpc.mountd.

In both cases we currently hold the i_mutex on the parent of the
directory we're asking rpc.mountd about.  We've seen situations where
rpc.mountd performs some operation on that directory that tries to take
the i_mutex again, resulting in deadlock.

With some care, we may be able to avoid that in rpc.mountd.  But it
seems better just to avoid holding a mutex while waiting on userspace.

It appears that lookup_one_len is pretty much the only operation that
needs the i_mutex.  So we could just drop the i_mutex elsewhere and do
something like

	mutex_lock()
	lookup_one_len()
	mutex_unlock()

In many cases though the lookup would have been cached and not required
the i_mutex, so it's more efficient to create a lookup_one_len() variant
that only takes the i_mutex when necessary.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/namei.c            | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/nfsd/nfs3xdr.c     |  2 +-
 fs/nfsd/nfs4xdr.c     |  8 +++---
 fs/nfsd/vfs.c         | 23 +++++++---------
 include/linux/namei.h |  1 +
 5 files changed, 89 insertions(+), 19 deletions(-)

Here's an updated patch.

diff --git a/fs/namei.c b/fs/namei.c
index 4a8d998b7274..8b866d79c5b7 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2139,6 +2139,8 @@ static struct dentry *lookup_hash(struct nameidata *nd)
  *
  * Note that this routine is purely a helper for filesystem usage and should
  * not be called by generic code.
+ *
+ * The caller must hold base->i_mutex.
  */
 struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 {
@@ -2182,6 +2184,78 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 }
 EXPORT_SYMBOL(lookup_one_len);
 
+/**
+ * lookup_one_len_unlocked - filesystem helper to lookup single pathname component
+ * @name:	pathname component to lookup
+ * @base:	base directory to lookup from
+ * @len:	maximum length @len should be interpreted to
+ *
+ * Note that this routine is purely a helper for filesystem usage and should
+ * not be called by generic code.
+ *
+ * Unlike lookup_one_len, it should be called without the parent
+ * i_mutex held, and will take the i_mutex itself if necessary.
+ */
+struct dentry *lookup_one_len_unlocked(const char *name,
+				       struct dentry *base, int len)
+{
+	struct qstr this;
+	unsigned int c;
+	int err;
+	struct dentry *ret;
+
+	this.name = name;
+	this.len = len;
+	this.hash = full_name_hash(name, len);
+	if (!len)
+		return ERR_PTR(-EACCES);
+
+	if (unlikely(name[0] == '.')) {
+		if (len < 2 || (len == 2 && name[1] == '.'))
+			return ERR_PTR(-EACCES);
+	}
+
+	while (len--) {
+		c = *(const unsigned char *)name++;
+		if (c == '/' || c == '\0')
+			return ERR_PTR(-EACCES);
+	}
+	/*
+	 * See if the low-level filesystem might want
+	 * to use its own hash..
+	 */
+	if (base->d_flags & DCACHE_OP_HASH) {
+		int err = base->d_op->d_hash(base, &this);
+		if (err < 0)
+			return ERR_PTR(err);
+	}
+
+	err = inode_permission(base->d_inode, MAY_EXEC);
+	if (err)
+		return ERR_PTR(err);
+
+	ret = __d_lookup(base, &this);
+	if (ret)
+		return ret;
+	/*
+	 * __d_lookup() is used to try to get a quick answer and avoid the
+	 * mutex.  A false-negative does no harm.
+	 */
+	ret = __d_lookup(base, &this);
+	if (ret && ret->d_flags & DCACHE_OP_REVALIDATE) {
+		dput(ret);
+		ret = NULL;
+	}
+	if (ret)
+		return ret;
+
+	mutex_lock(&base->d_inode->i_mutex);
+	ret =  __lookup_hash(&this, base, 0);
+	mutex_unlock(&base->d_inode->i_mutex);
+	return ret;
+}
+EXPORT_SYMBOL(lookup_one_len_unlocked);
+
 int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
 		 struct path *path, int *empty)
 {
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index f6e7cbabac5a..01dcd494f781 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -823,7 +823,7 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
 		} else
 			dchild = dget(dparent);
 	} else
-		dchild = lookup_one_len(name, dparent, namlen);
+		dchild = lookup_one_len_unlocked(name, dparent, namlen);
 	if (IS_ERR(dchild))
 		return rv;
 	if (d_mountpoint(dchild))
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 158badf945df..2c1adaa0bd2f 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2804,14 +2804,14 @@ nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd,
 	__be32 nfserr;
 	int ignore_crossmnt = 0;
 
-	dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen);
+	dentry = lookup_one_len_unlocked(name, cd->rd_fhp->fh_dentry, namlen);
 	if (IS_ERR(dentry))
 		return nfserrno(PTR_ERR(dentry));
 	if (d_really_is_negative(dentry)) {
 		/*
-		 * nfsd_buffered_readdir drops the i_mutex between
-		 * readdir and calling this callback, leaving a window
-		 * where this directory entry could have gone away.
+		 * we're not holding the i_mutex here, so there's
+		 * a window where this directory entry could have gone
+		 * away.
 		 */
 		dput(dentry);
 		return nfserr_noent;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index a30e79900086..6d5b33458e91 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -217,10 +217,16 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		host_err = PTR_ERR(dentry);
 		if (IS_ERR(dentry))
 			goto out_nfserr;
-		/*
-		 * check if we have crossed a mount point ...
-		 */
 		if (nfsd_mountpoint(dentry, exp)) {
+			/*
+			 * We don't need the i_mutex after all.  It's
+			 * still possible we could open this (regular
+			 * files can be mountpoints too), but the
+			 * i_mutex is just there to prevent renames of
+			 * something that we might be about to delegate,
+			 * and a mountpoint won't be renamed:
+			 */
+			fh_unlock(fhp);
 			if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) {
 				dput(dentry);
 				goto out_nfserr;
@@ -1876,7 +1882,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
 	offset = *offsetp;
 
 	while (1) {
-		struct inode *dir_inode = file_inode(file);
 		unsigned int reclen;
 
 		cdp->err = nfserr_eof; /* will be cleared on successful read */
@@ -1895,15 +1900,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
 		if (!size)
 			break;
 
-		/*
-		 * Various filldir functions may end up calling back into
-		 * lookup_one_len() and the file system's ->lookup() method.
-		 * These expect i_mutex to be held, as it would within readdir.
-		 */
-		host_err = mutex_lock_killable(&dir_inode->i_mutex);
-		if (host_err)
-			break;
-
 		de = (struct buffered_dirent *)buf.dirent;
 		while (size > 0) {
 			offset = de->offset;
@@ -1920,7 +1916,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
 			size -= reclen;
 			de = (struct buffered_dirent *)((char *)de + reclen);
 		}
-		mutex_unlock(&dir_inode->i_mutex);
 		if (size > 0) /* We bailed out early */
 			break;
 
diff --git a/include/linux/namei.h b/include/linux/namei.h
index c8990779f0c3..bb3a2f7cca67 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -62,6 +62,7 @@ extern struct dentry *kern_path_locked(const char *, struct path *);
 extern int kern_path_mountpoint(int, const char *, struct path *, unsigned int);
 
 extern struct dentry *lookup_one_len(const char *, struct dentry *, int);
+extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int);
 
 extern int follow_down_one(struct path *);
 extern int follow_down(struct path *);
-- 
1.9.3


^ permalink raw reply related	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2016-01-07 21:08 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-01-07 21:08 [PATCH] nfsd: don't hold i_mutex over userspace upcalls J. Bruce Fields
  -- strict thread matches above, loose matches on Subject: below --
2015-04-29 19:19 [PATCH RFC] NFSD: fix cannot umounting mount points under pseudo root J. Bruce Fields
2015-04-29 21:52 ` NeilBrown
2015-04-30 21:36   ` J. Bruce Fields
2015-05-01  1:53     ` NeilBrown
2015-05-04 22:01       ` J. Bruce Fields
2015-05-05 13:54         ` Kinglong Mee
2015-05-05 14:18           ` J. Bruce Fields
2015-05-05 15:52             ` J. Bruce Fields
2015-05-05 22:26               ` NeilBrown
2015-05-08 16:15                 ` J. Bruce Fields
2015-05-08 20:01                   ` [PATCH] nfsd: don't hold i_mutex over userspace upcalls J. Bruce Fields
2015-06-03 15:18                     ` J. Bruce Fields
     [not found]                       ` <20150603151819.GA8441-uC3wQj2KruNg9hUCZPvPmw@public.gmane.org>
2015-07-05 11:27                         ` Kinglong Mee
2015-07-05 11:27                           ` Kinglong Mee
2015-07-06 18:22                           ` J. Bruce Fields
2015-08-18 19:10                         ` J. Bruce Fields
2015-08-18 19:10                           ` J. Bruce Fields
     [not found]                           ` <20150818191028.GA3957-uC3wQj2KruNg9hUCZPvPmw@public.gmane.org>
2015-11-12 21:22                             ` J. Bruce Fields
2015-11-12 21:22                               ` J. Bruce Fields

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.