All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/3] ceph: clean up some unsafe dentry->d_parent accesses
@ 2016-12-13 18:04 Jeff Layton
  2016-12-13 18:04 ` [PATCH 1/3] ceph: clean up unsafe d_parent access in __choose_mds Jeff Layton
                   ` (2 more replies)
  0 siblings, 3 replies; 8+ messages in thread
From: Jeff Layton @ 2016-12-13 18:04 UTC (permalink / raw)
  To: ceph-devel; +Cc: zyan, sage, idryomov, linux-fsdevel, dhowells, viro

In several places, the kcephfs client accesses dentry->d_parent without
holding appropriate locking. This is not safe, as you can race with
renames that can morph the tree such that is changes and dentries found
previously can end up freed. This patchset fixes these places to access
the parent(s) safely, mostly using the rcu_read_lock().

Note that I'm not aware of any specific bug reports in this area --
these were just discovered by inspection (mostly by Zheng). This passes
xfstests at least up to test generic/095, where we hit an unrelated
softlockup in the splice write code (http://tracker.ceph.com/issues/18130).

Tracker bug link: http://tracker.ceph.com/issues/18148

Jeff Layton (3):
  ceph: clean up unsafe d_parent access in __choose_mds
  ceph: clean up unsafe d_parent accesses in build_dentry_path
  ceph: fix unsafe dcache access in ceph_encode_dentry_release

 fs/ceph/caps.c       |  7 ++++--
 fs/ceph/mds_client.c | 68 +++++++++++++++++++++++++++++++++++-----------------
 2 files changed, 51 insertions(+), 24 deletions(-)

-- 
2.7.4


^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH 1/3] ceph: clean up unsafe d_parent access in __choose_mds
  2016-12-13 18:04 [PATCH 0/3] ceph: clean up some unsafe dentry->d_parent accesses Jeff Layton
@ 2016-12-13 18:04 ` Jeff Layton
  2016-12-14  8:58   ` Yan, Zheng
  2016-12-14 12:55   ` [PATCH v2] " Jeff Layton
  2016-12-13 18:04 ` [PATCH 2/3] ceph: clean up unsafe d_parent accesses in build_dentry_path Jeff Layton
  2016-12-13 18:04 ` [PATCH 3/3] ceph: fix unsafe dcache access in ceph_encode_dentry_release Jeff Layton
  2 siblings, 2 replies; 8+ messages in thread
From: Jeff Layton @ 2016-12-13 18:04 UTC (permalink / raw)
  To: ceph-devel; +Cc: zyan, sage, idryomov, linux-fsdevel, dhowells, viro

__choose_mds exists to pick an MDS to use when issuing a call. Doing
that typically involves picking an inode and using the authoritative
MDS for it. In most cases, that's pretty straightforward, as we are
using an inode to which we hold a reference (usually represented by
r_dentry or r_inode in the request).

In the case of a snapshotted directory however, we need to fetch
the non-snapped parent, which involves walking back up the parents
in the tree. The dentries in the snapshot dir are effectively frozen
but the overall parent is _not_, and could change if a concurrent
rename were to occur.

Clean this code up and take special care to ensure the validity of
the entries we're working with. First, hold the rcu_read_lock so we
can ensure that any d_parent we find won't go away.

Change get_nonsnap_parent to return an inode, and take a reference to
that inode before returning (if any). Change the other places where we
set "inode" in __choose_mds to also take a reference, and then call iput
on that inode before exiting the function.

Link: http://tracker.ceph.com/issues/18148
Signed-off-by: Jeff Layton <jlayton@redhat.com>
---
 fs/ceph/mds_client.c | 59 ++++++++++++++++++++++++++++++++++------------------
 1 file changed, 39 insertions(+), 20 deletions(-)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 815acd1a56d4..d51cfd2c6def 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -667,6 +667,27 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
 }
 
 /*
+ * Walk back up the dentry tree until we hit a dentry representing a
+ * non-snapshot inode. We do this using the rcu_read_lock (which must be held
+ * when calling this) to ensure that the objects won't disappear while we're
+ * working with them. Once we hit a candidate dentry, we attempt to take a
+ * reference to it, and return that as the result.
+ */
+static struct inode *get_nonsnap_parent(struct dentry *dentry) { struct inode
+	*inode = NULL;
+
+	while (dentry && !IS_ROOT(dentry)) {
+		inode = d_inode_rcu(dentry);
+		if (!inode || ceph_snap(inode) == CEPH_NOSNAP)
+			break;
+		dentry = dentry->d_parent;
+	}
+	if (inode)
+		inode = igrab(inode);
+	return inode;
+}
+
+/*
  * Choose mds to send request to next.  If there is a hint set in the
  * request (e.g., due to a prior forward hint from the mds), use that.
  * Otherwise, consult frag tree and/or caps to identify the
@@ -674,19 +695,6 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
  *
  * Called under mdsc->mutex.
  */
-static struct dentry *get_nonsnap_parent(struct dentry *dentry)
-{
-	/*
-	 * we don't need to worry about protecting the d_parent access
-	 * here because we never renaming inside the snapped namespace
-	 * except to resplice to another snapdir, and either the old or new
-	 * result is a valid result.
-	 */
-	while (!IS_ROOT(dentry) && ceph_snap(d_inode(dentry)) != CEPH_NOSNAP)
-		dentry = dentry->d_parent;
-	return dentry;
-}
-
 static int __choose_mds(struct ceph_mds_client *mdsc,
 			struct ceph_mds_request *req)
 {
@@ -716,30 +724,38 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
 	inode = NULL;
 	if (req->r_inode) {
 		inode = req->r_inode;
+		ihold(inode);
 	} else if (req->r_dentry) {
 		/* ignore race with rename; old or new d_parent is okay */
-		struct dentry *parent = req->r_dentry->d_parent;
-		struct inode *dir = d_inode(parent);
+		struct dentry *parent;
+		struct inode *dir;
+
+		rcu_read_lock();
+		parent = req->r_dentry->d_parent;
+		dir = d_inode_rcu(parent);
 
 		if (dir->i_sb != mdsc->fsc->sb) {
 			/* not this fs! */
 			inode = d_inode(req->r_dentry);
+			ihold(inode);
 		} else if (ceph_snap(dir) != CEPH_NOSNAP) {
 			/* direct snapped/virtual snapdir requests
 			 * based on parent dir inode */
-			struct dentry *dn = get_nonsnap_parent(parent);
-			inode = d_inode(dn);
+			inode = get_nonsnap_parent(parent);
 			dout("__choose_mds using nonsnap parent %p\n", inode);
 		} else {
 			/* dentry target */
 			inode = d_inode(req->r_dentry);
 			if (!inode || mode == USE_AUTH_MDS) {
 				/* dir + name */
-				inode = dir;
+				inode = igrab(dir);
 				hash = ceph_dentry_hash(dir, req->r_dentry);
 				is_hash = true;
+			} else {
+				ihold(inode);
 			}
 		}
+		rcu_read_unlock();
 	}
 
 	dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash,
@@ -768,7 +784,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
 				     (int)r, frag.ndist);
 				if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
 				    CEPH_MDS_STATE_ACTIVE)
-					return mds;
+					goto out;
 			}
 
 			/* since this file/dir wasn't known to be
@@ -783,7 +799,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
 				     inode, ceph_vinop(inode), frag.frag, mds);
 				if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
 				    CEPH_MDS_STATE_ACTIVE)
-					return mds;
+					goto out;
 			}
 		}
 	}
@@ -796,6 +812,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
 		cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node);
 	if (!cap) {
 		spin_unlock(&ci->i_ceph_lock);
+		iput(inode);
 		goto random;
 	}
 	mds = cap->session->s_mds;
@@ -803,6 +820,8 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
 	     inode, ceph_vinop(inode), mds,
 	     cap == ci->i_auth_cap ? "auth " : "", cap);
 	spin_unlock(&ci->i_ceph_lock);
+out:
+	iput(inode);
 	return mds;
 
 random:
-- 
2.7.4


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 2/3] ceph: clean up unsafe d_parent accesses in build_dentry_path
  2016-12-13 18:04 [PATCH 0/3] ceph: clean up some unsafe dentry->d_parent accesses Jeff Layton
  2016-12-13 18:04 ` [PATCH 1/3] ceph: clean up unsafe d_parent access in __choose_mds Jeff Layton
@ 2016-12-13 18:04 ` Jeff Layton
  2016-12-13 18:04 ` [PATCH 3/3] ceph: fix unsafe dcache access in ceph_encode_dentry_release Jeff Layton
  2 siblings, 0 replies; 8+ messages in thread
From: Jeff Layton @ 2016-12-13 18:04 UTC (permalink / raw)
  To: ceph-devel; +Cc: zyan, sage, idryomov, linux-fsdevel, dhowells, viro

While we hold a reference to the dentry when build_dentry_path is
called, we could end up racing with a rename that changes d_parent.
Handle that situation correctly, by using the rcu_read_lock to
ensure that the parent dentry and inode stick around long enough
to safely check ceph_snap and ceph_ino.

Link: http://tracker.ceph.com/issues/18148
Signed-off-by: Jeff Layton <jlayton@redhat.com>
---
 fs/ceph/mds_client.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index d51cfd2c6def..9498c0d282ff 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1798,13 +1798,18 @@ static int build_dentry_path(struct dentry *dentry,
 			     int *pfreepath)
 {
 	char *path;
+	struct inode *dir;
 
-	if (ceph_snap(d_inode(dentry->d_parent)) == CEPH_NOSNAP) {
-		*pino = ceph_ino(d_inode(dentry->d_parent));
+	rcu_read_lock();
+	dir = d_inode_rcu(dentry->d_parent);
+	if (dir && ceph_snap(dir) == CEPH_NOSNAP) {
+		*pino = ceph_ino(dir);
+		rcu_read_unlock();
 		*ppath = dentry->d_name.name;
 		*ppathlen = dentry->d_name.len;
 		return 0;
 	}
+	rcu_read_unlock();
 	path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
 	if (IS_ERR(path))
 		return PTR_ERR(path);
-- 
2.7.4


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 3/3] ceph: fix unsafe dcache access in ceph_encode_dentry_release
  2016-12-13 18:04 [PATCH 0/3] ceph: clean up some unsafe dentry->d_parent accesses Jeff Layton
  2016-12-13 18:04 ` [PATCH 1/3] ceph: clean up unsafe d_parent access in __choose_mds Jeff Layton
  2016-12-13 18:04 ` [PATCH 2/3] ceph: clean up unsafe d_parent accesses in build_dentry_path Jeff Layton
@ 2016-12-13 18:04 ` Jeff Layton
  2 siblings, 0 replies; 8+ messages in thread
From: Jeff Layton @ 2016-12-13 18:04 UTC (permalink / raw)
  To: ceph-devel; +Cc: zyan, sage, idryomov, linux-fsdevel, dhowells, viro

Accessing d_parent requires some sort of locking or it could vanish
out from under us. Since we take the d_lock anyway, use that to fetch
d_parent and take a reference to it, and then use that reference to
call ceph_encode_inode_release.

Link: http://tracker.ceph.com/issues/18148
Signed-off-by: Jeff Layton <jlayton@redhat.com>
---
 fs/ceph/caps.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 16e6ded0b7f2..6bc5c1efbb26 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -3834,7 +3834,7 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
 int ceph_encode_dentry_release(void **p, struct dentry *dentry,
 			       int mds, int drop, int unless)
 {
-	struct inode *dir = d_inode(dentry->d_parent);
+	struct dentry *parent;
 	struct ceph_mds_request_release *rel = *p;
 	struct ceph_dentry_info *di = ceph_dentry(dentry);
 	int force = 0;
@@ -3849,9 +3849,12 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry,
 	spin_lock(&dentry->d_lock);
 	if (di->lease_session && di->lease_session->s_mds == mds)
 		force = 1;
+	parent = dget(dentry->d_parent);
 	spin_unlock(&dentry->d_lock);
 
-	ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force);
+	ret = ceph_encode_inode_release(p, d_inode(parent), mds, drop,
+					unless, force);
+	dput(parent);
 
 	spin_lock(&dentry->d_lock);
 	if (ret && di->lease_session && di->lease_session->s_mds == mds) {
-- 
2.7.4


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH 1/3] ceph: clean up unsafe d_parent access in __choose_mds
  2016-12-13 18:04 ` [PATCH 1/3] ceph: clean up unsafe d_parent access in __choose_mds Jeff Layton
@ 2016-12-14  8:58   ` Yan, Zheng
  2016-12-14 12:04     ` Jeff Layton
  2016-12-14 12:55   ` [PATCH v2] " Jeff Layton
  1 sibling, 1 reply; 8+ messages in thread
From: Yan, Zheng @ 2016-12-14  8:58 UTC (permalink / raw)
  To: Jeff Layton
  Cc: ceph-devel, Sage Weil, Ilya Dryomov, linux-fsdevel, David Howells, viro


> On 14 Dec 2016, at 02:04, Jeff Layton <jlayton@redhat.com> wrote:
> 
> __choose_mds exists to pick an MDS to use when issuing a call. Doing
> that typically involves picking an inode and using the authoritative
> MDS for it. In most cases, that's pretty straightforward, as we are
> using an inode to which we hold a reference (usually represented by
> r_dentry or r_inode in the request).
> 
> In the case of a snapshotted directory however, we need to fetch
> the non-snapped parent, which involves walking back up the parents
> in the tree. The dentries in the snapshot dir are effectively frozen
> but the overall parent is _not_, and could change if a concurrent
> rename were to occur.
> 
> Clean this code up and take special care to ensure the validity of
> the entries we're working with. First, hold the rcu_read_lock so we
> can ensure that any d_parent we find won't go away.
> 
> Change get_nonsnap_parent to return an inode, and take a reference to
> that inode before returning (if any). Change the other places where we
> set "inode" in __choose_mds to also take a reference, and then call iput
> on that inode before exiting the function.
> 
> Link: http://tracker.ceph.com/issues/18148
> Signed-off-by: Jeff Layton <jlayton@redhat.com>
> ---
> fs/ceph/mds_client.c | 59 ++++++++++++++++++++++++++++++++++------------------
> 1 file changed, 39 insertions(+), 20 deletions(-)
> 
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index 815acd1a56d4..d51cfd2c6def 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -667,6 +667,27 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
> }
> 
> /*
> + * Walk back up the dentry tree until we hit a dentry representing a
> + * non-snapshot inode. We do this using the rcu_read_lock (which must be held
> + * when calling this) to ensure that the objects won't disappear while we're
> + * working with them. Once we hit a candidate dentry, we attempt to take a
> + * reference to it, and return that as the result.
> + */
> +static struct inode *get_nonsnap_parent(struct dentry *dentry) { struct inode
> +	*inode = NULL;
> +
> +	while (dentry && !IS_ROOT(dentry)) {
> +		inode = d_inode_rcu(dentry);
> +		if (!inode || ceph_snap(inode) == CEPH_NOSNAP)
> +			break;
> +		dentry = dentry->d_parent;
> +	}
> +	if (inode)
> +		inode = igrab(inode);
> +	return inode;
> +}
> +
> +/*
>  * Choose mds to send request to next.  If there is a hint set in the
>  * request (e.g., due to a prior forward hint from the mds), use that.
>  * Otherwise, consult frag tree and/or caps to identify the
> @@ -674,19 +695,6 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
>  *
>  * Called under mdsc->mutex.
>  */
> -static struct dentry *get_nonsnap_parent(struct dentry *dentry)
> -{
> -	/*
> -	 * we don't need to worry about protecting the d_parent access
> -	 * here because we never renaming inside the snapped namespace
> -	 * except to resplice to another snapdir, and either the old or new
> -	 * result is a valid result.
> -	 */
> -	while (!IS_ROOT(dentry) && ceph_snap(d_inode(dentry)) != CEPH_NOSNAP)
> -		dentry = dentry->d_parent;
> -	return dentry;
> -}
> -
> static int __choose_mds(struct ceph_mds_client *mdsc,
> 			struct ceph_mds_request *req)
> {
> @@ -716,30 +724,38 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
> 	inode = NULL;
> 	if (req->r_inode) {
> 		inode = req->r_inode;
> +		ihold(inode);
> 	} else if (req->r_dentry) {
> 		/* ignore race with rename; old or new d_parent is okay */
> -		struct dentry *parent = req->r_dentry->d_parent;
> -		struct inode *dir = d_inode(parent);
> +		struct dentry *parent;
> +		struct inode *dir;
> +
> +		rcu_read_lock();
> +		parent = req->r_dentry->d_parent;
> +		dir = d_inode_rcu(parent);

d_inode_rcu(parent) can return null in theory. I think we should use req->r_locked_dir when it’s not null.
If both r_locked_dir and d_inode_rcu(parent) are nulls, return an error return the request sender.

Regards
Yan, Zheng 


> 
> 		if (dir->i_sb != mdsc->fsc->sb) {
> 			/* not this fs! */
> 			inode = d_inode(req->r_dentry);
> +			ihold(inode);
> 		} else if (ceph_snap(dir) != CEPH_NOSNAP) {
> 			/* direct snapped/virtual snapdir requests
> 			 * based on parent dir inode */
> -			struct dentry *dn = get_nonsnap_parent(parent);
> -			inode = d_inode(dn);
> +			inode = get_nonsnap_parent(parent);
> 			dout("__choose_mds using nonsnap parent %p\n", inode);
> 		} else {
> 			/* dentry target */
> 			inode = d_inode(req->r_dentry);
> 			if (!inode || mode == USE_AUTH_MDS) {
> 				/* dir + name */
> -				inode = dir;
> +				inode = igrab(dir);
> 				hash = ceph_dentry_hash(dir, req->r_dentry);
> 				is_hash = true;
> +			} else {
> +				ihold(inode);
> 			}
> 		}
> +		rcu_read_unlock();
> 	}
> 
> 	dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash,
> @@ -768,7 +784,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
> 				     (int)r, frag.ndist);
> 				if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
> 				    CEPH_MDS_STATE_ACTIVE)
> -					return mds;
> +					goto out;
> 			}
> 
> 			/* since this file/dir wasn't known to be
> @@ -783,7 +799,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
> 				     inode, ceph_vinop(inode), frag.frag, mds);
> 				if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
> 				    CEPH_MDS_STATE_ACTIVE)
> -					return mds;
> +					goto out;
> 			}
> 		}
> 	}
> @@ -796,6 +812,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
> 		cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node);
> 	if (!cap) {
> 		spin_unlock(&ci->i_ceph_lock);
> +		iput(inode);
> 		goto random;
> 	}
> 	mds = cap->session->s_mds;
> @@ -803,6 +820,8 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
> 	     inode, ceph_vinop(inode), mds,
> 	     cap == ci->i_auth_cap ? "auth " : "", cap);
> 	spin_unlock(&ci->i_ceph_lock);
> +out:
> +	iput(inode);
> 	return mds;
> 
> random:
> -- 
> 2.7.4
> 


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 1/3] ceph: clean up unsafe d_parent access in __choose_mds
  2016-12-14  8:58   ` Yan, Zheng
@ 2016-12-14 12:04     ` Jeff Layton
  0 siblings, 0 replies; 8+ messages in thread
From: Jeff Layton @ 2016-12-14 12:04 UTC (permalink / raw)
  To: Yan, Zheng
  Cc: ceph-devel, Sage Weil, Ilya Dryomov, linux-fsdevel, David Howells, viro

On Wed, 2016-12-14 at 16:58 +0800, Yan, Zheng wrote:
> > 
> > On 14 Dec 2016, at 02:04, Jeff Layton <jlayton@redhat.com> wrote:
> > 
> > __choose_mds exists to pick an MDS to use when issuing a call. Doing
> > that typically involves picking an inode and using the authoritative
> > MDS for it. In most cases, that's pretty straightforward, as we are
> > using an inode to which we hold a reference (usually represented by
> > r_dentry or r_inode in the request).
> > 
> > In the case of a snapshotted directory however, we need to fetch
> > the non-snapped parent, which involves walking back up the parents
> > in the tree. The dentries in the snapshot dir are effectively frozen
> > but the overall parent is _not_, and could change if a concurrent
> > rename were to occur.
> > 
> > Clean this code up and take special care to ensure the validity of
> > the entries we're working with. First, hold the rcu_read_lock so we
> > can ensure that any d_parent we find won't go away.
> > 
> > Change get_nonsnap_parent to return an inode, and take a reference to
> > that inode before returning (if any). Change the other places where we
> > set "inode" in __choose_mds to also take a reference, and then call iput
> > on that inode before exiting the function.
> > 
> > Link: http://tracker.ceph.com/issues/18148
> > Signed-off-by: Jeff Layton <jlayton@redhat.com>
> > ---
> > fs/ceph/mds_client.c | 59 ++++++++++++++++++++++++++++++++++------------------
> > 1 file changed, 39 insertions(+), 20 deletions(-)
> > 
> > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> > index 815acd1a56d4..d51cfd2c6def 100644
> > --- a/fs/ceph/mds_client.c
> > +++ b/fs/ceph/mds_client.c
> > @@ -667,6 +667,27 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
> > }
> > 
> > /*
> > + * Walk back up the dentry tree until we hit a dentry representing a
> > + * non-snapshot inode. We do this using the rcu_read_lock (which must be held
> > + * when calling this) to ensure that the objects won't disappear while we're
> > + * working with them. Once we hit a candidate dentry, we attempt to take a
> > + * reference to it, and return that as the result.
> > + */
> > +static struct inode *get_nonsnap_parent(struct dentry *dentry) { struct inode
> > +	*inode = NULL;
> > +
> > +	while (dentry && !IS_ROOT(dentry)) {
> > +		inode = d_inode_rcu(dentry);
> > +		if (!inode || ceph_snap(inode) == CEPH_NOSNAP)
> > +			break;
> > +		dentry = dentry->d_parent;
> > +	}
> > +	if (inode)
> > +		inode = igrab(inode);
> > +	return inode;
> > +}
> > +
> > +/*
> >  * Choose mds to send request to next.  If there is a hint set in the
> >  * request (e.g., due to a prior forward hint from the mds), use that.
> >  * Otherwise, consult frag tree and/or caps to identify the
> > @@ -674,19 +695,6 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
> >  *
> >  * Called under mdsc->mutex.
> >  */
> > -static struct dentry *get_nonsnap_parent(struct dentry *dentry)
> > -{
> > -	/*
> > -	 * we don't need to worry about protecting the d_parent access
> > -	 * here because we never renaming inside the snapped namespace
> > -	 * except to resplice to another snapdir, and either the old or new
> > -	 * result is a valid result.
> > -	 */
> > -	while (!IS_ROOT(dentry) && ceph_snap(d_inode(dentry)) != CEPH_NOSNAP)
> > -		dentry = dentry->d_parent;
> > -	return dentry;
> > -}
> > -
> > static int __choose_mds(struct ceph_mds_client *mdsc,
> > 			struct ceph_mds_request *req)
> > {
> > @@ -716,30 +724,38 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
> > 	inode = NULL;
> > 	if (req->r_inode) {
> > 		inode = req->r_inode;
> > +		ihold(inode);
> > 	} else if (req->r_dentry) {
> > 		/* ignore race with rename; old or new d_parent is okay */
> > -		struct dentry *parent = req->r_dentry->d_parent;
> > -		struct inode *dir = d_inode(parent);
> > +		struct dentry *parent;
> > +		struct inode *dir;
> > +
> > +		rcu_read_lock();
> > +		parent = req->r_dentry->d_parent;
> > +		dir = d_inode_rcu(parent);
> 
> d_inode_rcu(parent) can return null in theory. I think we should use req->r_locked_dir when it’s not null.
> If both r_locked_dir and d_inode_rcu(parent) are nulls, return an error return the request sender.
> 
> Regards
> Yan, Zheng 
> 
> 

Yes, good catch. d_parent can go negative if all you hold is the
rcu_read_lock. I don't think we need to return an error the case that
both are NULL though. In that event, we'll just "goto random" (which is
what we generally do in this function when we aren't sure which one to
use).

I'll respin...

Thanks,
Jeff

> > 
> > 
> > 		if (dir->i_sb != mdsc->fsc->sb) {
> > 			/* not this fs! */
> > 			inode = d_inode(req->r_dentry);
> > +			ihold(inode);
> > 		} else if (ceph_snap(dir) != CEPH_NOSNAP) {
> > 			/* direct snapped/virtual snapdir requests
> > 			 * based on parent dir inode */
> > -			struct dentry *dn = get_nonsnap_parent(parent);
> > -			inode = d_inode(dn);
> > +			inode = get_nonsnap_parent(parent);
> > 			dout("__choose_mds using nonsnap parent %p\n", inode);
> > 		} else {
> > 			/* dentry target */
> > 			inode = d_inode(req->r_dentry);
> > 			if (!inode || mode == USE_AUTH_MDS) {
> > 				/* dir + name */
> > -				inode = dir;
> > +				inode = igrab(dir);
> > 				hash = ceph_dentry_hash(dir, req->r_dentry);
> > 				is_hash = true;
> > +			} else {
> > +				ihold(inode);
> > 			}
> > 		}
> > +		rcu_read_unlock();
> > 	}
> > 
> > 	dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash,
> > @@ -768,7 +784,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
> > 				     (int)r, frag.ndist);
> > 				if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
> > 				    CEPH_MDS_STATE_ACTIVE)
> > -					return mds;
> > +					goto out;
> > 			}
> > 
> > 			/* since this file/dir wasn't known to be
> > @@ -783,7 +799,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
> > 				     inode, ceph_vinop(inode), frag.frag, mds);
> > 				if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
> > 				    CEPH_MDS_STATE_ACTIVE)
> > -					return mds;
> > +					goto out;
> > 			}
> > 		}
> > 	}
> > @@ -796,6 +812,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
> > 		cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node);
> > 	if (!cap) {
> > 		spin_unlock(&ci->i_ceph_lock);
> > +		iput(inode);
> > 		goto random;
> > 	}
> > 	mds = cap->session->s_mds;
> > @@ -803,6 +820,8 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
> > 	     inode, ceph_vinop(inode), mds,
> > 	     cap == ci->i_auth_cap ? "auth " : "", cap);
> > 	spin_unlock(&ci->i_ceph_lock);
> > +out:
> > +	iput(inode);
> > 	return mds;
> > 
> > random:
> > -- 
> > 2.7.4
> > 
> 

-- 
Jeff Layton <jlayton@redhat.com>

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH v2] ceph: clean up unsafe d_parent access in __choose_mds
  2016-12-13 18:04 ` [PATCH 1/3] ceph: clean up unsafe d_parent access in __choose_mds Jeff Layton
  2016-12-14  8:58   ` Yan, Zheng
@ 2016-12-14 12:55   ` Jeff Layton
  2016-12-14 13:21     ` Yan, Zheng
  1 sibling, 1 reply; 8+ messages in thread
From: Jeff Layton @ 2016-12-14 12:55 UTC (permalink / raw)
  To: ceph-devel; +Cc: zyan, sage, idryomov, linux-fsdevel, dhowells, viro

__choose_mds exists to pick an MDS to use when issuing a call. Doing
that typically involves picking an inode and using the authoritative
MDS for it. In most cases, that's pretty straightforward, as we are
using an inode to which we hold a reference (usually represented by
r_dentry or r_inode in the request).

In the case of a snapshotted directory however, we need to fetch
the non-snapped parent, which involves walking back up the parents
in the tree. The dentries in the snapshot dir are effectively frozen
but the overall parent is _not_, and could change if a concurrent
rename were to occur.

Clean this code up and take special care to ensure the validity of
the entries we're working with. First, hold the rcu_read_lock so we
can ensure that any d_parent we find won't go away.

Change get_nonsnap_parent to return an inode, and take a reference to
that inode before returning (if any). Change the other places where we
set "inode" in __choose_mds to also take a reference, and then call iput
on that inode before exiting the function.

Link: http://tracker.ceph.com/issues/18148
Signed-off-by: Jeff Layton <jlayton@redhat.com>
---
 fs/ceph/mds_client.c | 67 +++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 45 insertions(+), 22 deletions(-)

v2: try to use req->r_locked_dir if d_inode_rcu(parent) is NULL

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 815acd1a56d4..eef772155ab1 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -667,6 +667,27 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
 }
 
 /*
+ * Walk back up the dentry tree until we hit a dentry representing a
+ * non-snapshot inode. We do this using the rcu_read_lock (which must be held
+ * when calling this) to ensure that the objects won't disappear while we're
+ * working with them. Once we hit a candidate dentry, we attempt to take a
+ * reference to it, and return that as the result.
+ */
+static struct inode *get_nonsnap_parent(struct dentry *dentry) { struct inode
+	*inode = NULL;
+
+	while (dentry && !IS_ROOT(dentry)) {
+		inode = d_inode_rcu(dentry);
+		if (!inode || ceph_snap(inode) == CEPH_NOSNAP)
+			break;
+		dentry = dentry->d_parent;
+	}
+	if (inode)
+		inode = igrab(inode);
+	return inode;
+}
+
+/*
  * Choose mds to send request to next.  If there is a hint set in the
  * request (e.g., due to a prior forward hint from the mds), use that.
  * Otherwise, consult frag tree and/or caps to identify the
@@ -674,19 +695,6 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
  *
  * Called under mdsc->mutex.
  */
-static struct dentry *get_nonsnap_parent(struct dentry *dentry)
-{
-	/*
-	 * we don't need to worry about protecting the d_parent access
-	 * here because we never renaming inside the snapped namespace
-	 * except to resplice to another snapdir, and either the old or new
-	 * result is a valid result.
-	 */
-	while (!IS_ROOT(dentry) && ceph_snap(d_inode(dentry)) != CEPH_NOSNAP)
-		dentry = dentry->d_parent;
-	return dentry;
-}
-
 static int __choose_mds(struct ceph_mds_client *mdsc,
 			struct ceph_mds_request *req)
 {
@@ -716,30 +724,42 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
 	inode = NULL;
 	if (req->r_inode) {
 		inode = req->r_inode;
+		ihold(inode);
 	} else if (req->r_dentry) {
 		/* ignore race with rename; old or new d_parent is okay */
-		struct dentry *parent = req->r_dentry->d_parent;
-		struct inode *dir = d_inode(parent);
-
-		if (dir->i_sb != mdsc->fsc->sb) {
+		struct dentry *parent;
+		struct inode *dir;
+
+		rcu_read_lock();
+		parent = req->r_dentry->d_parent;
+		dir = d_inode_rcu(parent);
+
+		if (!dir) {
+			inode = req->r_locked_dir;
+			if (inode)
+				ihold(inode);
+		} else if (dir->i_sb != mdsc->fsc->sb) {
 			/* not this fs! */
 			inode = d_inode(req->r_dentry);
+			ihold(inode);
 		} else if (ceph_snap(dir) != CEPH_NOSNAP) {
 			/* direct snapped/virtual snapdir requests
 			 * based on parent dir inode */
-			struct dentry *dn = get_nonsnap_parent(parent);
-			inode = d_inode(dn);
+			inode = get_nonsnap_parent(parent);
 			dout("__choose_mds using nonsnap parent %p\n", inode);
 		} else {
 			/* dentry target */
 			inode = d_inode(req->r_dentry);
 			if (!inode || mode == USE_AUTH_MDS) {
 				/* dir + name */
-				inode = dir;
+				inode = igrab(dir);
 				hash = ceph_dentry_hash(dir, req->r_dentry);
 				is_hash = true;
+			} else {
+				ihold(inode);
 			}
 		}
+		rcu_read_unlock();
 	}
 
 	dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash,
@@ -768,7 +788,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
 				     (int)r, frag.ndist);
 				if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
 				    CEPH_MDS_STATE_ACTIVE)
-					return mds;
+					goto out;
 			}
 
 			/* since this file/dir wasn't known to be
@@ -783,7 +803,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
 				     inode, ceph_vinop(inode), frag.frag, mds);
 				if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
 				    CEPH_MDS_STATE_ACTIVE)
-					return mds;
+					goto out;
 			}
 		}
 	}
@@ -796,6 +816,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
 		cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node);
 	if (!cap) {
 		spin_unlock(&ci->i_ceph_lock);
+		iput(inode);
 		goto random;
 	}
 	mds = cap->session->s_mds;
@@ -803,6 +824,8 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
 	     inode, ceph_vinop(inode), mds,
 	     cap == ci->i_auth_cap ? "auth " : "", cap);
 	spin_unlock(&ci->i_ceph_lock);
+out:
+	iput(inode);
 	return mds;
 
 random:
-- 
2.7.4


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH v2] ceph: clean up unsafe d_parent access in __choose_mds
  2016-12-14 12:55   ` [PATCH v2] " Jeff Layton
@ 2016-12-14 13:21     ` Yan, Zheng
  0 siblings, 0 replies; 8+ messages in thread
From: Yan, Zheng @ 2016-12-14 13:21 UTC (permalink / raw)
  To: Jeff Layton
  Cc: ceph-devel, Sage Weil, Ilya Dryomov, linux-fsdevel,
	David Howells, Al Viro


> On 14 Dec 2016, at 20:55, Jeff Layton <jlayton@redhat.com> wrote:
> 
> __choose_mds exists to pick an MDS to use when issuing a call. Doing
> that typically involves picking an inode and using the authoritative
> MDS for it. In most cases, that's pretty straightforward, as we are
> using an inode to which we hold a reference (usually represented by
> r_dentry or r_inode in the request).
> 
> In the case of a snapshotted directory however, we need to fetch
> the non-snapped parent, which involves walking back up the parents
> in the tree. The dentries in the snapshot dir are effectively frozen
> but the overall parent is _not_, and could change if a concurrent
> rename were to occur.
> 
> Clean this code up and take special care to ensure the validity of
> the entries we're working with. First, hold the rcu_read_lock so we
> can ensure that any d_parent we find won't go away.
> 
> Change get_nonsnap_parent to return an inode, and take a reference to
> that inode before returning (if any). Change the other places where we
> set "inode" in __choose_mds to also take a reference, and then call iput
> on that inode before exiting the function.
> 
> Link: http://tracker.ceph.com/issues/18148
> Signed-off-by: Jeff Layton <jlayton@redhat.com>
> ---
> fs/ceph/mds_client.c | 67 +++++++++++++++++++++++++++++++++++-----------------
> 1 file changed, 45 insertions(+), 22 deletions(-)
> 
> v2: try to use req->r_locked_dir if d_inode_rcu(parent) is NULL
> 
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index 815acd1a56d4..eef772155ab1 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -667,6 +667,27 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
> }
> 
> /*
> + * Walk back up the dentry tree until we hit a dentry representing a
> + * non-snapshot inode. We do this using the rcu_read_lock (which must be held
> + * when calling this) to ensure that the objects won't disappear while we're
> + * working with them. Once we hit a candidate dentry, we attempt to take a
> + * reference to it, and return that as the result.
> + */
> +static struct inode *get_nonsnap_parent(struct dentry *dentry) { struct inode
> +	*inode = NULL;
> +
> +	while (dentry && !IS_ROOT(dentry)) {
> +		inode = d_inode_rcu(dentry);
> +		if (!inode || ceph_snap(inode) == CEPH_NOSNAP)
> +			break;
> +		dentry = dentry->d_parent;
> +	}
> +	if (inode)
> +		inode = igrab(inode);
> +	return inode;
> +}
> +
> +/*
>  * Choose mds to send request to next.  If there is a hint set in the
>  * request (e.g., due to a prior forward hint from the mds), use that.
>  * Otherwise, consult frag tree and/or caps to identify the
> @@ -674,19 +695,6 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
>  *
>  * Called under mdsc->mutex.
>  */
> -static struct dentry *get_nonsnap_parent(struct dentry *dentry)
> -{
> -	/*
> -	 * we don't need to worry about protecting the d_parent access
> -	 * here because we never renaming inside the snapped namespace
> -	 * except to resplice to another snapdir, and either the old or new
> -	 * result is a valid result.
> -	 */
> -	while (!IS_ROOT(dentry) && ceph_snap(d_inode(dentry)) != CEPH_NOSNAP)
> -		dentry = dentry->d_parent;
> -	return dentry;
> -}
> -
> static int __choose_mds(struct ceph_mds_client *mdsc,
> 			struct ceph_mds_request *req)
> {
> @@ -716,30 +724,42 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
> 	inode = NULL;
> 	if (req->r_inode) {
> 		inode = req->r_inode;
> +		ihold(inode);
> 	} else if (req->r_dentry) {
> 		/* ignore race with rename; old or new d_parent is okay */
> -		struct dentry *parent = req->r_dentry->d_parent;
> -		struct inode *dir = d_inode(parent);
> -
> -		if (dir->i_sb != mdsc->fsc->sb) {
> +		struct dentry *parent;
> +		struct inode *dir;
> +
> +		rcu_read_lock();
> +		parent = req->r_dentry->d_parent;
> +		dir = d_inode_rcu(parent);
> +
> +		if (!dir) {
> +			inode = req->r_locked_dir;
> +			if (inode)
> +				ihold(inode);


I prefer to first try using req->r_locked_dir. Because r_locked_dir is non-null in most cases. Use d_inode_rcu(parent) only when r_locked_dir is null.
For the other two patches, I think we also can first try r_locked_dir.

Regards
Yan, Zheng



> +		} else if (dir->i_sb != mdsc->fsc->sb) {
> 			/* not this fs! */
> 			inode = d_inode(req->r_dentry);
> +			ihold(inode);
> 		} else if (ceph_snap(dir) != CEPH_NOSNAP) {
> 			/* direct snapped/virtual snapdir requests
> 			 * based on parent dir inode */
> -			struct dentry *dn = get_nonsnap_parent(parent);
> -			inode = d_inode(dn);
> +			inode = get_nonsnap_parent(parent);
> 			dout("__choose_mds using nonsnap parent %p\n", inode);
> 		} else {
> 			/* dentry target */
> 			inode = d_inode(req->r_dentry);
> 			if (!inode || mode == USE_AUTH_MDS) {
> 				/* dir + name */
> -				inode = dir;
> +				inode = igrab(dir);
> 				hash = ceph_dentry_hash(dir, req->r_dentry);
> 				is_hash = true;
> +			} else {
> +				ihold(inode);
> 			}
> 		}
> +		rcu_read_unlock();
> 	}
> 
> 	dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash,
> @@ -768,7 +788,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
> 				     (int)r, frag.ndist);
> 				if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
> 				    CEPH_MDS_STATE_ACTIVE)
> -					return mds;
> +					goto out;
> 			}
> 
> 			/* since this file/dir wasn't known to be
> @@ -783,7 +803,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
> 				     inode, ceph_vinop(inode), frag.frag, mds);
> 				if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
> 				    CEPH_MDS_STATE_ACTIVE)
> -					return mds;
> +					goto out;
> 			}
> 		}
> 	}
> @@ -796,6 +816,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
> 		cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node);
> 	if (!cap) {
> 		spin_unlock(&ci->i_ceph_lock);
> +		iput(inode);
> 		goto random;
> 	}
> 	mds = cap->session->s_mds;
> @@ -803,6 +824,8 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
> 	     inode, ceph_vinop(inode), mds,
> 	     cap == ci->i_auth_cap ? "auth " : "", cap);
> 	spin_unlock(&ci->i_ceph_lock);
> +out:
> +	iput(inode);
> 	return mds;
> 
> random:
> -- 
> 2.7.4
> 


^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2016-12-14 13:21 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-12-13 18:04 [PATCH 0/3] ceph: clean up some unsafe dentry->d_parent accesses Jeff Layton
2016-12-13 18:04 ` [PATCH 1/3] ceph: clean up unsafe d_parent access in __choose_mds Jeff Layton
2016-12-14  8:58   ` Yan, Zheng
2016-12-14 12:04     ` Jeff Layton
2016-12-14 12:55   ` [PATCH v2] " Jeff Layton
2016-12-14 13:21     ` Yan, Zheng
2016-12-13 18:04 ` [PATCH 2/3] ceph: clean up unsafe d_parent accesses in build_dentry_path Jeff Layton
2016-12-13 18:04 ` [PATCH 3/3] ceph: fix unsafe dcache access in ceph_encode_dentry_release Jeff Layton

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.