Linux-Fsdevel Archive on lore.kernel.org
 help / color / Atom feed
From: NeilBrown <neilb@suse.com>
To: Al Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org,
	Linus Torvalds <torvalds@linux-foundation.org>,
	linux-kernel@vger.kernel.org
Subject: [PATCH 3/3] VFS: close race between getcwd() and d_move()
Date: Thu, 09 Nov 2017 14:22:07 +1100
Message-ID: <151019772763.30101.16040338743875884111.stgit@noble> (raw)
In-Reply-To: <151019756744.30101.3832608128627682973.stgit@noble>

d_move() will call __d_drop() and then __d_rehash()
on the dentry being moved.  This creates a small window
when the dentry appears to be unhashed.  Many tests
of d_unhashed() are made under ->d_lock and so are safe
from racing with this window, but some aren't.
In particular, getcwd() calls d_unlinked() (which calls
d_unhashed()) without d_lock protection, so it can race.

This races has been seen in practice with lustre, which uses d_move() as
part of name lookup.  See:
   https://jira.hpdd.intel.com/browse/LU-9735
It could race with a regular rename(), and result in ENOENT instead
of either the 'before' or 'after' name.

The race can be demonstrated with a simple program which
has two threads, one renaming a directory back and forth
while another calls getcwd() within that directory: it should never
fail, but does.  See:
  https://patchwork.kernel.org/patch/9455345/

We could fix this race by taking d_lock and rechecking when
d_unhashed() reports true.  Alternately when can remove the window,
which is the approach this patch takes.

When __d_drop and __d_rehash are used to move a dentry, an extra
flag is passed which causes d_hash.pprev to not be cleared, and
to not be tested.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 fs/dcache.c |   31 ++++++++++++++++++++-----------
 1 file changed, 20 insertions(+), 11 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index d5952306206b..3130d62f29c9 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -471,8 +471,11 @@ static void dentry_lru_add(struct dentry *dentry)
  * reason (NFS timeouts or autofs deletes).
  *
  * __d_drop requires dentry->d_lock.
+ * ___d_drop takes an extra @moving argument.
+ * If true, d_hash.pprev is not cleared, so there is no transient d_unhashed()
+ * state.
  */
-void __d_drop(struct dentry *dentry)
+static void inline ___d_drop(struct dentry *dentry, bool moving)
 {
 	if (!d_unhashed(dentry)) {
 		struct hlist_bl_head *b;
@@ -493,12 +496,18 @@ void __d_drop(struct dentry *dentry)
 		} else
 			hlist_bl_lock(b);
 		__hlist_bl_del(&dentry->d_hash);
-		dentry->d_hash.pprev = NULL;
+		if (likely(!moving))
+			dentry->d_hash.pprev = NULL;
 		hlist_bl_unlock(b);
 		/* After this call, in-progress rcu-walk path lookup will fail. */
 		write_seqcount_invalidate(&dentry->d_seq);
 	}
 }
+
+void __d_drop(struct dentry *dentry)
+{
+	___d_drop(dentry, false);
+}
 EXPORT_SYMBOL(__d_drop);
 
 void d_drop(struct dentry *dentry)
@@ -2387,10 +2396,10 @@ void d_delete(struct dentry * dentry)
 }
 EXPORT_SYMBOL(d_delete);
 
-static void __d_rehash(struct dentry *entry)
+static void __d_rehash(struct dentry *entry, bool moving)
 {
 	struct hlist_bl_head *b = d_hash(entry->d_name.hash);
-	BUG_ON(!d_unhashed(entry));
+	BUG_ON(!moving && !d_unhashed(entry));
 	hlist_bl_lock(b);
 	hlist_bl_add_head_rcu(&entry->d_hash, b);
 	hlist_bl_unlock(b);
@@ -2406,7 +2415,7 @@ static void __d_rehash(struct dentry *entry)
 void d_rehash(struct dentry * entry)
 {
 	spin_lock(&entry->d_lock);
-	__d_rehash(entry);
+	__d_rehash(entry, false);
 	spin_unlock(&entry->d_lock);
 }
 EXPORT_SYMBOL(d_rehash);
@@ -2580,7 +2589,7 @@ static inline void __d_add(struct dentry *dentry, struct inode *inode)
 		raw_write_seqcount_end(&dentry->d_seq);
 		fsnotify_update_flags(dentry);
 	}
-	__d_rehash(dentry);
+	__d_rehash(dentry, false);
 	if (dir)
 		end_dir_add(dir, n);
 	spin_unlock(&dentry->d_lock);
@@ -2642,7 +2651,7 @@ struct dentry *d_exact_alias(struct dentry *entry, struct inode *inode)
 			alias = NULL;
 		} else {
 			__dget_dlock(alias);
-			__d_rehash(alias);
+			__d_rehash(alias, false);
 			spin_unlock(&alias->d_lock);
 		}
 		spin_unlock(&inode->i_lock);
@@ -2828,8 +2837,8 @@ static void __d_move(struct dentry *dentry, struct dentry *target,
 
 	/* unhash both */
 	/* __d_drop does write_seqcount_barrier, but they're OK to nest. */
-	__d_drop(dentry);
-	__d_drop(target);
+	___d_drop(dentry, true);
+	___d_drop(target, exchange);
 
 	/* Switch the names.. */
 	if (exchange)
@@ -2838,9 +2847,9 @@ static void __d_move(struct dentry *dentry, struct dentry *target,
 		copy_name(dentry, target);
 
 	/* rehash in new place(s) */
-	__d_rehash(dentry);
+	__d_rehash(dentry, true);
 	if (exchange)
-		__d_rehash(target);
+		__d_rehash(target, true);
 
 	/* ... and switch them in the tree */
 	if (IS_ROOT(dentry)) {

  parent reply index

Thread overview: 21+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-11-09  3:22 [PATCH 0/3] Three VFS patch resends NeilBrown
2017-11-09  3:22 ` [PATCH 1/3] VFS: use synchronize_rcu_expedited() in namespace_unlock() NeilBrown
2017-11-09  3:22 ` [PATCH 2/3] Improve fairness when locking the per-superblock s_anon list NeilBrown
2017-11-09 19:52   ` Linus Torvalds
2017-11-09 20:50     ` Al Viro
2017-11-09 23:09       ` NeilBrown
2017-11-09 23:19         ` Al Viro
2017-11-10  0:02       ` Linus Torvalds
2017-11-10  8:50     ` Christoph Hellwig
2017-11-09  3:22 ` NeilBrown [this message]
2017-11-09 11:41   ` [PATCH 3/3] VFS: close race between getcwd() and d_move() Nikolay Borisov
2017-11-09 13:08     ` Matthew Wilcox
2017-11-09 16:02       ` Nikolay Borisov
2017-11-09 20:23   ` Linus Torvalds
2017-11-09 22:14     ` NeilBrown
2017-11-10  1:40       ` Linus Torvalds
2017-11-10  4:45         ` NeilBrown
2017-11-10 19:52           ` Linus Torvalds
2017-11-10 20:53           ` Al Viro
2017-11-21 23:50             ` Al Viro
2017-11-22  1:31               ` NeilBrown

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=151019772763.30101.16040338743875884111.stgit@noble \
    --to=neilb@suse.com \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=torvalds@linux-foundation.org \
    --cc=viro@zeniv.linux.org.uk \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Linux-Fsdevel Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-fsdevel/0 linux-fsdevel/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-fsdevel linux-fsdevel/ https://lore.kernel.org/linux-fsdevel \
		linux-fsdevel@vger.kernel.org
	public-inbox-index linux-fsdevel

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-fsdevel


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git