linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: NeilBrown <neilb@suse.de>
To: Al Viro <viro@zeniv.linux.org.uk>,
	Linus Torvalds <torvalds@linux-foundation.org>,
	Daire Byrne <daire@dneg.com>,
	Trond Myklebust <trond.myklebust@hammerspace.com>,
	Chuck Lever <chuck.lever@oracle.com>
Cc: Linux NFS Mailing List <linux-nfs@vger.kernel.org>,
	linux-fsdevel@vger.kernel.org,
	LKML <linux-kernel@vger.kernel.org>
Subject: [PATCH 06/10] VFS: support concurrent renames.
Date: Fri, 26 Aug 2022 12:10:43 +1000	[thread overview]
Message-ID: <166147984375.25420.13018600986239729815.stgit@noble.brown> (raw)
In-Reply-To: <166147828344.25420.13834885828450967910.stgit@noble.brown>

Allow object can now be renamed from or to a directory in which a create
or unlink is concurrently happening.

Two or more renames with the one directory can also be concurrent.
s_vfs_rename_mutex still serialises lookups for cross-directory renames,
but the renames themselves can proceed concurrently.

A core part of this change is introducing lock_rename_lookup()
which both locks the directories and performs the lookups.
If the filesystem supports shared-lock updates and a wq is provided,
shared locks are used on directories, otherwise exclusive locks.
DCACHE_PAR_UPDATE is always set on the found dentries.

unlock_rename_lookup() performs appropriate unlocking.  It needs to be
told if a wq was provided to lock_rename_lookup().

As we may use alloc_dentry_parallel() which can block, we need to be
careful of the case where both names are the same, in the same
directory.  If the first ->lookup() chooses not to complete the lookup -
as may happen with LOOKUP_RENAME_TARGET - then the second will block.
LOOKUP_RENAME_TARGET is only expected on the first name listed, so we
make sure to lookup the second name given first.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 fs/namei.c            |  221 ++++++++++++++++++++++++++++++++++++++++++++-----
 include/linux/namei.h |   10 ++
 2 files changed, 208 insertions(+), 23 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 13f8ac9721be..a7c458cc787c 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3156,6 +3156,187 @@ void unlock_rename(struct dentry *p1, struct dentry *p2)
 }
 EXPORT_SYMBOL(unlock_rename);
 
+static struct dentry *lock_rename_lookup(struct dentry *p1, struct dentry *p2,
+					 struct dentry **d1p, struct dentry **d2p,
+					 struct qstr *last1, struct qstr *last2,
+					 unsigned int flags1, unsigned int flags2,
+					 wait_queue_head_t *wq)
+{
+	struct dentry *p;
+	struct dentry *d1, *d2;
+	bool ok1, ok2;
+	bool shared = wq && IS_PAR_UPDATE(p1->d_inode);
+
+	if (p1 == p2) {
+		if (shared)
+			inode_lock_shared_nested(p1->d_inode, I_MUTEX_PARENT);
+		else
+			inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
+	retry:
+		/* last1 is expected to be target so and might be looked up
+		 * lazily.  So look up last2 first to avoid the second look up
+		 * waiting for the first.
+		 */
+		d2 = __lookup_hash(last2, p2, flags2, wq);
+		if (IS_ERR(d2))
+			goto out_unlock_2;
+		d1 = __lookup_hash(last1, p1, flags1, wq);
+		if (IS_ERR(d1))
+			goto out_unlock_1;
+		*d1p = d1; *d2p = d2;
+
+		if (d1 < d2) {
+			ok1 = d_lock_update_nested(d1, p1, last1,
+						   I_MUTEX_PARENT);
+			ok2 = d_lock_update_nested(d2, p2, last2,
+						   I_MUTEX_PARENT2);
+		} else if (d1 > d2) {
+			ok2 = d_lock_update_nested(d2, p2, last2,
+						   I_MUTEX_PARENT);
+			ok1 = d_lock_update_nested(d1, p1, last1,
+						   I_MUTEX_PARENT2);
+		} else {
+			/* d1 == d2 !! */
+			ok1 = d_lock_update_nested(d1, p1, last1,
+						   I_MUTEX_PARENT);
+			ok2 = ok1;
+		}
+		if (!ok1 || !ok2) {
+			if (ok1)
+				d_unlock_update(d1);
+			if (ok2)
+				d_unlock_update(d2);
+			dput(d1);
+			dput(d2);
+			goto retry;
+		}
+		return NULL;
+	out_unlock_1:
+		d_lookup_done(d2);
+		dput(d2);
+		d2 = d1;
+	out_unlock_2:
+		if (shared)
+			inode_unlock_shared(p1->d_inode);
+		else
+			inode_unlock(p1->d_inode);
+		return d1;
+	}
+
+	mutex_lock(&p1->d_sb->s_vfs_rename_mutex);
+
+	if ((p = d_ancestor(p2, p1)) != NULL) {
+		if (shared) {
+			inode_lock_shared_nested(p2->d_inode, I_MUTEX_PARENT);
+			inode_lock_shared_nested(p1->d_inode, I_MUTEX_CHILD);
+		} else {
+			inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
+			inode_lock_nested(p1->d_inode, I_MUTEX_CHILD);
+		}
+	} else if ((p = d_ancestor(p1, p2)) != NULL) {
+		if (shared) {
+			inode_lock_shared_nested(p1->d_inode, I_MUTEX_PARENT);
+			inode_lock_shared_nested(p2->d_inode, I_MUTEX_CHILD);
+		} else {
+			inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
+			inode_lock_nested(p2->d_inode, I_MUTEX_CHILD);
+		}
+	} else {
+		if (shared) {
+			inode_lock_shared_nested(p1->d_inode, I_MUTEX_PARENT);
+			inode_lock_shared_nested(p2->d_inode, I_MUTEX_PARENT2);
+		} else {
+			inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
+			inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2);
+		}
+	}
+retry2:
+	d1 = __lookup_hash(last1, p1, flags1, wq);
+	if (IS_ERR(d1))
+		goto unlock_out_3;
+	d2 = __lookup_hash(last2, p2, flags2, wq);
+	if (IS_ERR(d2))
+		goto unlock_out_4;
+
+	if (d1 < d2) {
+		ok1 = d_lock_update_nested(d1, p1, last1, I_MUTEX_PARENT);
+		ok2 = d_lock_update_nested(d2, p2, last2, I_MUTEX_PARENT2);
+	} else {
+		ok2 = d_lock_update_nested(d2, p2, last2, I_MUTEX_PARENT);
+		ok1 = d_lock_update_nested(d1, p1, last1, I_MUTEX_PARENT2);
+	}
+	if (!ok1 || !ok2) {
+		if (ok1)
+			d_unlock_update(d1);
+		if (ok2)
+			d_unlock_update(d2);
+		dput(d1);
+		dput(d2);
+		goto retry2;
+	}
+	*d1p = d1;
+	*d2p = d2;
+	return p;
+unlock_out_4:
+	d_lookup_done(d1);
+	dput(d1);
+	d1 = d2;
+unlock_out_3:
+	if (shared) {
+		inode_unlock_shared(p1->d_inode);
+		inode_unlock_shared(p2->d_inode);
+	} else {
+		inode_unlock(p1->d_inode);
+		inode_unlock(p2->d_inode);
+	}
+	mutex_unlock(&p1->d_sb->s_vfs_rename_mutex);
+	return d1;
+}
+
+struct dentry *lock_rename_lookup_one(struct dentry *p1, struct dentry *p2,
+				      struct dentry **d1p, struct dentry **d2p,
+				      const char *name1, int nlen1,
+				      const char *name2, int nlen2,
+				      unsigned int flags1, unsigned int flags2,
+				      wait_queue_head_t *wq)
+{
+	struct qstr this1, this2;
+	int err;
+
+	err = lookup_one_common(&init_user_ns, name1, p1, nlen1, &this1);
+	if (err)
+		return ERR_PTR(err);
+	err = lookup_one_common(&init_user_ns, name2, p2, nlen2, &this2);
+	if (err)
+		return ERR_PTR(err);
+	return lock_rename_lookup(p1, p2, d1p, d2p, &this1, &this2,
+				  flags1, flags2, wq);
+}
+EXPORT_SYMBOL(lock_rename_lookup_one);
+
+void unlock_rename_lookup(struct dentry *p1, struct dentry *p2,
+			  struct dentry *d1, struct dentry *d2,
+			  bool with_wq)
+{
+	bool shared = with_wq && IS_PAR_UPDATE(p1->d_inode);
+	d_lookup_done(d1);
+	d_lookup_done(d2);
+	d_unlock_update(d1);
+	if (d2 != d1)
+		d_unlock_update(d2);
+	if (shared) {
+		inode_unlock_shared(p1->d_inode);
+		if (p1 != p2) {
+			inode_unlock_shared(p2->d_inode);
+			mutex_unlock(&p1->d_sb->s_vfs_rename_mutex);
+		}
+	} else
+		unlock_rename(p1, p2);
+	dput(d1);
+	dput(d2);
+}
+EXPORT_SYMBOL(unlock_rename_lookup);
+
 /**
  * mode_strip_umask - handle vfs umask stripping
  * @dir:	parent directory of the new inode
@@ -4945,6 +5126,7 @@ int do_renameat2(int olddfd, struct filename *from, int newdfd,
 	unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET;
 	bool should_retry = false;
 	int error = -EINVAL;
+	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
 
 	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
 		goto put_names;
@@ -4985,58 +5167,53 @@ int do_renameat2(int olddfd, struct filename *from, int newdfd,
 		goto exit2;
 
 retry_deleg:
-	trap = lock_rename(new_path.dentry, old_path.dentry);
-
-	old_dentry = __lookup_hash(&old_last, old_path.dentry,
-				   lookup_flags, NULL);
-	error = PTR_ERR(old_dentry);
-	if (IS_ERR(old_dentry))
+	trap = lock_rename_lookup(new_path.dentry, old_path.dentry,
+				  &new_dentry, &old_dentry,
+				  &new_last, &old_last,
+				  lookup_flags | target_flags, lookup_flags,
+				  &wq);
+	if (IS_ERR(trap))
 		goto exit3;
 	/* source must exist */
 	error = -ENOENT;
 	if (d_is_negative(old_dentry))
 		goto exit4;
-	new_dentry = __lookup_hash(&new_last, new_path.dentry,
-				   lookup_flags | target_flags, NULL);
-	error = PTR_ERR(new_dentry);
-	if (IS_ERR(new_dentry))
-		goto exit4;
 	error = -EEXIST;
 	if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry))
-		goto exit5;
+		goto exit4;
 	if (flags & RENAME_EXCHANGE) {
 		error = -ENOENT;
 		if (d_is_negative(new_dentry))
-			goto exit5;
+			goto exit4;
 
 		if (!d_is_dir(new_dentry)) {
 			error = -ENOTDIR;
 			if (new_last.name[new_last.len])
-				goto exit5;
+				goto exit4;
 		}
 	}
 	/* unless the source is a directory trailing slashes give -ENOTDIR */
 	if (!d_is_dir(old_dentry)) {
 		error = -ENOTDIR;
 		if (old_last.name[old_last.len])
-			goto exit5;
+			goto exit4;
 		if (!(flags & RENAME_EXCHANGE) && new_last.name[new_last.len])
-			goto exit5;
+			goto exit4;
 	}
 	/* source should not be ancestor of target */
 	error = -EINVAL;
 	if (old_dentry == trap)
-		goto exit5;
+		goto exit4;
 	/* target should not be an ancestor of source */
 	if (!(flags & RENAME_EXCHANGE))
 		error = -ENOTEMPTY;
 	if (new_dentry == trap)
-		goto exit5;
+		goto exit4;
 
 	error = security_path_rename(&old_path, old_dentry,
 				     &new_path, new_dentry, flags);
 	if (error)
-		goto exit5;
+		goto exit4;
 
 	rd.old_dir	   = old_path.dentry->d_inode;
 	rd.old_dentry	   = old_dentry;
@@ -5047,12 +5224,10 @@ int do_renameat2(int olddfd, struct filename *from, int newdfd,
 	rd.delegated_inode = &delegated_inode;
 	rd.flags	   = flags;
 	error = vfs_rename(&rd);
-exit5:
-	dput(new_dentry);
 exit4:
-	dput(old_dentry);
+	unlock_rename_lookup(new_path.dentry, old_path.dentry, new_dentry, old_dentry,
+			     true);
 exit3:
-	unlock_rename(new_path.dentry, old_path.dentry);
 	if (delegated_inode) {
 		error = break_deleg_wait(&delegated_inode);
 		if (!error)
diff --git a/include/linux/namei.h b/include/linux/namei.h
index b1a210a51210..29756921f69b 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -108,6 +108,16 @@ extern int follow_up(struct path *);
 
 extern struct dentry *lock_rename(struct dentry *, struct dentry *);
 extern void unlock_rename(struct dentry *, struct dentry *);
+extern struct dentry *lock_rename_lookup_one(
+	struct dentry *p1, struct dentry *p2,
+	struct dentry **d1p, struct dentry **d2p,
+	const char *name1, int nlen1,
+	const char *name2, int nlen2,
+	unsigned int flags1, unsigned int flags2,
+	wait_queue_head_t *wq);
+extern void unlock_rename_lookup(struct dentry *p1, struct dentry *p2,
+				 struct dentry *d1, struct dentry *d2,
+				 bool withwq);
 
 extern int __must_check nd_jump_link(struct path *path);
 



  parent reply	other threads:[~2022-08-26  2:18 UTC|newest]

Thread overview: 37+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-08-26  2:10 [PATCH/RFC 00/10 v5] Improve scalability of directory operations NeilBrown
2022-08-26  2:10 ` [PATCH 09/10] VFS: add LOOKUP_SILLY_RENAME NeilBrown
2022-08-27  1:21   ` Al Viro
2022-08-29  3:15     ` NeilBrown
2022-08-26  2:10 ` [PATCH 01/10] VFS: support parallel updates in the one directory NeilBrown
2022-08-26 19:06   ` Linus Torvalds
2022-08-26 23:06     ` NeilBrown
2022-08-27  0:13       ` Linus Torvalds
2022-08-27  0:23         ` Al Viro
2022-08-27 21:14         ` Al Viro
2022-08-27  0:17     ` Al Viro
2022-09-01  0:31       ` NeilBrown
2022-09-01  3:44         ` Al Viro
2022-08-27  3:43   ` Al Viro
2022-08-29  1:59     ` NeilBrown
2022-09-03  0:06       ` Al Viro
2022-09-03  1:40         ` NeilBrown
2022-09-03  2:12           ` Al Viro
2022-09-03 17:52             ` Al Viro
2022-09-04 23:33               ` NeilBrown
2022-08-26  2:10 ` [PATCH 08/10] NFSD: allow parallel creates from nfsd NeilBrown
2022-08-27  4:37   ` Al Viro
2022-08-29  3:12     ` NeilBrown
2022-08-26  2:10 ` [PATCH 05/10] VFS: export done_path_update() NeilBrown
2022-08-26  2:10 ` [PATCH 02/10] VFS: move EEXIST and ENOENT tests into lookup_hash_update() NeilBrown
2022-08-26  2:10 ` NeilBrown [this message]
2022-08-27  4:12   ` [PATCH 06/10] VFS: support concurrent renames Al Viro
2022-08-29  3:08     ` NeilBrown
2022-08-26  2:10 ` [PATCH 10/10] NFS: support parallel updates in the one directory NeilBrown
2022-08-26 15:31   ` John Stoffel
2022-08-26 23:13     ` NeilBrown
2022-08-26  2:10 ` [PATCH 03/10] VFS: move want_write checks into lookup_hash_update() NeilBrown
2022-08-27  3:48   ` Al Viro
2022-08-26  2:10 ` [PATCH 04/10] VFS: move dput() and mnt_drop_write() into done_path_update() NeilBrown
2022-08-26  2:10 ` [PATCH 07/10] VFS: hold DCACHE_PAR_UPDATE lock across d_revalidate() NeilBrown
2022-08-26 14:42 ` [PATCH/RFC 00/10 v5] Improve scalability of directory operations John Stoffel
2022-08-26 23:30   ` NeilBrown

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=166147984375.25420.13018600986239729815.stgit@noble.brown \
    --to=neilb@suse.de \
    --cc=chuck.lever@oracle.com \
    --cc=daire@dneg.com \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-nfs@vger.kernel.org \
    --cc=torvalds@linux-foundation.org \
    --cc=trond.myklebust@hammerspace.com \
    --cc=viro@zeniv.linux.org.uk \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).