All of lore.kernel.org
 help / color / mirror / Atom feed
From: NeilBrown <neilb@suse.de>
To: Al Viro <viro@zeniv.linux.org.uk>,
	Linus Torvalds <torvalds@linux-foundation.org>,
	Daire Byrne <daire@dneg.com>,
	Trond Myklebust <trond.myklebust@hammerspace.com>,
	Chuck Lever <chuck.lever@oracle.com>
Cc: Linux NFS Mailing List <linux-nfs@vger.kernel.org>,
	linux-fsdevel@vger.kernel.org,
	LKML <linux-kernel@vger.kernel.org>
Subject: [PATCH 06/10] VFS: support concurrent renames.
Date: Fri, 26 Aug 2022 12:10:43 +1000	[thread overview]
Message-ID: <166147984375.25420.13018600986239729815.stgit@noble.brown> (raw)
In-Reply-To: <166147828344.25420.13834885828450967910.stgit@noble.brown>

Allow object can now be renamed from or to a directory in which a create
or unlink is concurrently happening.

Two or more renames with the one directory can also be concurrent.
s_vfs_rename_mutex still serialises lookups for cross-directory renames,
but the renames themselves can proceed concurrently.

A core part of this change is introducing lock_rename_lookup()
which both locks the directories and performs the lookups.
If the filesystem supports shared-lock updates and a wq is provided,
shared locks are used on directories, otherwise exclusive locks.
DCACHE_PAR_UPDATE is always set on the found dentries.

unlock_rename_lookup() performs appropriate unlocking.  It needs to be
told if a wq was provided to lock_rename_lookup().

As we may use alloc_dentry_parallel() which can block, we need to be
careful of the case where both names are the same, in the same
directory.  If the first ->lookup() chooses not to complete the lookup -
as may happen with LOOKUP_RENAME_TARGET - then the second will block.
LOOKUP_RENAME_TARGET is only expected on the first name listed, so we
make sure to lookup the second name given first.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 fs/namei.c            |  221 ++++++++++++++++++++++++++++++++++++++++++++-----
 include/linux/namei.h |   10 ++
 2 files changed, 208 insertions(+), 23 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 13f8ac9721be..a7c458cc787c 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3156,6 +3156,187 @@ void unlock_rename(struct dentry *p1, struct dentry *p2)
 }
 EXPORT_SYMBOL(unlock_rename);
 
+static struct dentry *lock_rename_lookup(struct dentry *p1, struct dentry *p2,
+					 struct dentry **d1p, struct dentry **d2p,
+					 struct qstr *last1, struct qstr *last2,
+					 unsigned int flags1, unsigned int flags2,
+					 wait_queue_head_t *wq)
+{
+	struct dentry *p;
+	struct dentry *d1, *d2;
+	bool ok1, ok2;
+	bool shared = wq && IS_PAR_UPDATE(p1->d_inode);
+
+	if (p1 == p2) {
+		if (shared)
+			inode_lock_shared_nested(p1->d_inode, I_MUTEX_PARENT);
+		else
+			inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
+	retry:
+		/* last1 is expected to be target so and might be looked up
+		 * lazily.  So look up last2 first to avoid the second look up
+		 * waiting for the first.
+		 */
+		d2 = __lookup_hash(last2, p2, flags2, wq);
+		if (IS_ERR(d2))
+			goto out_unlock_2;
+		d1 = __lookup_hash(last1, p1, flags1, wq);
+		if (IS_ERR(d1))
+			goto out_unlock_1;
+		*d1p = d1; *d2p = d2;
+
+		if (d1 < d2) {
+			ok1 = d_lock_update_nested(d1, p1, last1,
+						   I_MUTEX_PARENT);
+			ok2 = d_lock_update_nested(d2, p2, last2,
+						   I_MUTEX_PARENT2);
+		} else if (d1 > d2) {
+			ok2 = d_lock_update_nested(d2, p2, last2,
+						   I_MUTEX_PARENT);
+			ok1 = d_lock_update_nested(d1, p1, last1,
+						   I_MUTEX_PARENT2);
+		} else {
+			/* d1 == d2 !! */
+			ok1 = d_lock_update_nested(d1, p1, last1,
+						   I_MUTEX_PARENT);
+			ok2 = ok1;
+		}
+		if (!ok1 || !ok2) {
+			if (ok1)
+				d_unlock_update(d1);
+			if (ok2)
+				d_unlock_update(d2);
+			dput(d1);
+			dput(d2);
+			goto retry;
+		}
+		return NULL;
+	out_unlock_1:
+		d_lookup_done(d2);
+		dput(d2);
+		d2 = d1;
+	out_unlock_2:
+		if (shared)
+			inode_unlock_shared(p1->d_inode);
+		else
+			inode_unlock(p1->d_inode);
+		return d1;
+	}
+
+	mutex_lock(&p1->d_sb->s_vfs_rename_mutex);
+
+	if ((p = d_ancestor(p2, p1)) != NULL) {
+		if (shared) {
+			inode_lock_shared_nested(p2->d_inode, I_MUTEX_PARENT);
+			inode_lock_shared_nested(p1->d_inode, I_MUTEX_CHILD);
+		} else {
+			inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
+			inode_lock_nested(p1->d_inode, I_MUTEX_CHILD);
+		}
+	} else if ((p = d_ancestor(p1, p2)) != NULL) {
+		if (shared) {
+			inode_lock_shared_nested(p1->d_inode, I_MUTEX_PARENT);
+			inode_lock_shared_nested(p2->d_inode, I_MUTEX_CHILD);
+		} else {
+			inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
+			inode_lock_nested(p2->d_inode, I_MUTEX_CHILD);
+		}
+	} else {
+		if (shared) {
+			inode_lock_shared_nested(p1->d_inode, I_MUTEX_PARENT);
+			inode_lock_shared_nested(p2->d_inode, I_MUTEX_PARENT2);
+		} else {
+			inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
+			inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2);
+		}
+	}
+retry2:
+	d1 = __lookup_hash(last1, p1, flags1, wq);
+	if (IS_ERR(d1))
+		goto unlock_out_3;
+	d2 = __lookup_hash(last2, p2, flags2, wq);
+	if (IS_ERR(d2))
+		goto unlock_out_4;
+
+	if (d1 < d2) {
+		ok1 = d_lock_update_nested(d1, p1, last1, I_MUTEX_PARENT);
+		ok2 = d_lock_update_nested(d2, p2, last2, I_MUTEX_PARENT2);
+	} else {
+		ok2 = d_lock_update_nested(d2, p2, last2, I_MUTEX_PARENT);
+		ok1 = d_lock_update_nested(d1, p1, last1, I_MUTEX_PARENT2);
+	}
+	if (!ok1 || !ok2) {
+		if (ok1)
+			d_unlock_update(d1);
+		if (ok2)
+			d_unlock_update(d2);
+		dput(d1);
+		dput(d2);
+		goto retry2;
+	}
+	*d1p = d1;
+	*d2p = d2;
+	return p;
+unlock_out_4:
+	d_lookup_done(d1);
+	dput(d1);
+	d1 = d2;
+unlock_out_3:
+	if (shared) {
+		inode_unlock_shared(p1->d_inode);
+		inode_unlock_shared(p2->d_inode);
+	} else {
+		inode_unlock(p1->d_inode);
+		inode_unlock(p2->d_inode);
+	}
+	mutex_unlock(&p1->d_sb->s_vfs_rename_mutex);
+	return d1;
+}
+
+struct dentry *lock_rename_lookup_one(struct dentry *p1, struct dentry *p2,
+				      struct dentry **d1p, struct dentry **d2p,
+				      const char *name1, int nlen1,
+				      const char *name2, int nlen2,
+				      unsigned int flags1, unsigned int flags2,
+				      wait_queue_head_t *wq)
+{
+	struct qstr this1, this2;
+	int err;
+
+	err = lookup_one_common(&init_user_ns, name1, p1, nlen1, &this1);
+	if (err)
+		return ERR_PTR(err);
+	err = lookup_one_common(&init_user_ns, name2, p2, nlen2, &this2);
+	if (err)
+		return ERR_PTR(err);
+	return lock_rename_lookup(p1, p2, d1p, d2p, &this1, &this2,
+				  flags1, flags2, wq);
+}
+EXPORT_SYMBOL(lock_rename_lookup_one);
+
+void unlock_rename_lookup(struct dentry *p1, struct dentry *p2,
+			  struct dentry *d1, struct dentry *d2,
+			  bool with_wq)
+{
+	bool shared = with_wq && IS_PAR_UPDATE(p1->d_inode);
+	d_lookup_done(d1);
+	d_lookup_done(d2);
+	d_unlock_update(d1);
+	if (d2 != d1)
+		d_unlock_update(d2);
+	if (shared) {
+		inode_unlock_shared(p1->d_inode);
+		if (p1 != p2) {
+			inode_unlock_shared(p2->d_inode);
+			mutex_unlock(&p1->d_sb->s_vfs_rename_mutex);
+		}
+	} else
+		unlock_rename(p1, p2);
+	dput(d1);
+	dput(d2);
+}
+EXPORT_SYMBOL(unlock_rename_lookup);
+
 /**
  * mode_strip_umask - handle vfs umask stripping
  * @dir:	parent directory of the new inode
@@ -4945,6 +5126,7 @@ int do_renameat2(int olddfd, struct filename *from, int newdfd,
 	unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET;
 	bool should_retry = false;
 	int error = -EINVAL;
+	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
 
 	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
 		goto put_names;
@@ -4985,58 +5167,53 @@ int do_renameat2(int olddfd, struct filename *from, int newdfd,
 		goto exit2;
 
 retry_deleg:
-	trap = lock_rename(new_path.dentry, old_path.dentry);
-
-	old_dentry = __lookup_hash(&old_last, old_path.dentry,
-				   lookup_flags, NULL);
-	error = PTR_ERR(old_dentry);
-	if (IS_ERR(old_dentry))
+	trap = lock_rename_lookup(new_path.dentry, old_path.dentry,
+				  &new_dentry, &old_dentry,
+				  &new_last, &old_last,
+				  lookup_flags | target_flags, lookup_flags,
+				  &wq);
+	if (IS_ERR(trap))
 		goto exit3;
 	/* source must exist */
 	error = -ENOENT;
 	if (d_is_negative(old_dentry))
 		goto exit4;
-	new_dentry = __lookup_hash(&new_last, new_path.dentry,
-				   lookup_flags | target_flags, NULL);
-	error = PTR_ERR(new_dentry);
-	if (IS_ERR(new_dentry))
-		goto exit4;
 	error = -EEXIST;
 	if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry))
-		goto exit5;
+		goto exit4;
 	if (flags & RENAME_EXCHANGE) {
 		error = -ENOENT;
 		if (d_is_negative(new_dentry))
-			goto exit5;
+			goto exit4;
 
 		if (!d_is_dir(new_dentry)) {
 			error = -ENOTDIR;
 			if (new_last.name[new_last.len])
-				goto exit5;
+				goto exit4;
 		}
 	}
 	/* unless the source is a directory trailing slashes give -ENOTDIR */
 	if (!d_is_dir(old_dentry)) {
 		error = -ENOTDIR;
 		if (old_last.name[old_last.len])
-			goto exit5;
+			goto exit4;
 		if (!(flags & RENAME_EXCHANGE) && new_last.name[new_last.len])
-			goto exit5;
+			goto exit4;
 	}
 	/* source should not be ancestor of target */
 	error = -EINVAL;
 	if (old_dentry == trap)
-		goto exit5;
+		goto exit4;
 	/* target should not be an ancestor of source */
 	if (!(flags & RENAME_EXCHANGE))
 		error = -ENOTEMPTY;
 	if (new_dentry == trap)
-		goto exit5;
+		goto exit4;
 
 	error = security_path_rename(&old_path, old_dentry,
 				     &new_path, new_dentry, flags);
 	if (error)
-		goto exit5;
+		goto exit4;
 
 	rd.old_dir	   = old_path.dentry->d_inode;
 	rd.old_dentry	   = old_dentry;
@@ -5047,12 +5224,10 @@ int do_renameat2(int olddfd, struct filename *from, int newdfd,
 	rd.delegated_inode = &delegated_inode;
 	rd.flags	   = flags;
 	error = vfs_rename(&rd);
-exit5:
-	dput(new_dentry);
 exit4:
-	dput(old_dentry);
+	unlock_rename_lookup(new_path.dentry, old_path.dentry, new_dentry, old_dentry,
+			     true);
 exit3:
-	unlock_rename(new_path.dentry, old_path.dentry);
 	if (delegated_inode) {
 		error = break_deleg_wait(&delegated_inode);
 		if (!error)
diff --git a/include/linux/namei.h b/include/linux/namei.h
index b1a210a51210..29756921f69b 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -108,6 +108,16 @@ extern int follow_up(struct path *);
 
 extern struct dentry *lock_rename(struct dentry *, struct dentry *);
 extern void unlock_rename(struct dentry *, struct dentry *);
+extern struct dentry *lock_rename_lookup_one(
+	struct dentry *p1, struct dentry *p2,
+	struct dentry **d1p, struct dentry **d2p,
+	const char *name1, int nlen1,
+	const char *name2, int nlen2,
+	unsigned int flags1, unsigned int flags2,
+	wait_queue_head_t *wq);
+extern void unlock_rename_lookup(struct dentry *p1, struct dentry *p2,
+				 struct dentry *d1, struct dentry *d2,
+				 bool withwq);
 
 extern int __must_check nd_jump_link(struct path *path);
 



  parent reply	other threads:[~2022-08-26  2:18 UTC|newest]

Thread overview: 37+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-08-26  2:10 [PATCH/RFC 00/10 v5] Improve scalability of directory operations NeilBrown
2022-08-26  2:10 ` [PATCH 09/10] VFS: add LOOKUP_SILLY_RENAME NeilBrown
2022-08-27  1:21   ` Al Viro
2022-08-29  3:15     ` NeilBrown
2022-08-26  2:10 ` [PATCH 01/10] VFS: support parallel updates in the one directory NeilBrown
2022-08-26 19:06   ` Linus Torvalds
2022-08-26 23:06     ` NeilBrown
2022-08-27  0:13       ` Linus Torvalds
2022-08-27  0:23         ` Al Viro
2022-08-27 21:14         ` Al Viro
2022-08-27  0:17     ` Al Viro
2022-09-01  0:31       ` NeilBrown
2022-09-01  3:44         ` Al Viro
2022-08-27  3:43   ` Al Viro
2022-08-29  1:59     ` NeilBrown
2022-09-03  0:06       ` Al Viro
2022-09-03  1:40         ` NeilBrown
2022-09-03  2:12           ` Al Viro
2022-09-03 17:52             ` Al Viro
2022-09-04 23:33               ` NeilBrown
2022-08-26  2:10 ` [PATCH 08/10] NFSD: allow parallel creates from nfsd NeilBrown
2022-08-27  4:37   ` Al Viro
2022-08-29  3:12     ` NeilBrown
2022-08-26  2:10 ` [PATCH 05/10] VFS: export done_path_update() NeilBrown
2022-08-26  2:10 ` [PATCH 02/10] VFS: move EEXIST and ENOENT tests into lookup_hash_update() NeilBrown
2022-08-26  2:10 ` NeilBrown [this message]
2022-08-27  4:12   ` [PATCH 06/10] VFS: support concurrent renames Al Viro
2022-08-29  3:08     ` NeilBrown
2022-08-26  2:10 ` [PATCH 10/10] NFS: support parallel updates in the one directory NeilBrown
2022-08-26 15:31   ` John Stoffel
2022-08-26 23:13     ` NeilBrown
2022-08-26  2:10 ` [PATCH 03/10] VFS: move want_write checks into lookup_hash_update() NeilBrown
2022-08-27  3:48   ` Al Viro
2022-08-26  2:10 ` [PATCH 04/10] VFS: move dput() and mnt_drop_write() into done_path_update() NeilBrown
2022-08-26  2:10 ` [PATCH 07/10] VFS: hold DCACHE_PAR_UPDATE lock across d_revalidate() NeilBrown
2022-08-26 14:42 ` [PATCH/RFC 00/10 v5] Improve scalability of directory operations John Stoffel
2022-08-26 23:30   ` NeilBrown

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=166147984375.25420.13018600986239729815.stgit@noble.brown \
    --to=neilb@suse.de \
    --cc=chuck.lever@oracle.com \
    --cc=daire@dneg.com \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-nfs@vger.kernel.org \
    --cc=torvalds@linux-foundation.org \
    --cc=trond.myklebust@hammerspace.com \
    --cc=viro@zeniv.linux.org.uk \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.