From: NeilBrown <neilb@suse.de>
To: Al Viro <viro@zeniv.linux.org.uk>,
Linus Torvalds <torvalds@linux-foundation.org>,
Daire Byrne <daire@dneg.com>,
Trond Myklebust <trond.myklebust@hammerspace.com>,
Chuck Lever <chuck.lever@oracle.com>
Cc: Linux NFS Mailing List <linux-nfs@vger.kernel.org>,
linux-fsdevel@vger.kernel.org,
LKML <linux-kernel@vger.kernel.org>
Subject: [PATCH 06/10] VFS: support concurrent renames.
Date: Fri, 26 Aug 2022 12:10:43 +1000 [thread overview]
Message-ID: <166147984375.25420.13018600986239729815.stgit@noble.brown> (raw)
In-Reply-To: <166147828344.25420.13834885828450967910.stgit@noble.brown>
An object can now be renamed from or to a directory in which a create
or unlink is concurrently happening.
Two or more renames within the same directory can also be concurrent.
s_vfs_rename_mutex still serialises lookups for cross-directory renames,
but the renames themselves can proceed concurrently.
A core part of this change is introducing lock_rename_lookup()
which both locks the directories and performs the lookups.
If the filesystem supports shared-lock updates and a wq is provided,
shared locks are used on directories, otherwise exclusive locks.
DCACHE_PAR_UPDATE is always set on the found dentries.
unlock_rename_lookup() performs appropriate unlocking. It needs to be
told if a wq was provided to lock_rename_lookup().
As we may use alloc_dentry_parallel() which can block, we need to be
careful of the case where both names are the same, in the same
directory. If the first ->lookup() chooses not to complete the lookup -
as may happen with LOOKUP_RENAME_TARGET - then the second will block.
LOOKUP_RENAME_TARGET is only expected on the first name listed, so we
make sure to look up the second of the two given names first.
Signed-off-by: NeilBrown <neilb@suse.de>
---
fs/namei.c | 221 ++++++++++++++++++++++++++++++++++++++++++++-----
include/linux/namei.h | 10 ++
2 files changed, 208 insertions(+), 23 deletions(-)
diff --git a/fs/namei.c b/fs/namei.c
index 13f8ac9721be..a7c458cc787c 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3156,6 +3156,187 @@ void unlock_rename(struct dentry *p1, struct dentry *p2)
}
EXPORT_SYMBOL(unlock_rename);
+/**
+ * lock_rename_lookup - lock two directories for rename and look up both names
+ * @p1: first parent directory
+ * @p2: second parent directory (may be the same dentry as @p1)
+ * @d1p: on success, receives the dentry found for @last1 in @p1
+ * @d2p: on success, receives the dentry found for @last2 in @p2
+ * @last1: name to look up in @p1
+ * @last2: name to look up in @p2
+ * @flags1: lookup flags used for @last1
+ * @flags2: lookup flags used for @last2
+ * @wq: wait queue handed to __lookup_hash(); may be NULL
+ *
+ * The directories are locked shared when @wq is non-NULL and
+ * IS_PAR_UPDATE() holds for @p1's inode, otherwise exclusively.  When the
+ * parents differ, s_vfs_rename_mutex is taken first.  Both names are then
+ * looked up and both dentries are locked with d_lock_update_nested(); if
+ * either of those locks cannot be obtained, both dentries are dropped and
+ * the lookups are repeated with the directory locks still held.
+ *
+ * Return: an ERR_PTR() if either lookup failed, in which case every lock
+ * taken here has been released and *@d1p / *@d2p must not be used.
+ * Otherwise NULL when @p1 == @p2, or the result of d_ancestor() (possibly
+ * NULL) when the parents differ; the caller must later undo the locking
+ * with unlock_rename_lookup().
+ */
+static struct dentry *lock_rename_lookup(struct dentry *p1, struct dentry *p2,
+					  struct dentry **d1p, struct dentry **d2p,
+					  struct qstr *last1, struct qstr *last2,
+					  unsigned int flags1, unsigned int flags2,
+					  wait_queue_head_t *wq)
+{
+	struct dentry *p;
+	struct dentry *d1, *d2;
+	bool ok1, ok2;
+	bool shared = wq && IS_PAR_UPDATE(p1->d_inode);
+
+	if (p1 == p2) {
+		if (shared)
+			inode_lock_shared_nested(p1->d_inode, I_MUTEX_PARENT);
+		else
+			inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
+retry:
+		/* last1 is expected to be the target and so might be looked
+		 * up lazily.  Look up last2 first so that the second lookup
+		 * does not end up waiting for the first.
+		 */
+		d2 = __lookup_hash(last2, p2, flags2, wq);
+		if (IS_ERR(d2))
+			goto out_unlock_2;
+		d1 = __lookup_hash(last1, p1, flags1, wq);
+		if (IS_ERR(d1))
+			goto out_unlock_1;
+		*d1p = d1; *d2p = d2;
+
+		/* Take the per-dentry locks in address order - presumably so
+		 * that concurrent renames agree on the ordering.
+		 */
+		if (d1 < d2) {
+			ok1 = d_lock_update_nested(d1, p1, last1,
+						   I_MUTEX_PARENT);
+			ok2 = d_lock_update_nested(d2, p2, last2,
+						   I_MUTEX_PARENT2);
+		} else if (d1 > d2) {
+			ok2 = d_lock_update_nested(d2, p2, last2,
+						   I_MUTEX_PARENT);
+			ok1 = d_lock_update_nested(d1, p1, last1,
+						   I_MUTEX_PARENT2);
+		} else {
+			/* d1 == d2 !! Only lock the dentry once. */
+			ok1 = d_lock_update_nested(d1, p1, last1,
+						   I_MUTEX_PARENT);
+			ok2 = ok1;
+		}
+		if (!ok1 || !ok2) {
+			if (ok1)
+				d_unlock_update(d1);
+			if (ok2)
+				d_unlock_update(d2);
+			dput(d1);
+			dput(d2);
+			goto retry;
+		}
+		return NULL;
+out_unlock_1:
+		d_lookup_done(d2);
+		dput(d2);
+		d2 = d1;
+out_unlock_2:
+		if (shared)
+			inode_unlock_shared(p1->d_inode);
+		else
+			inode_unlock(p1->d_inode);
+		/* d2 carries the error on both paths into here: either the
+		 * last2 lookup failed, or out_unlock_1 copied d1's error into
+		 * it.  d1 is not initialised when the last2 lookup fails, so
+		 * it must not be returned.
+		 */
+		return d2;
+	}
+
+	mutex_lock(&p1->d_sb->s_vfs_rename_mutex);
+
+	if ((p = d_ancestor(p2, p1)) != NULL) {
+		if (shared) {
+			inode_lock_shared_nested(p2->d_inode, I_MUTEX_PARENT);
+			inode_lock_shared_nested(p1->d_inode, I_MUTEX_CHILD);
+		} else {
+			inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
+			inode_lock_nested(p1->d_inode, I_MUTEX_CHILD);
+		}
+	} else if ((p = d_ancestor(p1, p2)) != NULL) {
+		if (shared) {
+			inode_lock_shared_nested(p1->d_inode, I_MUTEX_PARENT);
+			inode_lock_shared_nested(p2->d_inode, I_MUTEX_CHILD);
+		} else {
+			inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
+			inode_lock_nested(p2->d_inode, I_MUTEX_CHILD);
+		}
+	} else {
+		if (shared) {
+			inode_lock_shared_nested(p1->d_inode, I_MUTEX_PARENT);
+			inode_lock_shared_nested(p2->d_inode, I_MUTEX_PARENT2);
+		} else {
+			inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
+			inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2);
+		}
+	}
+retry2:
+	d1 = __lookup_hash(last1, p1, flags1, wq);
+	if (IS_ERR(d1))
+		goto unlock_out_3;
+	d2 = __lookup_hash(last2, p2, flags2, wq);
+	if (IS_ERR(d2))
+		goto unlock_out_4;
+
+	/* As in the same-directory case, lock the dentries in address order. */
+	if (d1 < d2) {
+		ok1 = d_lock_update_nested(d1, p1, last1, I_MUTEX_PARENT);
+		ok2 = d_lock_update_nested(d2, p2, last2, I_MUTEX_PARENT2);
+	} else {
+		ok2 = d_lock_update_nested(d2, p2, last2, I_MUTEX_PARENT);
+		ok1 = d_lock_update_nested(d1, p1, last1, I_MUTEX_PARENT2);
+	}
+	if (!ok1 || !ok2) {
+		if (ok1)
+			d_unlock_update(d1);
+		if (ok2)
+			d_unlock_update(d2);
+		dput(d1);
+		dput(d2);
+		goto retry2;
+	}
+	*d1p = d1;
+	*d2p = d2;
+	return p;
+unlock_out_4:
+	d_lookup_done(d1);
+	dput(d1);
+	/* Propagate the last2 lookup error through the common exit. */
+	d1 = d2;
+unlock_out_3:
+	if (shared) {
+		inode_unlock_shared(p1->d_inode);
+		inode_unlock_shared(p2->d_inode);
+	} else {
+		inode_unlock(p1->d_inode);
+		inode_unlock(p2->d_inode);
+	}
+	mutex_unlock(&p1->d_sb->s_vfs_rename_mutex);
+	return d1;
+}
+
+/**
+ * lock_rename_lookup_one - lock_rename_lookup() for names given as strings
+ * @p1: first parent directory
+ * @p2: second parent directory
+ * @d1p: receives the dentry found for @name1 on success
+ * @d2p: receives the dentry found for @name2 on success
+ * @name1: name to look up in @p1
+ * @nlen1: length of @name1
+ * @name2: name to look up in @p2
+ * @nlen2: length of @name2
+ * @flags1: lookup flags for @name1
+ * @flags2: lookup flags for @name2
+ * @wq: wait queue forwarded to lock_rename_lookup()
+ *
+ * Each name is first passed through lookup_one_common() (with
+ * &init_user_ns) to fill in a struct qstr; @name1 is handled before
+ * @name2 and the first failure is returned as an ERR_PTR() without any
+ * lock having been taken.  Otherwise the result of lock_rename_lookup()
+ * is returned unchanged.
+ */
+struct dentry *lock_rename_lookup_one(struct dentry *p1, struct dentry *p2,
+				      struct dentry **d1p, struct dentry **d2p,
+				      const char *name1, int nlen1,
+				      const char *name2, int nlen2,
+				      unsigned int flags1, unsigned int flags2,
+				      wait_queue_head_t *wq)
+{
+	struct qstr q1, q2;
+	int rc;
+
+	rc = lookup_one_common(&init_user_ns, name1, p1, nlen1, &q1);
+	if (!rc)
+		rc = lookup_one_common(&init_user_ns, name2, p2, nlen2, &q2);
+	if (rc)
+		return ERR_PTR(rc);
+
+	return lock_rename_lookup(p1, p2, d1p, d2p, &q1, &q2,
+				  flags1, flags2, wq);
+}
+EXPORT_SYMBOL(lock_rename_lookup_one);
+
+/**
+ * unlock_rename_lookup - undo a successful lock_rename_lookup()
+ * @p1: first parent directory
+ * @p2: second parent directory
+ * @d1: dentry that was returned for the name in @p1
+ * @d2: dentry that was returned for the name in @p2
+ * @with_wq: true if a wait queue was given when the locks were taken
+ *
+ * Finishes any in-progress lookup on both dentries, drops the per-dentry
+ * update locks (only once if @d1 == @d2), releases the directory locks
+ * and finally drops the dentry references.
+ */
+void unlock_rename_lookup(struct dentry *p1, struct dentry *p2,
+			  struct dentry *d1, struct dentry *d2,
+			  bool with_wq)
+{
+	/* Mirror of the shared/exclusive choice made when locking. */
+	const bool exclusive = !with_wq || !IS_PAR_UPDATE(p1->d_inode);
+
+	d_lookup_done(d1);
+	d_lookup_done(d2);
+
+	d_unlock_update(d1);
+	if (d1 != d2)
+		d_unlock_update(d2);
+
+	if (exclusive) {
+		unlock_rename(p1, p2);
+	} else if (p1 == p2) {
+		inode_unlock_shared(p1->d_inode);
+	} else {
+		/* Cross-directory: both inodes, then the rename mutex. */
+		inode_unlock_shared(p1->d_inode);
+		inode_unlock_shared(p2->d_inode);
+		mutex_unlock(&p1->d_sb->s_vfs_rename_mutex);
+	}
+
+	dput(d1);
+	dput(d2);
+}
+EXPORT_SYMBOL(unlock_rename_lookup);
+
/**
* mode_strip_umask - handle vfs umask stripping
* @dir: parent directory of the new inode
@@ -4945,6 +5126,7 @@ int do_renameat2(int olddfd, struct filename *from, int newdfd,
unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET;
bool should_retry = false;
int error = -EINVAL;
+ DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
goto put_names;
@@ -4985,58 +5167,53 @@ int do_renameat2(int olddfd, struct filename *from, int newdfd,
goto exit2;
retry_deleg:
- trap = lock_rename(new_path.dentry, old_path.dentry);
-
- old_dentry = __lookup_hash(&old_last, old_path.dentry,
- lookup_flags, NULL);
- error = PTR_ERR(old_dentry);
- if (IS_ERR(old_dentry))
+ trap = lock_rename_lookup(new_path.dentry, old_path.dentry,
+ &new_dentry, &old_dentry,
+ &new_last, &old_last,
+ lookup_flags | target_flags, lookup_flags,
+ &wq);
+ if (IS_ERR(trap))
goto exit3;
/* source must exist */
error = -ENOENT;
if (d_is_negative(old_dentry))
goto exit4;
- new_dentry = __lookup_hash(&new_last, new_path.dentry,
- lookup_flags | target_flags, NULL);
- error = PTR_ERR(new_dentry);
- if (IS_ERR(new_dentry))
- goto exit4;
error = -EEXIST;
if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry))
- goto exit5;
+ goto exit4;
if (flags & RENAME_EXCHANGE) {
error = -ENOENT;
if (d_is_negative(new_dentry))
- goto exit5;
+ goto exit4;
if (!d_is_dir(new_dentry)) {
error = -ENOTDIR;
if (new_last.name[new_last.len])
- goto exit5;
+ goto exit4;
}
}
/* unless the source is a directory trailing slashes give -ENOTDIR */
if (!d_is_dir(old_dentry)) {
error = -ENOTDIR;
if (old_last.name[old_last.len])
- goto exit5;
+ goto exit4;
if (!(flags & RENAME_EXCHANGE) && new_last.name[new_last.len])
- goto exit5;
+ goto exit4;
}
/* source should not be ancestor of target */
error = -EINVAL;
if (old_dentry == trap)
- goto exit5;
+ goto exit4;
/* target should not be an ancestor of source */
if (!(flags & RENAME_EXCHANGE))
error = -ENOTEMPTY;
if (new_dentry == trap)
- goto exit5;
+ goto exit4;
error = security_path_rename(&old_path, old_dentry,
&new_path, new_dentry, flags);
if (error)
- goto exit5;
+ goto exit4;
rd.old_dir = old_path.dentry->d_inode;
rd.old_dentry = old_dentry;
@@ -5047,12 +5224,10 @@ int do_renameat2(int olddfd, struct filename *from, int newdfd,
rd.delegated_inode = &delegated_inode;
rd.flags = flags;
error = vfs_rename(&rd);
-exit5:
- dput(new_dentry);
exit4:
- dput(old_dentry);
+ unlock_rename_lookup(new_path.dentry, old_path.dentry, new_dentry, old_dentry,
+ true);
exit3:
- unlock_rename(new_path.dentry, old_path.dentry);
if (delegated_inode) {
error = break_deleg_wait(&delegated_inode);
if (!error)
diff --git a/include/linux/namei.h b/include/linux/namei.h
index b1a210a51210..29756921f69b 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -108,6 +108,16 @@ extern int follow_up(struct path *);
extern struct dentry *lock_rename(struct dentry *, struct dentry *);
extern void unlock_rename(struct dentry *, struct dentry *);
+/*
+ * Lock both parents for a rename and look up name1 in p1 and name2 in p2;
+ * unlock_rename_lookup() releases what lock_rename_lookup_one() took and
+ * must be told whether a wq was supplied.
+ */
+extern struct dentry *lock_rename_lookup_one(
+	struct dentry *p1, struct dentry *p2,
+	struct dentry **d1p, struct dentry **d2p,
+	const char *name1, int nlen1,
+	const char *name2, int nlen2,
+	unsigned int flags1, unsigned int flags2,
+	wait_queue_head_t *wq);
+extern void unlock_rename_lookup(struct dentry *p1, struct dentry *p2,
+				 struct dentry *d1, struct dentry *d2,
+				 bool with_wq);
extern int __must_check nd_jump_link(struct path *path);
next prev parent reply other threads:[~2022-08-26 2:18 UTC|newest]
Thread overview: 37+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-08-26 2:10 [PATCH/RFC 00/10 v5] Improve scalability of directory operations NeilBrown
2022-08-26 2:10 ` [PATCH 09/10] VFS: add LOOKUP_SILLY_RENAME NeilBrown
2022-08-27 1:21 ` Al Viro
2022-08-29 3:15 ` NeilBrown
2022-08-26 2:10 ` [PATCH 01/10] VFS: support parallel updates in the one directory NeilBrown
2022-08-26 19:06 ` Linus Torvalds
2022-08-26 23:06 ` NeilBrown
2022-08-27 0:13 ` Linus Torvalds
2022-08-27 0:23 ` Al Viro
2022-08-27 21:14 ` Al Viro
2022-08-27 0:17 ` Al Viro
2022-09-01 0:31 ` NeilBrown
2022-09-01 3:44 ` Al Viro
2022-08-27 3:43 ` Al Viro
2022-08-29 1:59 ` NeilBrown
2022-09-03 0:06 ` Al Viro
2022-09-03 1:40 ` NeilBrown
2022-09-03 2:12 ` Al Viro
2022-09-03 17:52 ` Al Viro
2022-09-04 23:33 ` NeilBrown
2022-08-26 2:10 ` [PATCH 08/10] NFSD: allow parallel creates from nfsd NeilBrown
2022-08-27 4:37 ` Al Viro
2022-08-29 3:12 ` NeilBrown
2022-08-26 2:10 ` [PATCH 05/10] VFS: export done_path_update() NeilBrown
2022-08-26 2:10 ` [PATCH 02/10] VFS: move EEXIST and ENOENT tests into lookup_hash_update() NeilBrown
2022-08-26 2:10 ` NeilBrown [this message]
2022-08-27 4:12 ` [PATCH 06/10] VFS: support concurrent renames Al Viro
2022-08-29 3:08 ` NeilBrown
2022-08-26 2:10 ` [PATCH 10/10] NFS: support parallel updates in the one directory NeilBrown
2022-08-26 15:31 ` John Stoffel
2022-08-26 23:13 ` NeilBrown
2022-08-26 2:10 ` [PATCH 03/10] VFS: move want_write checks into lookup_hash_update() NeilBrown
2022-08-27 3:48 ` Al Viro
2022-08-26 2:10 ` [PATCH 04/10] VFS: move dput() and mnt_drop_write() into done_path_update() NeilBrown
2022-08-26 2:10 ` [PATCH 07/10] VFS: hold DCACHE_PAR_UPDATE lock across d_revalidate() NeilBrown
2022-08-26 14:42 ` [PATCH/RFC 00/10 v5] Improve scalability of directory operations John Stoffel
2022-08-26 23:30 ` NeilBrown
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=166147984375.25420.13018600986239729815.stgit@noble.brown \
--to=neilb@suse.de \
--cc=chuck.lever@oracle.com \
--cc=daire@dneg.com \
--cc=linux-fsdevel@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-nfs@vger.kernel.org \
--cc=torvalds@linux-foundation.org \
--cc=trond.myklebust@hammerspace.com \
--cc=viro@zeniv.linux.org.uk \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).