* [PATCH 0/2] ceph: wait async unlink to finish
@ 2022-05-16 12:20 Xiubo Li
  2022-05-16 12:20 ` [PATCH 1/2] fs/dcache: add d_compare() helper support Xiubo Li
  2022-05-16 12:20 ` [PATCH 2/2] ceph: wait for the first reply of inflight unlink/rmdir Xiubo Li
  0 siblings, 2 replies; 10+ messages in thread
From: Xiubo Li @ 2022-05-16 12:20 UTC (permalink / raw)
  To: jlayton, viro
  Cc: idryomov, vshankar, ceph-devel, mcgrof, akpm, arnd,
	linux-fsdevel, linux-kernel, Xiubo Li
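
This series makes the kclient wait for any inflight async unlink
request on the same file name to finish before issuing a new
create/link/rename/open request, to avoid spurious -EEXIST failures
from the MDS and the accidental removal of a just-created file by a
delayed async unlink.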

Xiubo Li (2):
  fs/dcache: add d_compare() helper support
  ceph: wait for the first reply of inflight unlink/rmdir

 fs/ceph/dir.c          | 55 ++++++++++++++++++++++++++++++---
 fs/ceph/file.c         |  5 +++
 fs/ceph/mds_client.c   | 69 ++++++++++++++++++++++++++++++++++++++++++
 fs/ceph/mds_client.h   |  1 +
 fs/ceph/super.c        |  2 ++
 fs/ceph/super.h        | 18 ++++++++---
 fs/dcache.c            | 15 +++++++++
 include/linux/dcache.h |  2 ++
 8 files changed, 157 insertions(+), 10 deletions(-)

-- 
2.36.0.rc1



* [PATCH 1/2] fs/dcache: add d_compare() helper support
  2022-05-16 12:20 [PATCH 0/2] ceph: wait async unlink to finish Xiubo Li
@ 2022-05-16 12:20 ` Xiubo Li
  2022-05-16 13:12   ` Jeff Layton
  2022-05-16 12:20 ` [PATCH 2/2] ceph: wait for the first reply of inflight unlink/rmdir Xiubo Li
  1 sibling, 1 reply; 10+ messages in thread
From: Xiubo Li @ 2022-05-16 12:20 UTC (permalink / raw)
  To: jlayton, viro
  Cc: idryomov, vshankar, ceph-devel, mcgrof, akpm, arnd,
	linux-fsdevel, linux-kernel, Xiubo Li

Add an exported d_compare() helper, wrapping d_same_name(), so that
filesystems can compare a candidate name against a dentry's name
without open-coding the comparison. The ceph client will use this to
detect name conflicts with inflight async unlink requests.

Signed-off-by: Xiubo Li <xiubli@redhat.com>
---
 fs/dcache.c            | 15 +++++++++++++++
 include/linux/dcache.h |  2 ++
 2 files changed, 17 insertions(+)

diff --git a/fs/dcache.c b/fs/dcache.c
index 93f4f5ee07bf..95a72f92a94b 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2262,6 +2262,21 @@ static inline bool d_same_name(const struct dentry *dentry,
 				       name) == 0;
 }
 
+/**
+ * d_compare - compare dentry name with case-exact name
+ * @parent: parent dentry
+ * @dentry: the dentry whose name is compared against @name
+ * @name:   the case-exact name to compare with the dentry's name
+ *
+ * Return: false if the names match, true otherwise
+ */
+bool d_compare(const struct dentry *parent, const struct dentry *dentry,
+	       const struct qstr *name)
+{
+	return !d_same_name(dentry, parent, name);
+}
+EXPORT_SYMBOL(d_compare);
+
 /**
  * __d_lookup_rcu - search for a dentry (racy, store-free)
  * @parent: parent dentry
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index f5bba51480b2..444b2230e5c3 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -233,6 +233,8 @@ extern struct dentry * d_alloc_parallel(struct dentry *, const struct qstr *,
 					wait_queue_head_t *);
 extern struct dentry * d_splice_alias(struct inode *, struct dentry *);
 extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *);
+extern bool d_compare(const struct dentry *parent, const struct dentry *dentry,
+		      const struct qstr *name);
 extern struct dentry * d_exact_alias(struct dentry *, struct inode *);
 extern struct dentry *d_find_any_alias(struct inode *inode);
 extern struct dentry * d_obtain_alias(struct inode *);
-- 
2.36.0.rc1



* [PATCH 2/2] ceph: wait for the first reply of inflight unlink/rmdir
  2022-05-16 12:20 [PATCH 0/2] ceph: wait async unlink to finish Xiubo Li
  2022-05-16 12:20 ` [PATCH 1/2] fs/dcache: add d_compare() helper support Xiubo Li
@ 2022-05-16 12:20 ` Xiubo Li
  2022-05-16 13:23   ` Jeff Layton
                     ` (2 more replies)
  1 sibling, 3 replies; 10+ messages in thread
From: Xiubo Li @ 2022-05-16 12:20 UTC (permalink / raw)
  To: jlayton, viro
  Cc: idryomov, vshankar, ceph-devel, mcgrof, akpm, arnd,
	linux-fsdevel, linux-kernel, Xiubo Li

In the async unlink case the kclient won't wait for the first reply
from the MDS; it just drops all the links, unhashes the dentry and
then succeeds immediately.

For any new create/link/rename, etc. requests that reuse the same
file names, we must wait for the first reply of the inflight
unlink request, or the MDS may fail these subsequent requests
with -EEXIST if the inflight async unlink request was delayed
for some reason.

The worst case is that for a non-async openc request it will
successfully open the file if the CDentry hasn't been unlinked yet,
but the previously delayed async unlink request will then remove
the CDentry. That means the just-created file may be deleted
later by accident.

We need to wait for the inflight async unlink requests to finish
before creating new files/directories with the same file names.
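
For example, a sequence like the following (purely illustrative) can
hit the problem:

    unlink("foo");    /* async unlink: returns before the MDS reply */
    fd = open("foo", O_CREAT | O_EXCL | O_WRONLY, 0644);
                      /* may fail with -EEXIST, or the newly created
                       * file may later be removed when the delayed
                       * async unlink is processed by the MDS */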

URL: https://tracker.ceph.com/issues/55332
Signed-off-by: Xiubo Li <xiubli@redhat.com>
---
 fs/ceph/dir.c        | 55 +++++++++++++++++++++++++++++++----
 fs/ceph/file.c       |  5 ++++
 fs/ceph/mds_client.c | 69 ++++++++++++++++++++++++++++++++++++++++++++
 fs/ceph/mds_client.h |  1 +
 fs/ceph/super.c      |  2 ++
 fs/ceph/super.h      | 18 ++++++++----
 6 files changed, 140 insertions(+), 10 deletions(-)

diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index eae417d71136..20c648406528 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -856,6 +856,10 @@ static int ceph_mknod(struct user_namespace *mnt_userns, struct inode *dir,
 	if (ceph_snap(dir) != CEPH_NOSNAP)
 		return -EROFS;
 
+	err = ceph_wait_on_conflict_unlink(dentry);
+	if (err)
+		return err;
+
 	if (ceph_quota_is_max_files_exceeded(dir)) {
 		err = -EDQUOT;
 		goto out;
@@ -918,6 +922,10 @@ static int ceph_symlink(struct user_namespace *mnt_userns, struct inode *dir,
 	if (ceph_snap(dir) != CEPH_NOSNAP)
 		return -EROFS;
 
+	err = ceph_wait_on_conflict_unlink(dentry);
+	if (err)
+		return err;
+
 	if (ceph_quota_is_max_files_exceeded(dir)) {
 		err = -EDQUOT;
 		goto out;
@@ -968,9 +976,13 @@ static int ceph_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
 	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
 	struct ceph_mds_request *req;
 	struct ceph_acl_sec_ctx as_ctx = {};
-	int err = -EROFS;
+	int err;
 	int op;
 
+	err = ceph_wait_on_conflict_unlink(dentry);
+	if (err)
+		return err;
+
 	if (ceph_snap(dir) == CEPH_SNAPDIR) {
 		/* mkdir .snap/foo is a MKSNAP */
 		op = CEPH_MDS_OP_MKSNAP;
@@ -980,6 +992,7 @@ static int ceph_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
 		dout("mkdir dir %p dn %p mode 0%ho\n", dir, dentry, mode);
 		op = CEPH_MDS_OP_MKDIR;
 	} else {
+		err = -EROFS;
 		goto out;
 	}
 
@@ -1037,6 +1050,10 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
 	struct ceph_mds_request *req;
 	int err;
 
+	err = ceph_wait_on_conflict_unlink(dentry);
+	if (err)
+		return err;
+
 	if (ceph_snap(dir) != CEPH_NOSNAP)
 		return -EROFS;
 
@@ -1071,9 +1088,24 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
 static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
 				 struct ceph_mds_request *req)
 {
+	struct dentry *dentry = req->r_dentry;
+	struct ceph_dentry_info *di = ceph_dentry(dentry);
 	int result = req->r_err ? req->r_err :
 			le32_to_cpu(req->r_reply_info.head->result);
 
+	if (test_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags)) {
+		BUG_ON(req->r_op != CEPH_MDS_OP_UNLINK);
+
+		hash_del_rcu(&di->hnode);
+
+		spin_lock(&dentry->d_lock);
+		di->flags &= ~CEPH_DENTRY_ASYNC_UNLINK;
+		wake_up_bit(&di->flags, CEPH_DENTRY_ASYNC_UNLINK_BIT);
+		spin_unlock(&dentry->d_lock);
+
+		synchronize_rcu();
+	}
+
 	if (result == -EJUKEBOX)
 		goto out;
 
@@ -1081,7 +1113,7 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
 	if (result) {
 		int pathlen = 0;
 		u64 base = 0;
-		char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
+		char *path = ceph_mdsc_build_path(dentry, &pathlen,
 						  &base, 0);
 
 		/* mark error on parent + clear complete */
@@ -1089,13 +1121,13 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
 		ceph_dir_clear_complete(req->r_parent);
 
 		/* drop the dentry -- we don't know its status */
-		if (!d_unhashed(req->r_dentry))
-			d_drop(req->r_dentry);
+		if (!d_unhashed(dentry))
+			d_drop(dentry);
 
 		/* mark inode itself for an error (since metadata is bogus) */
 		mapping_set_error(req->r_old_inode->i_mapping, result);
 
-		pr_warn("ceph: async unlink failure path=(%llx)%s result=%d!\n",
+		pr_warn("async unlink failure path=(%llx)%s result=%d!\n",
 			base, IS_ERR(path) ? "<<bad>>" : path, result);
 		ceph_mdsc_free_path(path, pathlen);
 	}
@@ -1189,12 +1221,21 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
 		ihold(req->r_old_inode);
 		err = ceph_mdsc_submit_request(mdsc, dir, req);
 		if (!err) {
+			struct ceph_dentry_info *di;
+
 			/*
 			 * We have enough caps, so we assume that the unlink
 			 * will succeed. Fix up the target inode and dcache.
 			 */
 			drop_nlink(inode);
 			d_delete(dentry);
+
+			spin_lock(&dentry->d_lock);
+			di = ceph_dentry(dentry);
+			di->flags |= CEPH_DENTRY_ASYNC_UNLINK;
+			hash_add_rcu(fsc->async_unlink_conflict, &di->hnode,
+				     dentry->d_name.hash);
+			spin_unlock(&dentry->d_lock);
 		} else if (err == -EJUKEBOX) {
 			try_async = false;
 			ceph_mdsc_put_request(req);
@@ -1237,6 +1278,10 @@ static int ceph_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
 	    (!ceph_quota_is_same_realm(old_dir, new_dir)))
 		return -EXDEV;
 
+	err = ceph_wait_on_conflict_unlink(new_dentry);
+	if (err)
+		return err;
+
 	dout("rename dir %p dentry %p to dir %p dentry %p\n",
 	     old_dir, old_dentry, new_dir, new_dentry);
 	req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 8c8226c0feac..47d068e6436a 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -740,6 +740,10 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
 	if (dentry->d_name.len > NAME_MAX)
 		return -ENAMETOOLONG;
 
+	err = ceph_wait_on_conflict_unlink(dentry);
+	if (err)
+		return err;
+
 	if (flags & O_CREAT) {
 		if (ceph_quota_is_max_files_exceeded(dir))
 			return -EDQUOT;
@@ -757,6 +761,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
 		/* If it's not being looked up, it's negative */
 		return -ENOENT;
 	}
+
 retry:
 	/* do the open */
 	req = prepare_open_request(dir->i_sb, flags, mode);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index e8c87dea0551..0ae0e0110eb4 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -468,6 +468,75 @@ static int ceph_parse_deleg_inos(void **p, void *end,
 	return -EIO;
 }
 
+/*
+ * In the async unlink case the kclient won't wait for the first reply
+ * from the MDS; it just drops all the links, unhashes the dentry and
+ * then succeeds immediately.
+ *
+ * For any new create/link/rename, etc. requests that reuse the same
+ * file names, we must wait for the first reply of the inflight
+ * unlink request, or the MDS may fail these subsequent requests
+ * with -EEXIST if the inflight async unlink request was delayed
+ * for some reason.
+ *
+ * The worst case is that for a non-async openc request it will
+ * successfully open the file if the CDentry hasn't been unlinked yet,
+ * but the previously delayed async unlink request will then remove
+ * the CDentry. That means the just-created file may be deleted
+ * later by accident.
+ *
+ * We need to wait for the inflight async unlink requests to finish
+ * before creating new files/directories with the same file names.
+ */
+int ceph_wait_on_conflict_unlink(struct dentry *dentry)
+{
+	struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
+	struct dentry *pdentry = dentry->d_parent;
+	struct dentry *udentry, *found = NULL;
+	struct ceph_dentry_info *di;
+	struct qstr dname;
+	u32 hash = dentry->d_name.hash;
+	int err;
+
+	dname.name = dentry->d_name.name;
+	dname.len = dentry->d_name.len;
+
+	rcu_read_lock();
+	hash_for_each_possible_rcu(fsc->async_unlink_conflict, di,
+				   hnode, hash) {
+		udentry = di->dentry;
+
+		spin_lock(&udentry->d_lock);
+		if (udentry->d_name.hash != hash)
+			goto next;
+		if (unlikely(udentry->d_parent != pdentry))
+			goto next;
+		if (!hash_hashed(&di->hnode))
+			goto next;
+
+		if (!test_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags))
+			goto next;
+
+		if (d_compare(pdentry, udentry, &dname))
+			goto next;
+
+		spin_unlock(&udentry->d_lock);
+		found = dget(udentry);
+		break;
+next:
+		spin_unlock(&udentry->d_lock);
+	}
+	rcu_read_unlock();
+
+	if (likely(!found))
+		return 0;
+
+	err = wait_on_bit(&di->flags, CEPH_DENTRY_ASYNC_UNLINK_BIT,
+			  TASK_INTERRUPTIBLE);
+	dput(found);
+	return err;
+}
+
 u64 ceph_get_deleg_ino(struct ceph_mds_session *s)
 {
 	unsigned long ino;
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 33497846e47e..d1ae679c52c3 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -582,6 +582,7 @@ static inline int ceph_wait_on_async_create(struct inode *inode)
 			   TASK_INTERRUPTIBLE);
 }
 
+extern int ceph_wait_on_conflict_unlink(struct dentry *dentry);
 extern u64 ceph_get_deleg_ino(struct ceph_mds_session *session);
 extern int ceph_restore_deleg_ino(struct ceph_mds_session *session, u64 ino);
 #endif
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index b73b4f75462c..7ae65001f04c 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -816,6 +816,8 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
 	if (!fsc->cap_wq)
 		goto fail_inode_wq;
 
+	hash_init(fsc->async_unlink_conflict);
+
 	spin_lock(&ceph_fsc_lock);
 	list_add_tail(&fsc->metric_wakeup, &ceph_fsc_list);
 	spin_unlock(&ceph_fsc_lock);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 506d52633627..58bbb5df42da 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -19,6 +19,7 @@
 #include <linux/security.h>
 #include <linux/netfs.h>
 #include <linux/fscache.h>
+#include <linux/hashtable.h>
 
 #include <linux/ceph/libceph.h>
 
@@ -99,6 +100,8 @@ struct ceph_mount_options {
 	char *mon_addr;
 };
 
+#define CEPH_ASYNC_CREATE_CONFLICT_BITS 12
+
 struct ceph_fs_client {
 	struct super_block *sb;
 
@@ -124,6 +127,8 @@ struct ceph_fs_client {
 	struct workqueue_struct *inode_wq;
 	struct workqueue_struct *cap_wq;
 
+	DECLARE_HASHTABLE(async_unlink_conflict, CEPH_ASYNC_CREATE_CONFLICT_BITS);
+
 #ifdef CONFIG_DEBUG_FS
 	struct dentry *debugfs_dentry_lru, *debugfs_caps;
 	struct dentry *debugfs_congestion_kb;
@@ -281,7 +286,8 @@ struct ceph_dentry_info {
 	struct dentry *dentry;
 	struct ceph_mds_session *lease_session;
 	struct list_head lease_list;
-	unsigned flags;
+	struct hlist_node hnode;
+	unsigned long flags;
 	int lease_shared_gen;
 	u32 lease_gen;
 	u32 lease_seq;
@@ -290,10 +296,12 @@ struct ceph_dentry_info {
 	u64 offset;
 };
 
-#define CEPH_DENTRY_REFERENCED		1
-#define CEPH_DENTRY_LEASE_LIST		2
-#define CEPH_DENTRY_SHRINK_LIST		4
-#define CEPH_DENTRY_PRIMARY_LINK	8
+#define CEPH_DENTRY_REFERENCED		(1 << 0)
+#define CEPH_DENTRY_LEASE_LIST		(1 << 1)
+#define CEPH_DENTRY_SHRINK_LIST		(1 << 2)
+#define CEPH_DENTRY_PRIMARY_LINK	(1 << 3)
+#define CEPH_DENTRY_ASYNC_UNLINK_BIT	(4)
+#define CEPH_DENTRY_ASYNC_UNLINK	(1 << CEPH_DENTRY_ASYNC_UNLINK_BIT)
 
 struct ceph_inode_xattrs_info {
 	/*
-- 
2.36.0.rc1



* Re: [PATCH 1/2] fs/dcache: add d_compare() helper support
  2022-05-16 12:20 ` [PATCH 1/2] fs/dcache: add d_compare() helper support Xiubo Li
@ 2022-05-16 13:12   ` Jeff Layton
  0 siblings, 0 replies; 10+ messages in thread
From: Jeff Layton @ 2022-05-16 13:12 UTC (permalink / raw)
  To: Xiubo Li, viro
  Cc: idryomov, vshankar, ceph-devel, mcgrof, akpm, arnd,
	linux-fsdevel, linux-kernel

On Mon, 2022-05-16 at 20:20 +0800, Xiubo Li wrote:
> Signed-off-by: Xiubo Li <xiubli@redhat.com>
> ---
>  fs/dcache.c            | 15 +++++++++++++++
>  include/linux/dcache.h |  2 ++
>  2 files changed, 17 insertions(+)
> 
> diff --git a/fs/dcache.c b/fs/dcache.c
> index 93f4f5ee07bf..95a72f92a94b 100644
> --- a/fs/dcache.c
> +++ b/fs/dcache.c
> @@ -2262,6 +2262,21 @@ static inline bool d_same_name(const struct dentry *dentry,
>  				       name) == 0;
>  }
>  
> +/**
> + * d_compare - compare dentry name with case-exact name
> + * @parent: parent dentry
> + * @dentry: the dentry whose name is compared against @name
> + * @name:   the case-exact name to compare with the dentry's name
> + *
> + * Return: false if the names match, true otherwise
> + */
> +bool d_compare(const struct dentry *parent, const struct dentry *dentry,
> +	       const struct qstr *name)
> +{
> +	return !d_same_name(dentry, parent, name);
> +}
> +EXPORT_SYMBOL(d_compare);
> +
>  /**
>   * __d_lookup_rcu - search for a dentry (racy, store-free)
>   * @parent: parent dentry
> diff --git a/include/linux/dcache.h b/include/linux/dcache.h
> index f5bba51480b2..444b2230e5c3 100644
> --- a/include/linux/dcache.h
> +++ b/include/linux/dcache.h
> @@ -233,6 +233,8 @@ extern struct dentry * d_alloc_parallel(struct dentry *, const struct qstr *,
>  					wait_queue_head_t *);
>  extern struct dentry * d_splice_alias(struct inode *, struct dentry *);
>  extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *);
> +extern bool d_compare(const struct dentry *parent, const struct dentry *dentry,
> +		      const struct qstr *name);
>  extern struct dentry * d_exact_alias(struct dentry *, struct inode *);
>  extern struct dentry *d_find_any_alias(struct inode *inode);
>  extern struct dentry * d_obtain_alias(struct inode *);

I wonder if we ought to just un-inline and export d_same_name instead?
Still, this is less disruptive, and the dcache code is hugely
performance-sensitive; it's possible that inlining that function makes
a difference.
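
FWIW, the un-inline alternative would look something like this
(untested):

	/* fs/dcache.c: drop the "static inline" from d_same_name() and: */
	EXPORT_SYMBOL(d_same_name);

	/* include/linux/dcache.h */
	extern bool d_same_name(const struct dentry *dentry,
				const struct dentry *parent,
				const struct qstr *name);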

Reviewed-by: Jeff Layton <jlayton@kernel.org>


* Re: [PATCH 2/2] ceph: wait for the first reply of inflight unlink/rmdir
  2022-05-16 12:20 ` [PATCH 2/2] ceph: wait for the first reply of inflight unlink/rmdir Xiubo Li
@ 2022-05-16 13:23   ` Jeff Layton
  2022-05-16 14:56     ` Xiubo Li
  2022-05-16 21:05   ` kernel test robot
  2022-05-16 23:18   ` kernel test robot
  2 siblings, 1 reply; 10+ messages in thread
From: Jeff Layton @ 2022-05-16 13:23 UTC (permalink / raw)
  To: Xiubo Li, viro
  Cc: idryomov, vshankar, ceph-devel, mcgrof, akpm, arnd,
	linux-fsdevel, linux-kernel

On Mon, 2022-05-16 at 20:20 +0800, Xiubo Li wrote:
> In the async unlink case the kclient won't wait for the first reply
> from the MDS; it just drops all the links, unhashes the dentry and
> then succeeds immediately.
> 
> For any new create/link/rename, etc. requests that reuse the same
> file names, we must wait for the first reply of the inflight
> unlink request, or the MDS may fail these subsequent requests
> with -EEXIST if the inflight async unlink request was delayed
> for some reason.
> 
> The worst case is that for a non-async openc request it will
> successfully open the file if the CDentry hasn't been unlinked yet,
> but the previously delayed async unlink request will then remove
> the CDentry. That means the just-created file may be deleted
> later by accident.
> 
> We need to wait for the inflight async unlink requests to finish
> before creating new files/directories with the same file names.
> 
> URL: https://tracker.ceph.com/issues/55332
> Signed-off-by: Xiubo Li <xiubli@redhat.com>
> ---
>  fs/ceph/dir.c        | 55 +++++++++++++++++++++++++++++++----
>  fs/ceph/file.c       |  5 ++++
>  fs/ceph/mds_client.c | 69 ++++++++++++++++++++++++++++++++++++++++++++
>  fs/ceph/mds_client.h |  1 +
>  fs/ceph/super.c      |  2 ++
>  fs/ceph/super.h      | 18 ++++++++----
>  6 files changed, 140 insertions(+), 10 deletions(-)
> 
> diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
> index eae417d71136..20c648406528 100644
> --- a/fs/ceph/dir.c
> +++ b/fs/ceph/dir.c
> @@ -856,6 +856,10 @@ static int ceph_mknod(struct user_namespace *mnt_userns, struct inode *dir,
>  	if (ceph_snap(dir) != CEPH_NOSNAP)
>  		return -EROFS;
>  
> +	err = ceph_wait_on_conflict_unlink(dentry);
> +	if (err)
> +		return err;
> +
>  	if (ceph_quota_is_max_files_exceeded(dir)) {
>  		err = -EDQUOT;
>  		goto out;
> @@ -918,6 +922,10 @@ static int ceph_symlink(struct user_namespace *mnt_userns, struct inode *dir,
>  	if (ceph_snap(dir) != CEPH_NOSNAP)
>  		return -EROFS;
>  
> +	err = ceph_wait_on_conflict_unlink(dentry);
> +	if (err)
> +		return err;
> +
>  	if (ceph_quota_is_max_files_exceeded(dir)) {
>  		err = -EDQUOT;
>  		goto out;
> @@ -968,9 +976,13 @@ static int ceph_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
>  	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
>  	struct ceph_mds_request *req;
>  	struct ceph_acl_sec_ctx as_ctx = {};
> -	int err = -EROFS;
> +	int err;
>  	int op;
>  
> +	err = ceph_wait_on_conflict_unlink(dentry);
> +	if (err)
> +		return err;
> +
>  	if (ceph_snap(dir) == CEPH_SNAPDIR) {
>  		/* mkdir .snap/foo is a MKSNAP */
>  		op = CEPH_MDS_OP_MKSNAP;
> @@ -980,6 +992,7 @@ static int ceph_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
>  		dout("mkdir dir %p dn %p mode 0%ho\n", dir, dentry, mode);
>  		op = CEPH_MDS_OP_MKDIR;
>  	} else {
> +		err = -EROFS;
>  		goto out;
>  	}
>  
> @@ -1037,6 +1050,10 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
>  	struct ceph_mds_request *req;
>  	int err;
>  
> +	err = ceph_wait_on_conflict_unlink(dentry);
> +	if (err)
> +		return err;
> +
>  	if (ceph_snap(dir) != CEPH_NOSNAP)
>  		return -EROFS;
>  
> @@ -1071,9 +1088,24 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
>  static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
>  				 struct ceph_mds_request *req)
>  {
> +	struct dentry *dentry = req->r_dentry;
> +	struct ceph_dentry_info *di = ceph_dentry(dentry);
>  	int result = req->r_err ? req->r_err :
>  			le32_to_cpu(req->r_reply_info.head->result);
>  
> +	if (test_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags)) {
> +		BUG_ON(req->r_op != CEPH_MDS_OP_UNLINK);
> +
> +		hash_del_rcu(&di->hnode);
> +
> +		spin_lock(&dentry->d_lock);
> +		di->flags &= ~CEPH_DENTRY_ASYNC_UNLINK;
> +		wake_up_bit(&di->flags, CEPH_DENTRY_ASYNC_UNLINK_BIT);
> +		spin_unlock(&dentry->d_lock);
> +
> +		synchronize_rcu();
> +	}
> +
>  	if (result == -EJUKEBOX)
>  		goto out;
>  
> @@ -1081,7 +1113,7 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
>  	if (result) {
>  		int pathlen = 0;
>  		u64 base = 0;
> -		char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
> +		char *path = ceph_mdsc_build_path(dentry, &pathlen,
>  						  &base, 0);
>  
>  		/* mark error on parent + clear complete */
> @@ -1089,13 +1121,13 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
>  		ceph_dir_clear_complete(req->r_parent);
>  
>  		/* drop the dentry -- we don't know its status */
> -		if (!d_unhashed(req->r_dentry))
> -			d_drop(req->r_dentry);
> +		if (!d_unhashed(dentry))
> +			d_drop(dentry);
>  
>  		/* mark inode itself for an error (since metadata is bogus) */
>  		mapping_set_error(req->r_old_inode->i_mapping, result);
>  
> -		pr_warn("ceph: async unlink failure path=(%llx)%s result=%d!\n",
> +		pr_warn("async unlink failure path=(%llx)%s result=%d!\n",
>  			base, IS_ERR(path) ? "<<bad>>" : path, result);
>  		ceph_mdsc_free_path(path, pathlen);
>  	}
> @@ -1189,12 +1221,21 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
>  		ihold(req->r_old_inode);
>  		err = ceph_mdsc_submit_request(mdsc, dir, req);
>  		if (!err) {
> +			struct ceph_dentry_info *di;
> +
>  			/*
>  			 * We have enough caps, so we assume that the unlink
>  			 * will succeed. Fix up the target inode and dcache.
>  			 */
>  			drop_nlink(inode);
>  			d_delete(dentry);
> +
> +			spin_lock(&dentry->d_lock);
> +			di = ceph_dentry(dentry);
> +			di->flags |= CEPH_DENTRY_ASYNC_UNLINK;
> +			hash_add_rcu(fsc->async_unlink_conflict, &di->hnode,
> +				     dentry->d_name.hash);
> +			spin_unlock(&dentry->d_lock);

This looks racy. It's possible that the reply comes in before we get to
the point of setting this flag. You probably want to do this before
calling ceph_mdsc_submit_request, and just unwind it if the submission
fails.


Also, you do still need some sort of lock to protect the
hash_add/del/_rcu calls. Those don't do any locking on their own. The
d_lock is insufficient here since it can't protect the whole list. You
may be able to use the i_ceph_lock of the parent though?
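
IOW, something like this (untested, and the hash insertion still
needs a real lock as noted above):

	spin_lock(&dentry->d_lock);
	di = ceph_dentry(dentry);
	di->flags |= CEPH_DENTRY_ASYNC_UNLINK;
	spin_unlock(&dentry->d_lock);

	hash_add_rcu(fsc->async_unlink_conflict, &di->hnode,
		     dentry->d_name.hash);

	err = ceph_mdsc_submit_request(mdsc, dir, req);
	if (err) {
		/* unwind if the submission failed */
		hash_del_rcu(&di->hnode);
		spin_lock(&dentry->d_lock);
		di->flags &= ~CEPH_DENTRY_ASYNC_UNLINK;
		spin_unlock(&dentry->d_lock);
	}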

>  		} else if (err == -EJUKEBOX) {
>  			try_async = false;
>  			ceph_mdsc_put_request(req);
> @@ -1237,6 +1278,10 @@ static int ceph_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
>  	    (!ceph_quota_is_same_realm(old_dir, new_dir)))
>  		return -EXDEV;
>  
> +	err = ceph_wait_on_conflict_unlink(new_dentry);
> +	if (err)
> +		return err;
> +
>  	dout("rename dir %p dentry %p to dir %p dentry %p\n",
>  	     old_dir, old_dentry, new_dir, new_dentry);
>  	req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> index 8c8226c0feac..47d068e6436a 100644
> --- a/fs/ceph/file.c
> +++ b/fs/ceph/file.c
> @@ -740,6 +740,10 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
>  	if (dentry->d_name.len > NAME_MAX)
>  		return -ENAMETOOLONG;
>  
> +	err = ceph_wait_on_conflict_unlink(dentry);
> +	if (err)
> +		return err;
> +

What might be nice here eventually is to not block an async create here,
but instead queue the request so that it gets transmitted after the
async unlink reply comes in.

That'll be hard to get right though, so this is fine for now.

>  	if (flags & O_CREAT) {
>  		if (ceph_quota_is_max_files_exceeded(dir))
>  			return -EDQUOT;
> @@ -757,6 +761,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
>  		/* If it's not being looked up, it's negative */
>  		return -ENOENT;
>  	}
> +
>  retry:
>  	/* do the open */
>  	req = prepare_open_request(dir->i_sb, flags, mode);
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index e8c87dea0551..0ae0e0110eb4 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -468,6 +468,75 @@ static int ceph_parse_deleg_inos(void **p, void *end,
>  	return -EIO;
>  }
>  
> +/*
> + * In the async unlink case the kclient won't wait for the first reply
> + * from the MDS; it just drops all the links, unhashes the dentry and
> + * then succeeds immediately.
> + *
> + * For any new create/link/rename, etc. requests that reuse the same
> + * file names, we must wait for the first reply of the inflight
> + * unlink request, or the MDS may fail these subsequent requests
> + * with -EEXIST if the inflight async unlink request was delayed
> + * for some reason.
> + *
> + * The worst case is that for a non-async openc request it will
> + * successfully open the file if the CDentry hasn't been unlinked yet,
> + * but the previously delayed async unlink request will then remove
> + * the CDentry. That means the just-created file may be deleted
> + * later by accident.
> + *
> + * We need to wait for the inflight async unlink requests to finish
> + * before creating new files/directories with the same file names.
> + */
> +int ceph_wait_on_conflict_unlink(struct dentry *dentry)
> +{
> +	struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
> +	struct dentry *pdentry = dentry->d_parent;
> +	struct dentry *udentry, *found = NULL;
> +	struct ceph_dentry_info *di;
> +	struct qstr dname;
> +	u32 hash = dentry->d_name.hash;
> +	int err;
> +
> +	dname.name = dentry->d_name.name;
> +	dname.len = dentry->d_name.len;
> +
> +	rcu_read_lock();
> +	hash_for_each_possible_rcu(fsc->async_unlink_conflict, di,
> +				   hnode, hash) {
> +		udentry = di->dentry;
> +
> +		spin_lock(&udentry->d_lock);
> +		if (udentry->d_name.hash != hash)
> +			goto next;
> +		if (unlikely(udentry->d_parent != pdentry))
> +			goto next;
> +		if (!hash_hashed(&di->hnode))
> +			goto next;
> +
> +		if (!test_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags))
> +			goto next;
> +

Maybe this should be a warning? Will we ever have entries in this
hashtable that don't have this bit set?

> +		if (d_compare(pdentry, udentry, &dname))
> +			goto next;
> +
> +		spin_unlock(&udentry->d_lock);
> +		found = dget(udentry);
> +		break;
> +next:
> +		spin_unlock(&udentry->d_lock);
> +	}
> +	rcu_read_unlock();
> +
> +	if (likely(!found))
> +		return 0;
> +
> +	err = wait_on_bit(&di->flags, CEPH_DENTRY_ASYNC_UNLINK_BIT,
> +			  TASK_INTERRUPTIBLE);
> +	dput(found);
> +	return err;
> +}
> +
>  u64 ceph_get_deleg_ino(struct ceph_mds_session *s)
>  {
>  	unsigned long ino;
> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> index 33497846e47e..d1ae679c52c3 100644
> --- a/fs/ceph/mds_client.h
> +++ b/fs/ceph/mds_client.h
> @@ -582,6 +582,7 @@ static inline int ceph_wait_on_async_create(struct inode *inode)
>  			   TASK_INTERRUPTIBLE);
>  }
>  
> +extern int ceph_wait_on_conflict_unlink(struct dentry *dentry);
>  extern u64 ceph_get_deleg_ino(struct ceph_mds_session *session);
>  extern int ceph_restore_deleg_ino(struct ceph_mds_session *session, u64 ino);
>  #endif
> diff --git a/fs/ceph/super.c b/fs/ceph/super.c
> index b73b4f75462c..7ae65001f04c 100644
> --- a/fs/ceph/super.c
> +++ b/fs/ceph/super.c
> @@ -816,6 +816,8 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
>  	if (!fsc->cap_wq)
>  		goto fail_inode_wq;
>  
> +	hash_init(fsc->async_unlink_conflict);
> +
>  	spin_lock(&ceph_fsc_lock);
>  	list_add_tail(&fsc->metric_wakeup, &ceph_fsc_list);
>  	spin_unlock(&ceph_fsc_lock);
> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> index 506d52633627..58bbb5df42da 100644
> --- a/fs/ceph/super.h
> +++ b/fs/ceph/super.h
> @@ -19,6 +19,7 @@
>  #include <linux/security.h>
>  #include <linux/netfs.h>
>  #include <linux/fscache.h>
> +#include <linux/hashtable.h>
>  
>  #include <linux/ceph/libceph.h>
>  
> @@ -99,6 +100,8 @@ struct ceph_mount_options {
>  	char *mon_addr;
>  };
>  
> +#define CEPH_ASYNC_CREATE_CONFLICT_BITS 12
> +
>  struct ceph_fs_client {
>  	struct super_block *sb;
>  
> @@ -124,6 +127,8 @@ struct ceph_fs_client {
>  	struct workqueue_struct *inode_wq;
>  	struct workqueue_struct *cap_wq;
>  
> +	DECLARE_HASHTABLE(async_unlink_conflict, CEPH_ASYNC_CREATE_CONFLICT_BITS);
> +
>  #ifdef CONFIG_DEBUG_FS
>  	struct dentry *debugfs_dentry_lru, *debugfs_caps;
>  	struct dentry *debugfs_congestion_kb;
> @@ -281,7 +286,8 @@ struct ceph_dentry_info {
>  	struct dentry *dentry;
>  	struct ceph_mds_session *lease_session;
>  	struct list_head lease_list;
> -	unsigned flags;
> +	struct hlist_node hnode;
> +	unsigned long flags;
>  	int lease_shared_gen;
>  	u32 lease_gen;
>  	u32 lease_seq;
> @@ -290,10 +296,12 @@ struct ceph_dentry_info {
>  	u64 offset;
>  };
>  
> -#define CEPH_DENTRY_REFERENCED		1
> -#define CEPH_DENTRY_LEASE_LIST		2
> -#define CEPH_DENTRY_SHRINK_LIST		4
> -#define CEPH_DENTRY_PRIMARY_LINK	8
> +#define CEPH_DENTRY_REFERENCED		(1 << 0)
> +#define CEPH_DENTRY_LEASE_LIST		(1 << 1)
> +#define CEPH_DENTRY_SHRINK_LIST		(1 << 2)
> +#define CEPH_DENTRY_PRIMARY_LINK	(1 << 3)
> +#define CEPH_DENTRY_ASYNC_UNLINK_BIT	(4)
> +#define CEPH_DENTRY_ASYNC_UNLINK	(1 << CEPH_DENTRY_ASYNC_UNLINK_BIT)
>  
>  struct ceph_inode_xattrs_info {
>  	/*

-- 
Jeff Layton <jlayton@kernel.org>


* Re: [PATCH 2/2] ceph: wait for the first reply of inflight unlink/rmdir
  2022-05-16 13:23   ` Jeff Layton
@ 2022-05-16 14:56     ` Xiubo Li
  2022-05-16 16:58       ` Jeff Layton
  0 siblings, 1 reply; 10+ messages in thread
From: Xiubo Li @ 2022-05-16 14:56 UTC (permalink / raw)
  To: Jeff Layton, viro
  Cc: idryomov, vshankar, ceph-devel, mcgrof, akpm, arnd,
	linux-fsdevel, linux-kernel


On 5/16/22 9:23 PM, Jeff Layton wrote:
> On Mon, 2022-05-16 at 20:20 +0800, Xiubo Li wrote:
>> In the async unlink case the kclient won't wait for the first reply
>> from the MDS; it just drops all the links, unhashes the dentry and
>> then succeeds immediately.
>>
>> For any new create/link/rename, etc. requests that reuse the same
>> file names, we must wait for the first reply of the inflight
>> unlink request, or the MDS may fail these subsequent requests
>> with -EEXIST if the inflight async unlink request was delayed
>> for some reason.
>>
>> The worst case is that for a non-async openc request it will
>> successfully open the file if the CDentry hasn't been unlinked yet,
>> but the previously delayed async unlink request will then remove
>> the CDentry. That means the just-created file may be deleted
>> later by accident.
>>
>> We need to wait for the inflight async unlink requests to finish
>> before creating new files/directories with the same file names.
>>
>> URL: https://tracker.ceph.com/issues/55332
>> Signed-off-by: Xiubo Li <xiubli@redhat.com>
>> ---
>>   fs/ceph/dir.c        | 55 +++++++++++++++++++++++++++++++----
>>   fs/ceph/file.c       |  5 ++++
>>   fs/ceph/mds_client.c | 69 ++++++++++++++++++++++++++++++++++++++++++++
>>   fs/ceph/mds_client.h |  1 +
>>   fs/ceph/super.c      |  2 ++
>>   fs/ceph/super.h      | 18 ++++++++----
>>   6 files changed, 140 insertions(+), 10 deletions(-)
>>
>> diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
>> index eae417d71136..20c648406528 100644
>> --- a/fs/ceph/dir.c
>> +++ b/fs/ceph/dir.c
>> @@ -856,6 +856,10 @@ static int ceph_mknod(struct user_namespace *mnt_userns, struct inode *dir,
>>   	if (ceph_snap(dir) != CEPH_NOSNAP)
>>   		return -EROFS;
>>   
>> +	err = ceph_wait_on_conflict_unlink(dentry);
>> +	if (err)
>> +		return err;
>> +
>>   	if (ceph_quota_is_max_files_exceeded(dir)) {
>>   		err = -EDQUOT;
>>   		goto out;
>> @@ -918,6 +922,10 @@ static int ceph_symlink(struct user_namespace *mnt_userns, struct inode *dir,
>>   	if (ceph_snap(dir) != CEPH_NOSNAP)
>>   		return -EROFS;
>>   
>> +	err = ceph_wait_on_conflict_unlink(dentry);
>> +	if (err)
>> +		return err;
>> +
>>   	if (ceph_quota_is_max_files_exceeded(dir)) {
>>   		err = -EDQUOT;
>>   		goto out;
>> @@ -968,9 +976,13 @@ static int ceph_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
>>   	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
>>   	struct ceph_mds_request *req;
>>   	struct ceph_acl_sec_ctx as_ctx = {};
>> -	int err = -EROFS;
>> +	int err;
>>   	int op;
>>   
>> +	err = ceph_wait_on_conflict_unlink(dentry);
>> +	if (err)
>> +		return err;
>> +
>>   	if (ceph_snap(dir) == CEPH_SNAPDIR) {
>>   		/* mkdir .snap/foo is a MKSNAP */
>>   		op = CEPH_MDS_OP_MKSNAP;
>> @@ -980,6 +992,7 @@ static int ceph_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
>>   		dout("mkdir dir %p dn %p mode 0%ho\n", dir, dentry, mode);
>>   		op = CEPH_MDS_OP_MKDIR;
>>   	} else {
>> +		err = -EROFS;
>>   		goto out;
>>   	}
>>   
>> @@ -1037,6 +1050,10 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
>>   	struct ceph_mds_request *req;
>>   	int err;
>>   
>> +	err = ceph_wait_on_conflict_unlink(dentry);
>> +	if (err)
>> +		return err;
>> +
>>   	if (ceph_snap(dir) != CEPH_NOSNAP)
>>   		return -EROFS;
>>   
>> @@ -1071,9 +1088,24 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
>>   static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
>>   				 struct ceph_mds_request *req)
>>   {
>> +	struct dentry *dentry = req->r_dentry;
>> +	struct ceph_dentry_info *di = ceph_dentry(dentry);
>>   	int result = req->r_err ? req->r_err :
>>   			le32_to_cpu(req->r_reply_info.head->result);
>>   
>> +	if (test_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags)) {
>> +		BUG_ON(req->r_op != CEPH_MDS_OP_UNLINK);
>> +
>> +		hash_del_rcu(&di->hnode);
>> +
>> +		spin_lock(&dentry->d_lock);
>> +		di->flags &= ~CEPH_DENTRY_ASYNC_UNLINK;
>> +		wake_up_bit(&di->flags, CEPH_DENTRY_ASYNC_UNLINK_BIT);
>> +		spin_unlock(&dentry->d_lock);
>> +
>> +		synchronize_rcu();
>> +	}
>> +
>>   	if (result == -EJUKEBOX)
>>   		goto out;
>>   
>> @@ -1081,7 +1113,7 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
>>   	if (result) {
>>   		int pathlen = 0;
>>   		u64 base = 0;
>> -		char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
>> +		char *path = ceph_mdsc_build_path(dentry, &pathlen,
>>   						  &base, 0);
>>   
>>   		/* mark error on parent + clear complete */
>> @@ -1089,13 +1121,13 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
>>   		ceph_dir_clear_complete(req->r_parent);
>>   
>>   		/* drop the dentry -- we don't know its status */
>> -		if (!d_unhashed(req->r_dentry))
>> -			d_drop(req->r_dentry);
>> +		if (!d_unhashed(dentry))
>> +			d_drop(dentry);
>>   
>>   		/* mark inode itself for an error (since metadata is bogus) */
>>   		mapping_set_error(req->r_old_inode->i_mapping, result);
>>   
>> -		pr_warn("ceph: async unlink failure path=(%llx)%s result=%d!\n",
>> +		pr_warn("async unlink failure path=(%llx)%s result=%d!\n",
>>   			base, IS_ERR(path) ? "<<bad>>" : path, result);
>>   		ceph_mdsc_free_path(path, pathlen);
>>   	}
>> @@ -1189,12 +1221,21 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
>>   		ihold(req->r_old_inode);
>>   		err = ceph_mdsc_submit_request(mdsc, dir, req);
>>   		if (!err) {
>> +			struct ceph_dentry_info *di;
>> +
>>   			/*
>>   			 * We have enough caps, so we assume that the unlink
>>   			 * will succeed. Fix up the target inode and dcache.
>>   			 */
>>   			drop_nlink(inode);
>>   			d_delete(dentry);
>> +
>> +			spin_lock(&dentry->d_lock);
>> +			di = ceph_dentry(dentry);
>> +			di->flags |= CEPH_DENTRY_ASYNC_UNLINK;
>> +			hash_add_rcu(fsc->async_unlink_conflict, &di->hnode,
>> +				     dentry->d_name.hash);
>> +			spin_unlock(&dentry->d_lock);
> This looks racy. It's possible that the reply comes in before we get to
> the point of setting this flag. You probably want to do this before
> calling ceph_mdsc_submit_request, and just unwind it if the submission
> fails.

Ah, right. Will fix it.


>
> Also, you do still need some sort of lock to protect the
> hash_add/del/_rcu calls.

Sure, will fix it too.

>   Those don't do any locking on their own. The
> d_lock is insufficient here since it can't protect the whole list. You
> may be able to use the i_ceph_lock of the parent though?

The hashtable is shared by the whole sb, so we couldn't use the
i_ceph_lock here. I will add a dedicated spinlock for each sb.
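
Something like this (untested sketch, the lock name is just a
placeholder):

	/* in struct ceph_fs_client */
	spinlock_t async_unlink_conflict_lock;

	/* when queueing the async unlink request */
	spin_lock(&fsc->async_unlink_conflict_lock);
	hash_add_rcu(fsc->async_unlink_conflict, &di->hnode,
		     dentry->d_name.hash);
	spin_unlock(&fsc->async_unlink_conflict_lock);

	/* and in ceph_async_unlink_cb() */
	spin_lock(&fsc->async_unlink_conflict_lock);
	hash_del_rcu(&di->hnode);
	spin_unlock(&fsc->async_unlink_conflict_lock);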

>>   		} else if (err == -EJUKEBOX) {
>>   			try_async = false;
>>   			ceph_mdsc_put_request(req);
>> @@ -1237,6 +1278,10 @@ static int ceph_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
>>   	    (!ceph_quota_is_same_realm(old_dir, new_dir)))
>>   		return -EXDEV;
>>   
>> +	err = ceph_wait_on_conflict_unlink(new_dentry);
>> +	if (err)
>> +		return err;
>> +
>>   	dout("rename dir %p dentry %p to dir %p dentry %p\n",
>>   	     old_dir, old_dentry, new_dir, new_dentry);
>>   	req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
>> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
>> index 8c8226c0feac..47d068e6436a 100644
>> --- a/fs/ceph/file.c
>> +++ b/fs/ceph/file.c
>> @@ -740,6 +740,10 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
>>   	if (dentry->d_name.len > NAME_MAX)
>>   		return -ENAMETOOLONG;
>>   
>> +	err = ceph_wait_on_conflict_unlink(dentry);
>> +	if (err)
>> +		return err;
>> +
> What might be nice here eventually is to not block an async create here,
> but instead queue the request so that it gets transmitted after the
> async unlink reply comes in.
>
> That'll be hard to get right though, so this is fine for now.

Sure.

>
>>   	if (flags & O_CREAT) {
>>   		if (ceph_quota_is_max_files_exceeded(dir))
>>   			return -EDQUOT;
>> @@ -757,6 +761,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
>>   		/* If it's not being looked up, it's negative */
>>   		return -ENOENT;
>>   	}
>> +
>>   retry:
>>   	/* do the open */
>>   	req = prepare_open_request(dir->i_sb, flags, mode);
>> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
>> index e8c87dea0551..0ae0e0110eb4 100644
>> --- a/fs/ceph/mds_client.c
>> +++ b/fs/ceph/mds_client.c
>> @@ -468,6 +468,75 @@ static int ceph_parse_deleg_inos(void **p, void *end,
>>   	return -EIO;
>>   }
>>   
>> +/*
>> + * In the async unlink case the kclient won't wait for the first reply
>> + * from the MDS; it just drops all the links, unhashes the dentry and
>> + * then succeeds immediately.
>> + *
>> + * For any new create/link/rename, etc. requests that reuse the same
>> + * file names, we must wait for the first reply of the inflight
>> + * unlink request, or the MDS may fail these subsequent requests
>> + * with -EEXIST if the inflight async unlink request was delayed
>> + * for some reason.
>> + *
>> + * The worst case is that for a non-async openc request it will
>> + * successfully open the file if the CDentry hasn't been unlinked yet,
>> + * but the previously delayed async unlink request will then remove
>> + * the CDentry. That means the just-created file may be deleted
>> + * later by accident.
>> + *
>> + * We need to wait for the inflight async unlink requests to finish
>> + * before creating new files/directories with the same file names.
>> + */
>> +int ceph_wait_on_conflict_unlink(struct dentry *dentry)
>> +{
>> +	struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
>> +	struct dentry *pdentry = dentry->d_parent;
>> +	struct dentry *udentry, *found = NULL;
>> +	struct ceph_dentry_info *di;
>> +	struct qstr dname;
>> +	u32 hash = dentry->d_name.hash;
>> +	int err;
>> +
>> +	dname.name = dentry->d_name.name;
>> +	dname.len = dentry->d_name.len;
>> +
>> +	rcu_read_lock();
>> +	hash_for_each_possible_rcu(fsc->async_unlink_conflict, di,
>> +				   hnode, hash) {
>> +		udentry = di->dentry;
>> +
>> +		spin_lock(&udentry->d_lock);
>> +		if (udentry->d_name.hash != hash)
>> +			goto next;
>> +		if (unlikely(udentry->d_parent != pdentry))
>> +			goto next;
>> +		if (!hash_hashed(&di->hnode))
>> +			goto next;
>> +
>> +		if (!test_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags))
>> +			goto next;
>> +
> Maybe this should be a warning? Will we ever have entries in this
> hashtable that don't have this bit set?

Just before we take "spin_lock(&udentry->d_lock)" the udentry could
already have been removed from the hashtable and had the bit cleared?

-- Xiubo

>
>> +		if (d_compare(pdentry, udentry, &dname))
>> +			goto next;
>> +
>> +		spin_unlock(&udentry->d_lock);
>> +		found = dget(udentry);
>> +		break;
>> +next:
>> +		spin_unlock(&udentry->d_lock);
>> +	}
>> +	rcu_read_unlock();
>> +
>> +	if (likely(!found))
>> +		return 0;
>> +
>> +	err = wait_on_bit(&di->flags, CEPH_DENTRY_ASYNC_UNLINK_BIT,
>> +			  TASK_INTERRUPTIBLE);
>> +	dput(found);
>> +	return err;
>> +}
>> +
>>   u64 ceph_get_deleg_ino(struct ceph_mds_session *s)
>>   {
>>   	unsigned long ino;
>> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
>> index 33497846e47e..d1ae679c52c3 100644
>> --- a/fs/ceph/mds_client.h
>> +++ b/fs/ceph/mds_client.h
>> @@ -582,6 +582,7 @@ static inline int ceph_wait_on_async_create(struct inode *inode)
>>   			   TASK_INTERRUPTIBLE);
>>   }
>>   
>> +extern int ceph_wait_on_conflict_unlink(struct dentry *dentry);
>>   extern u64 ceph_get_deleg_ino(struct ceph_mds_session *session);
>>   extern int ceph_restore_deleg_ino(struct ceph_mds_session *session, u64 ino);
>>   #endif
>> diff --git a/fs/ceph/super.c b/fs/ceph/super.c
>> index b73b4f75462c..7ae65001f04c 100644
>> --- a/fs/ceph/super.c
>> +++ b/fs/ceph/super.c
>> @@ -816,6 +816,8 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
>>   	if (!fsc->cap_wq)
>>   		goto fail_inode_wq;
>>   
>> +	hash_init(fsc->async_unlink_conflict);
>> +
>>   	spin_lock(&ceph_fsc_lock);
>>   	list_add_tail(&fsc->metric_wakeup, &ceph_fsc_list);
>>   	spin_unlock(&ceph_fsc_lock);
>> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
>> index 506d52633627..58bbb5df42da 100644
>> --- a/fs/ceph/super.h
>> +++ b/fs/ceph/super.h
>> @@ -19,6 +19,7 @@
>>   #include <linux/security.h>
>>   #include <linux/netfs.h>
>>   #include <linux/fscache.h>
>> +#include <linux/hashtable.h>
>>   
>>   #include <linux/ceph/libceph.h>
>>   
>> @@ -99,6 +100,8 @@ struct ceph_mount_options {
>>   	char *mon_addr;
>>   };
>>   
>> +#define CEPH_ASYNC_CREATE_CONFLICT_BITS 12
>> +
>>   struct ceph_fs_client {
>>   	struct super_block *sb;
>>   
>> @@ -124,6 +127,8 @@ struct ceph_fs_client {
>>   	struct workqueue_struct *inode_wq;
>>   	struct workqueue_struct *cap_wq;
>>   
>> +	DECLARE_HASHTABLE(async_unlink_conflict, CEPH_ASYNC_CREATE_CONFLICT_BITS);
>> +
>>   #ifdef CONFIG_DEBUG_FS
>>   	struct dentry *debugfs_dentry_lru, *debugfs_caps;
>>   	struct dentry *debugfs_congestion_kb;
>> @@ -281,7 +286,8 @@ struct ceph_dentry_info {
>>   	struct dentry *dentry;
>>   	struct ceph_mds_session *lease_session;
>>   	struct list_head lease_list;
>> -	unsigned flags;
>> +	struct hlist_node hnode;
>> +	unsigned long flags;
>>   	int lease_shared_gen;
>>   	u32 lease_gen;
>>   	u32 lease_seq;
>> @@ -290,10 +296,12 @@ struct ceph_dentry_info {
>>   	u64 offset;
>>   };
>>   
>> -#define CEPH_DENTRY_REFERENCED		1
>> -#define CEPH_DENTRY_LEASE_LIST		2
>> -#define CEPH_DENTRY_SHRINK_LIST		4
>> -#define CEPH_DENTRY_PRIMARY_LINK	8
>> +#define CEPH_DENTRY_REFERENCED		(1 << 0)
>> +#define CEPH_DENTRY_LEASE_LIST		(1 << 1)
>> +#define CEPH_DENTRY_SHRINK_LIST		(1 << 2)
>> +#define CEPH_DENTRY_PRIMARY_LINK	(1 << 3)
>> +#define CEPH_DENTRY_ASYNC_UNLINK_BIT	(4)
>> +#define CEPH_DENTRY_ASYNC_UNLINK	(1 << CEPH_DENTRY_ASYNC_UNLINK_BIT)
>>   
>>   struct ceph_inode_xattrs_info {
>>   	/*



* Re: [PATCH 2/2] ceph: wait for the first reply of inflight unlink/rmdir
  2022-05-16 14:56     ` Xiubo Li
@ 2022-05-16 16:58       ` Jeff Layton
  2022-05-17  0:48         ` Xiubo Li
  0 siblings, 1 reply; 10+ messages in thread
From: Jeff Layton @ 2022-05-16 16:58 UTC (permalink / raw)
  To: Xiubo Li, viro
  Cc: idryomov, vshankar, ceph-devel, mcgrof, akpm, arnd,
	linux-fsdevel, linux-kernel

On Mon, 2022-05-16 at 22:56 +0800, Xiubo Li wrote:
> On 5/16/22 9:23 PM, Jeff Layton wrote:
> > On Mon, 2022-05-16 at 20:20 +0800, Xiubo Li wrote:
> > > In the async unlink case the kclient won't wait for the first reply
> > > from the MDS; it just drops all the links, unhashes the dentry and
> > > then succeeds immediately.
> > > 
> > > For any new create/link/rename, etc. requests that reuse the same
> > > file names, we must wait for the first reply of the inflight
> > > unlink request, or the MDS may fail these subsequent requests
> > > with -EEXIST if the inflight async unlink request was delayed
> > > for some reason.
> > > 
> > > The worst case is that for a non-async openc request it will
> > > successfully open the file if the CDentry hasn't been unlinked yet,
> > > but the previously delayed async unlink request will then remove
> > > the CDentry. That means the just-created file may be deleted
> > > later by accident.
> > > 
> > > We need to wait for the inflight async unlink requests to finish
> > > before creating new files/directories with the same file names.
> > > 
> > > URL: https://tracker.ceph.com/issues/55332
> > > Signed-off-by: Xiubo Li <xiubli@redhat.com>
> > > ---
> > >   fs/ceph/dir.c        | 55 +++++++++++++++++++++++++++++++----
> > >   fs/ceph/file.c       |  5 ++++
> > >   fs/ceph/mds_client.c | 69 ++++++++++++++++++++++++++++++++++++++++++++
> > >   fs/ceph/mds_client.h |  1 +
> > >   fs/ceph/super.c      |  2 ++
> > >   fs/ceph/super.h      | 18 ++++++++----
> > >   6 files changed, 140 insertions(+), 10 deletions(-)
> > > 
> > > diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
> > > index eae417d71136..20c648406528 100644
> > > --- a/fs/ceph/dir.c
> > > +++ b/fs/ceph/dir.c
> > > @@ -856,6 +856,10 @@ static int ceph_mknod(struct user_namespace *mnt_userns, struct inode *dir,
> > >   	if (ceph_snap(dir) != CEPH_NOSNAP)
> > >   		return -EROFS;
> > >   
> > > +	err = ceph_wait_on_conflict_unlink(dentry);
> > > +	if (err)
> > > +		return err;
> > > +
> > >   	if (ceph_quota_is_max_files_exceeded(dir)) {
> > >   		err = -EDQUOT;
> > >   		goto out;
> > > @@ -918,6 +922,10 @@ static int ceph_symlink(struct user_namespace *mnt_userns, struct inode *dir,
> > >   	if (ceph_snap(dir) != CEPH_NOSNAP)
> > >   		return -EROFS;
> > >   
> > > +	err = ceph_wait_on_conflict_unlink(dentry);
> > > +	if (err)
> > > +		return err;
> > > +
> > >   	if (ceph_quota_is_max_files_exceeded(dir)) {
> > >   		err = -EDQUOT;
> > >   		goto out;
> > > @@ -968,9 +976,13 @@ static int ceph_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
> > >   	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
> > >   	struct ceph_mds_request *req;
> > >   	struct ceph_acl_sec_ctx as_ctx = {};
> > > -	int err = -EROFS;
> > > +	int err;
> > >   	int op;
> > >   
> > > +	err = ceph_wait_on_conflict_unlink(dentry);
> > > +	if (err)
> > > +		return err;
> > > +
> > >   	if (ceph_snap(dir) == CEPH_SNAPDIR) {
> > >   		/* mkdir .snap/foo is a MKSNAP */
> > >   		op = CEPH_MDS_OP_MKSNAP;
> > > @@ -980,6 +992,7 @@ static int ceph_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
> > >   		dout("mkdir dir %p dn %p mode 0%ho\n", dir, dentry, mode);
> > >   		op = CEPH_MDS_OP_MKDIR;
> > >   	} else {
> > > +		err = -EROFS;
> > >   		goto out;
> > >   	}
> > >   
> > > @@ -1037,6 +1050,10 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
> > >   	struct ceph_mds_request *req;
> > >   	int err;
> > >   
> > > +	err = ceph_wait_on_conflict_unlink(dentry);
> > > +	if (err)
> > > +		return err;
> > > +
> > >   	if (ceph_snap(dir) != CEPH_NOSNAP)
> > >   		return -EROFS;
> > >   
> > > @@ -1071,9 +1088,24 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
> > >   static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
> > >   				 struct ceph_mds_request *req)
> > >   {
> > > +	struct dentry *dentry = req->r_dentry;
> > > +	struct ceph_dentry_info *di = ceph_dentry(dentry);
> > >   	int result = req->r_err ? req->r_err :
> > >   			le32_to_cpu(req->r_reply_info.head->result);
> > >   
> > > +	if (test_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags)) {
> > > +		BUG_ON(req->r_op != CEPH_MDS_OP_UNLINK);
> > > +
> > > +		hash_del_rcu(&di->hnode);
> > > +
> > > +		spin_lock(&dentry->d_lock);
> > > +		di->flags &= ~CEPH_DENTRY_ASYNC_UNLINK;
> > > +		wake_up_bit(&di->flags, CEPH_DENTRY_ASYNC_UNLINK_BIT);
> > > +		spin_unlock(&dentry->d_lock);
> > > +
> > > +		synchronize_rcu();
> > > +	}
> > > +
> > >   	if (result == -EJUKEBOX)
> > >   		goto out;
> > >   
> > > @@ -1081,7 +1113,7 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
> > >   	if (result) {
> > >   		int pathlen = 0;
> > >   		u64 base = 0;
> > > -		char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
> > > +		char *path = ceph_mdsc_build_path(dentry, &pathlen,
> > >   						  &base, 0);
> > >   
> > >   		/* mark error on parent + clear complete */
> > > @@ -1089,13 +1121,13 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
> > >   		ceph_dir_clear_complete(req->r_parent);
> > >   
> > >   		/* drop the dentry -- we don't know its status */
> > > -		if (!d_unhashed(req->r_dentry))
> > > -			d_drop(req->r_dentry);
> > > +		if (!d_unhashed(dentry))
> > > +			d_drop(dentry);
> > >   
> > >   		/* mark inode itself for an error (since metadata is bogus) */
> > >   		mapping_set_error(req->r_old_inode->i_mapping, result);
> > >   
> > > -		pr_warn("ceph: async unlink failure path=(%llx)%s result=%d!\n",
> > > +		pr_warn("async unlink failure path=(%llx)%s result=%d!\n",
> > >   			base, IS_ERR(path) ? "<<bad>>" : path, result);
> > >   		ceph_mdsc_free_path(path, pathlen);
> > >   	}
> > > @@ -1189,12 +1221,21 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
> > >   		ihold(req->r_old_inode);
> > >   		err = ceph_mdsc_submit_request(mdsc, dir, req);
> > >   		if (!err) {
> > > +			struct ceph_dentry_info *di;
> > > +
> > >   			/*
> > >   			 * We have enough caps, so we assume that the unlink
> > >   			 * will succeed. Fix up the target inode and dcache.
> > >   			 */
> > >   			drop_nlink(inode);
> > >   			d_delete(dentry);
> > > +
> > > +			spin_lock(&dentry->d_lock);
> > > +			di = ceph_dentry(dentry);
> > > +			di->flags |= CEPH_DENTRY_ASYNC_UNLINK;
> > > +			hash_add_rcu(fsc->async_unlink_conflict, &di->hnode,
> > > +				     dentry->d_name.hash);
> > > +			spin_unlock(&dentry->d_lock);
> > This looks racy. It's possible that the reply comes in before we get to
> > the point of setting this flag. You probably want to do this before
> > calling ceph_mdsc_submit_request, and just unwind it if the submission
> > fails.
> 
> Ah, right. Will fix it.
> 
> 
> > 
> > Also, you do still need some sort of lock to protect the
> > hash_add/del/_rcu calls.
> 
> Sure, will fix it too.
> 
> >   Those don't do any locking on their own. The
> > d_lock is insufficient here since it can't protect the whole list. You
> > may be able to use the i_ceph_lock of the parent though?
> 
> The hashtable is shared by the whole sb, so we couldn't use the
> i_ceph_lock here. I will add a dedicated spinlock for each sb.
> 
> > >   		} else if (err == -EJUKEBOX) {
> > >   			try_async = false;
> > >   			ceph_mdsc_put_request(req);
> > > @@ -1237,6 +1278,10 @@ static int ceph_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
> > >   	    (!ceph_quota_is_same_realm(old_dir, new_dir)))
> > >   		return -EXDEV;
> > >   
> > > +	err = ceph_wait_on_conflict_unlink(new_dentry);
> > > +	if (err)
> > > +		return err;
> > > +
> > >   	dout("rename dir %p dentry %p to dir %p dentry %p\n",
> > >   	     old_dir, old_dentry, new_dir, new_dentry);
> > >   	req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
> > > diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> > > index 8c8226c0feac..47d068e6436a 100644
> > > --- a/fs/ceph/file.c
> > > +++ b/fs/ceph/file.c
> > > @@ -740,6 +740,10 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
> > >   	if (dentry->d_name.len > NAME_MAX)
> > >   		return -ENAMETOOLONG;
> > >   
> > > +	err = ceph_wait_on_conflict_unlink(dentry);
> > > +	if (err)
> > > +		return err;
> > > +
> > What might be nice here eventually is to not block an async create here,
> > but instead queue the request so that it gets transmitted after the
> > async unlink reply comes in.
> > 
> > That'll be hard to get right though, so this is fine for now.
> 
> Sure.
> 
> > 
> > >   	if (flags & O_CREAT) {
> > >   		if (ceph_quota_is_max_files_exceeded(dir))
> > >   			return -EDQUOT;
> > > @@ -757,6 +761,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
> > >   		/* If it's not being looked up, it's negative */
> > >   		return -ENOENT;
> > >   	}
> > > +
> > >   retry:
> > >   	/* do the open */
> > >   	req = prepare_open_request(dir->i_sb, flags, mode);
> > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> > > index e8c87dea0551..0ae0e0110eb4 100644
> > > --- a/fs/ceph/mds_client.c
> > > +++ b/fs/ceph/mds_client.c
> > > @@ -468,6 +468,75 @@ static int ceph_parse_deleg_inos(void **p, void *end,
> > >   	return -EIO;
> > >   }
> > >   
> > > +/*
> > > + * In the async unlink case the kclient won't wait for the first reply
> > > + * from the MDS; it just drops all the links, unhashes the dentry and
> > > + * then succeeds immediately.
> > > + *
> > > + * For any new create/link/rename, etc. requests that reuse the same
> > > + * file names, we must wait for the first reply of the inflight
> > > + * unlink request, or the MDS may fail these subsequent requests
> > > + * with -EEXIST if the inflight async unlink request was delayed
> > > + * for some reason.
> > > + *
> > > + * The worst case is that for a non-async openc request it will
> > > + * successfully open the file if the CDentry hasn't been unlinked yet,
> > > + * but the previously delayed async unlink request will then remove
> > > + * the CDentry. That means the just-created file may be deleted
> > > + * later by accident.
> > > + *
> > > + * We need to wait for the inflight async unlink requests to finish
> > > + * before creating new files/directories with the same file names.
> > > + */
> > > +int ceph_wait_on_conflict_unlink(struct dentry *dentry)
> > > +{
> > > +	struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
> > > +	struct dentry *pdentry = dentry->d_parent;
> > > +	struct dentry *udentry, *found = NULL;
> > > +	struct ceph_dentry_info *di;
> > > +	struct qstr dname;
> > > +	u32 hash = dentry->d_name.hash;
> > > +	int err;
> > > +
> > > +	dname.name = dentry->d_name.name;
> > > +	dname.len = dentry->d_name.len;
> > > +
> > > +	rcu_read_lock();
> > > +	hash_for_each_possible_rcu(fsc->async_unlink_conflict, di,
> > > +				   hnode, hash) {
> > > +		udentry = di->dentry;
> > > +
> > > +		spin_lock(&udentry->d_lock);
> > > +		if (udentry->d_name.hash != hash)
> > > +			goto next;
> > > +		if (unlikely(udentry->d_parent != pdentry))
> > > +			goto next;
> > > +		if (!hash_hashed(&di->hnode))
> > > +			goto next;
> > > +
> > > +		if (!test_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags))
> > > +			goto next;
> > > +
> > Maybe this should be a warning? Will we ever have entries in this
> > hashtable that don't have this bit set?
> 
> Just before we take "spin_lock(&udentry->d_lock)" the udentry could
> already have been removed from the hashtable and the bit cleared?
> 
> 


The point is that you're removing the dentry from the hash before you
clear the flag, so there should never be a dentry in the hash that has
the flag cleared.
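
To illustrate with the ordering from your patch, the callback does:

	hash_del_rcu(&di->hnode);

	spin_lock(&dentry->d_lock);
	di->flags &= ~CEPH_DENTRY_ASYNC_UNLINK;
	wake_up_bit(&di->flags, CEPH_DENTRY_ASYNC_UNLINK_BIT);
	spin_unlock(&dentry->d_lock);

...so any di that the lookup still finds hashed must have the bit set.
A clear bit there would indicate a bug, not an expected race.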

> > 
> > > +		if (d_compare(pdentry, udentry, &dname))
> > > +			goto next;
> > > +
> > > +		spin_unlock(&udentry->d_lock);
> > > +		found = dget(udentry);
> > > +		break;
> > > +next:
> > > +		spin_unlock(&udentry->d_lock);
> > > +	}
> > > +	rcu_read_unlock();
> > > +
> > > +	if (likely(!found))
> > > +		return 0;
> > > +
> > > +	err = wait_on_bit(&di->flags, CEPH_DENTRY_ASYNC_UNLINK_BIT,
> > > +			  TASK_INTERRUPTIBLE);
> > > +	dput(found);
> > > +	return err;
> > > +}
> > > +
> > >   u64 ceph_get_deleg_ino(struct ceph_mds_session *s)
> > >   {
> > >   	unsigned long ino;
> > > diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> > > index 33497846e47e..d1ae679c52c3 100644
> > > --- a/fs/ceph/mds_client.h
> > > +++ b/fs/ceph/mds_client.h
> > > @@ -582,6 +582,7 @@ static inline int ceph_wait_on_async_create(struct inode *inode)
> > >   			   TASK_INTERRUPTIBLE);
> > >   }
> > >   
> > > +extern int ceph_wait_on_conflict_unlink(struct dentry *dentry);
> > >   extern u64 ceph_get_deleg_ino(struct ceph_mds_session *session);
> > >   extern int ceph_restore_deleg_ino(struct ceph_mds_session *session, u64 ino);
> > >   #endif
> > > diff --git a/fs/ceph/super.c b/fs/ceph/super.c
> > > index b73b4f75462c..7ae65001f04c 100644
> > > --- a/fs/ceph/super.c
> > > +++ b/fs/ceph/super.c
> > > @@ -816,6 +816,8 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
> > >   	if (!fsc->cap_wq)
> > >   		goto fail_inode_wq;
> > >   
> > > +	hash_init(fsc->async_unlink_conflict);
> > > +
> > >   	spin_lock(&ceph_fsc_lock);
> > >   	list_add_tail(&fsc->metric_wakeup, &ceph_fsc_list);
> > >   	spin_unlock(&ceph_fsc_lock);
> > > diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> > > index 506d52633627..58bbb5df42da 100644
> > > --- a/fs/ceph/super.h
> > > +++ b/fs/ceph/super.h
> > > @@ -19,6 +19,7 @@
> > >   #include <linux/security.h>
> > >   #include <linux/netfs.h>
> > >   #include <linux/fscache.h>
> > > +#include <linux/hashtable.h>
> > >   
> > >   #include <linux/ceph/libceph.h>
> > >   
> > > @@ -99,6 +100,8 @@ struct ceph_mount_options {
> > >   	char *mon_addr;
> > >   };
> > >   
> > > +#define CEPH_ASYNC_CREATE_CONFLICT_BITS 12
> > > +
> > >   struct ceph_fs_client {
> > >   	struct super_block *sb;
> > >   
> > > @@ -124,6 +127,8 @@ struct ceph_fs_client {
> > >   	struct workqueue_struct *inode_wq;
> > >   	struct workqueue_struct *cap_wq;
> > >   
> > > +	DECLARE_HASHTABLE(async_unlink_conflict, CEPH_ASYNC_CREATE_CONFLICT_BITS);
> > > +
> > >   #ifdef CONFIG_DEBUG_FS
> > >   	struct dentry *debugfs_dentry_lru, *debugfs_caps;
> > >   	struct dentry *debugfs_congestion_kb;
> > > @@ -281,7 +286,8 @@ struct ceph_dentry_info {
> > >   	struct dentry *dentry;
> > >   	struct ceph_mds_session *lease_session;
> > >   	struct list_head lease_list;
> > > -	unsigned flags;
> > > +	struct hlist_node hnode;
> > > +	unsigned long flags;
> > >   	int lease_shared_gen;
> > >   	u32 lease_gen;
> > >   	u32 lease_seq;
> > > @@ -290,10 +296,12 @@ struct ceph_dentry_info {
> > >   	u64 offset;
> > >   };
> > >   
> > > -#define CEPH_DENTRY_REFERENCED		1
> > > -#define CEPH_DENTRY_LEASE_LIST		2
> > > -#define CEPH_DENTRY_SHRINK_LIST		4
> > > -#define CEPH_DENTRY_PRIMARY_LINK	8
> > > +#define CEPH_DENTRY_REFERENCED		(1 << 0)
> > > +#define CEPH_DENTRY_LEASE_LIST		(1 << 1)
> > > +#define CEPH_DENTRY_SHRINK_LIST		(1 << 2)
> > > +#define CEPH_DENTRY_PRIMARY_LINK	(1 << 3)
> > > +#define CEPH_DENTRY_ASYNC_UNLINK_BIT	(4)
> > > +#define CEPH_DENTRY_ASYNC_UNLINK	(1 << CEPH_DENTRY_ASYNC_UNLINK_BIT)
> > >   
> > >   struct ceph_inode_xattrs_info {
> > >   	/*
> 

-- 
Jeff Layton <jlayton@kernel.org>

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 2/2] ceph: wait the first reply of inflight unlink/rmdir
  2022-05-16 12:20 ` [PATCH 2/2] ceph: wait the first reply of inflight unlink/rmdir Xiubo Li
  2022-05-16 13:23   ` Jeff Layton
@ 2022-05-16 21:05   ` kernel test robot
  2022-05-16 23:18   ` kernel test robot
  2 siblings, 0 replies; 10+ messages in thread
From: kernel test robot @ 2022-05-16 21:05 UTC (permalink / raw)
  To: Xiubo Li, jlayton, viro
  Cc: llvm, kbuild-all, idryomov, vshankar, ceph-devel, mcgrof, akpm,
	arnd, linux-fsdevel, linux-kernel, Xiubo Li

Hi Xiubo,

I love your patch! Yet something to improve:

[auto build test ERROR on ceph-client/for-linus]
[also build test ERROR on linus/master v5.18-rc7 next-20220516]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/intel-lab-lkp/linux/commits/Xiubo-Li/ceph-wait-async-unlink-to-finish/20220516-202249
base:   https://github.com/ceph/ceph-client.git for-linus
config: i386-randconfig-a001-20220516 (https://download.01.org/0day-ci/archive/20220517/202205170402.ez4Z5njh-lkp@intel.com/config)
compiler: clang version 15.0.0 (https://github.com/llvm/llvm-project 853fa8ee225edf2d0de94b0dcbd31bea916e825e)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/85d578952d01b70d71fccd86ccb0fdd1dbd0df8b
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Xiubo-Li/ceph-wait-async-unlink-to-finish/20220516-202249
        git checkout 85d578952d01b70d71fccd86ccb0fdd1dbd0df8b
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=i386 SHELL=/bin/bash

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

>> ld.lld: error: undefined symbol: ceph_wait_on_conflict_unlink
   >>> referenced by dir.c
   >>>               ceph/dir.o:(ceph_link) in archive fs/built-in.a
   >>> referenced by dir.c
   >>>               ceph/dir.o:(ceph_symlink) in archive fs/built-in.a
   >>> referenced by dir.c
   >>>               ceph/dir.o:(ceph_mkdir) in archive fs/built-in.a
   >>> referenced 3 more times

-- 
0-DAY CI Kernel Test Service
https://01.org/lkp

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 2/2] ceph: wait the first reply of inflight unlink/rmdir
  2022-05-16 12:20 ` [PATCH 2/2] ceph: wait the first reply of inflight unlink/rmdir Xiubo Li
  2022-05-16 13:23   ` Jeff Layton
  2022-05-16 21:05   ` kernel test robot
@ 2022-05-16 23:18   ` kernel test robot
  2 siblings, 0 replies; 10+ messages in thread
From: kernel test robot @ 2022-05-16 23:18 UTC (permalink / raw)
  To: Xiubo Li, jlayton, viro
  Cc: kbuild-all, idryomov, vshankar, ceph-devel, mcgrof, akpm, arnd,
	linux-fsdevel, linux-kernel, Xiubo Li

Hi Xiubo,

I love your patch! Yet something to improve:

[auto build test ERROR on ceph-client/for-linus]
[also build test ERROR on linus/master v5.18-rc7 next-20220516]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/intel-lab-lkp/linux/commits/Xiubo-Li/ceph-wait-async-unlink-to-finish/20220516-202249
base:   https://github.com/ceph/ceph-client.git for-linus
config: i386-debian-10.3-kselftests (https://download.01.org/0day-ci/archive/20220517/202205170751.AZfL9JiX-lkp@intel.com/config)
compiler: gcc-11 (Debian 11.2.0-20) 11.2.0
reproduce (this is a W=1 build):
        # https://github.com/intel-lab-lkp/linux/commit/85d578952d01b70d71fccd86ccb0fdd1dbd0df8b
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Xiubo-Li/ceph-wait-async-unlink-to-finish/20220516-202249
        git checkout 85d578952d01b70d71fccd86ccb0fdd1dbd0df8b
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        make W=1 O=build_dir ARCH=i386 SHELL=/bin/bash

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>, old ones prefixed by <<):

>> ERROR: modpost: "ceph_wait_on_conflict_unlink" [fs/ceph/ceph.ko] undefined!
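
(Both failing configs above are 32-bit i386 builds. A plausible cause --
an assumption from the hunk context, not something confirmed in this
thread -- is that the new helper was added right after
ceph_parse_deleg_inos(), which sits inside the 64-bit-only section of
fs/ceph/mds_client.c, so the definition is compiled out on 32-bit while
dir.c and file.c still reference it:

	#if BITS_PER_LONG == 64
	static int ceph_parse_deleg_inos(...) { ... }

	int ceph_wait_on_conflict_unlink(struct dentry *dentry) { ... }
	#endif	/* compiled out when BITS_PER_LONG != 64 */

Defining the helper outside that conditional would make it available to
both configurations.)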

-- 
0-DAY CI Kernel Test Service
https://01.org/lkp

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 2/2] ceph: wait the first reply of inflight unlink/rmdir
  2022-05-16 16:58       ` Jeff Layton
@ 2022-05-17  0:48         ` Xiubo Li
  0 siblings, 0 replies; 10+ messages in thread
From: Xiubo Li @ 2022-05-17  0:48 UTC (permalink / raw)
  To: Jeff Layton, viro
  Cc: idryomov, vshankar, ceph-devel, mcgrof, akpm, arnd,
	linux-fsdevel, linux-kernel


On 5/17/22 12:58 AM, Jeff Layton wrote:
> On Mon, 2022-05-16 at 22:56 +0800, Xiubo Li wrote:
>> On 5/16/22 9:23 PM, Jeff Layton wrote:
>>> On Mon, 2022-05-16 at 20:20 +0800, Xiubo Li wrote:
>>>> In the async unlink case the kclient won't wait for the first reply
>>>> from the MDS; it just drops all the links, unhashes the dentry and
>>>> then succeeds immediately.
>>>>
>>>> For any new create/link/rename, etc. requests that use the same
>>>> file names, we must wait for the first reply of the inflight unlink
>>>> request, or the MDS may fail the subsequent requests with -EEXIST
>>>> if the inflight async unlink request was delayed for some reason.
>>>>
>>>> And the worst case is that for a non-async openc request it will
>>>> successfully open the file if the CDentry hasn't been unlinked yet,
>>>> but later the previously delayed async unlink request will remove
>>>> the CDentry. That means the just-created file may be deleted later
>>>> by accident.
>>>>
>>>> We need to wait for the inflight async unlink requests to finish
>>>> when creating new files/directories that use the same file names.
>>>>
>>>> URL: https://tracker.ceph.com/issues/55332
>>>> Signed-off-by: Xiubo Li <xiubli@redhat.com>
>>>> ---
>>>>    fs/ceph/dir.c        | 55 +++++++++++++++++++++++++++++++----
>>>>    fs/ceph/file.c       |  5 ++++
>>>>    fs/ceph/mds_client.c | 69 ++++++++++++++++++++++++++++++++++++++++++++
>>>>    fs/ceph/mds_client.h |  1 +
>>>>    fs/ceph/super.c      |  2 ++
>>>>    fs/ceph/super.h      | 18 ++++++++----
>>>>    6 files changed, 140 insertions(+), 10 deletions(-)
>>>>
>>>> diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
>>>> index eae417d71136..20c648406528 100644
>>>> --- a/fs/ceph/dir.c
>>>> +++ b/fs/ceph/dir.c
>>>> @@ -856,6 +856,10 @@ static int ceph_mknod(struct user_namespace *mnt_userns, struct inode *dir,
>>>>    	if (ceph_snap(dir) != CEPH_NOSNAP)
>>>>    		return -EROFS;
>>>>    
>>>> +	err = ceph_wait_on_conflict_unlink(dentry);
>>>> +	if (err)
>>>> +		return err;
>>>> +
>>>>    	if (ceph_quota_is_max_files_exceeded(dir)) {
>>>>    		err = -EDQUOT;
>>>>    		goto out;
>>>> @@ -918,6 +922,10 @@ static int ceph_symlink(struct user_namespace *mnt_userns, struct inode *dir,
>>>>    	if (ceph_snap(dir) != CEPH_NOSNAP)
>>>>    		return -EROFS;
>>>>    
>>>> +	err = ceph_wait_on_conflict_unlink(dentry);
>>>> +	if (err)
>>>> +		return err;
>>>> +
>>>>    	if (ceph_quota_is_max_files_exceeded(dir)) {
>>>>    		err = -EDQUOT;
>>>>    		goto out;
>>>> @@ -968,9 +976,13 @@ static int ceph_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
>>>>    	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
>>>>    	struct ceph_mds_request *req;
>>>>    	struct ceph_acl_sec_ctx as_ctx = {};
>>>> -	int err = -EROFS;
>>>> +	int err;
>>>>    	int op;
>>>>    
>>>> +	err = ceph_wait_on_conflict_unlink(dentry);
>>>> +	if (err)
>>>> +		return err;
>>>> +
>>>>    	if (ceph_snap(dir) == CEPH_SNAPDIR) {
>>>>    		/* mkdir .snap/foo is a MKSNAP */
>>>>    		op = CEPH_MDS_OP_MKSNAP;
>>>> @@ -980,6 +992,7 @@ static int ceph_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
>>>>    		dout("mkdir dir %p dn %p mode 0%ho\n", dir, dentry, mode);
>>>>    		op = CEPH_MDS_OP_MKDIR;
>>>>    	} else {
>>>> +		err = -EROFS;
>>>>    		goto out;
>>>>    	}
>>>>    
>>>> @@ -1037,6 +1050,10 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
>>>>    	struct ceph_mds_request *req;
>>>>    	int err;
>>>>    
>>>> +	err = ceph_wait_on_conflict_unlink(dentry);
>>>> +	if (err)
>>>> +		return err;
>>>> +
>>>>    	if (ceph_snap(dir) != CEPH_NOSNAP)
>>>>    		return -EROFS;
>>>>    
>>>> @@ -1071,9 +1088,24 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
>>>>    static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
>>>>    				 struct ceph_mds_request *req)
>>>>    {
>>>> +	struct dentry *dentry = req->r_dentry;
>>>> +	struct ceph_dentry_info *di = ceph_dentry(dentry);
>>>>    	int result = req->r_err ? req->r_err :
>>>>    			le32_to_cpu(req->r_reply_info.head->result);
>>>>    
>>>> +	if (test_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags)) {
>>>> +		BUG_ON(req->r_op != CEPH_MDS_OP_UNLINK);
>>>> +
>>>> +		hash_del_rcu(&di->hnode);
>>>> +
>>>> +		spin_lock(&dentry->d_lock);
>>>> +		di->flags &= ~CEPH_DENTRY_ASYNC_UNLINK;
>>>> +		wake_up_bit(&di->flags, CEPH_DENTRY_ASYNC_UNLINK_BIT);
>>>> +		spin_unlock(&dentry->d_lock);
>>>> +
>>>> +		synchronize_rcu();
>>>> +	}
>>>> +
>>>>    	if (result == -EJUKEBOX)
>>>>    		goto out;
>>>>    
>>>> @@ -1081,7 +1113,7 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
>>>>    	if (result) {
>>>>    		int pathlen = 0;
>>>>    		u64 base = 0;
>>>> -		char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
>>>> +		char *path = ceph_mdsc_build_path(dentry, &pathlen,
>>>>    						  &base, 0);
>>>>    
>>>>    		/* mark error on parent + clear complete */
>>>> @@ -1089,13 +1121,13 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
>>>>    		ceph_dir_clear_complete(req->r_parent);
>>>>    
>>>>    		/* drop the dentry -- we don't know its status */
>>>> -		if (!d_unhashed(req->r_dentry))
>>>> -			d_drop(req->r_dentry);
>>>> +		if (!d_unhashed(dentry))
>>>> +			d_drop(dentry);
>>>>    
>>>>    		/* mark inode itself for an error (since metadata is bogus) */
>>>>    		mapping_set_error(req->r_old_inode->i_mapping, result);
>>>>    
>>>> -		pr_warn("ceph: async unlink failure path=(%llx)%s result=%d!\n",
>>>> +		pr_warn("async unlink failure path=(%llx)%s result=%d!\n",
>>>>    			base, IS_ERR(path) ? "<<bad>>" : path, result);
>>>>    		ceph_mdsc_free_path(path, pathlen);
>>>>    	}
>>>> @@ -1189,12 +1221,21 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
>>>>    		ihold(req->r_old_inode);
>>>>    		err = ceph_mdsc_submit_request(mdsc, dir, req);
>>>>    		if (!err) {
>>>> +			struct ceph_dentry_info *di;
>>>> +
>>>>    			/*
>>>>    			 * We have enough caps, so we assume that the unlink
>>>>    			 * will succeed. Fix up the target inode and dcache.
>>>>    			 */
>>>>    			drop_nlink(inode);
>>>>    			d_delete(dentry);
>>>> +
>>>> +			spin_lock(&dentry->d_lock);
>>>> +			di = ceph_dentry(dentry);
>>>> +			di->flags |= CEPH_DENTRY_ASYNC_UNLINK;
>>>> +			hash_add_rcu(fsc->async_unlink_conflict, &di->hnode,
>>>> +				     dentry->d_name.hash);
>>>> +			spin_unlock(&dentry->d_lock);
>>> This looks racy. It's possible that the reply comes in before we get to
>>> the point of setting this flag. You probably want to do this before
>>> calling ceph_mdsc_submit_request, and just unwind it if the submission
>>> fails.
>> Ah, right. Will fix it.
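
Maybe something like this? Just a sketch -- the unwind path is my
reading of what you mean, and the hash mutation would also take the
dedicated lock mentioned below:

	spin_lock(&dentry->d_lock);
	di->flags |= CEPH_DENTRY_ASYNC_UNLINK;
	hash_add_rcu(fsc->async_unlink_conflict, &di->hnode,
		     dentry->d_name.hash);
	spin_unlock(&dentry->d_lock);

	err = ceph_mdsc_submit_request(mdsc, dir, req);
	if (err) {
		/* the request never went out, so drop the conflict entry */
		spin_lock(&dentry->d_lock);
		hash_del_rcu(&di->hnode);
		di->flags &= ~CEPH_DENTRY_ASYNC_UNLINK;
		spin_unlock(&dentry->d_lock);
	}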
>>
>>
>>> Also, you do still need some sort of lock to protect the
>>> hash_add/del/_rcu calls.
>> Sure, will fix it too.
>>
>>>    Those don't do any locking on their own. The
>>> d_lock is insufficient here since it can't protect the whole list. You
>>> may be able to use the i_ceph_lock of the parent though?
>> The hashtable is shared by all inodes in the sb, so we can't use the
>> i_ceph_lock here. I will add a dedicated spinlock for each sb.
>>
>>>>    		} else if (err == -EJUKEBOX) {
>>>>    			try_async = false;
>>>>    			ceph_mdsc_put_request(req);
>>>> @@ -1237,6 +1278,10 @@ static int ceph_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
>>>>    	    (!ceph_quota_is_same_realm(old_dir, new_dir)))
>>>>    		return -EXDEV;
>>>>    
>>>> +	err = ceph_wait_on_conflict_unlink(new_dentry);
>>>> +	if (err)
>>>> +		return err;
>>>> +
>>>>    	dout("rename dir %p dentry %p to dir %p dentry %p\n",
>>>>    	     old_dir, old_dentry, new_dir, new_dentry);
>>>>    	req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
>>>> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
>>>> index 8c8226c0feac..47d068e6436a 100644
>>>> --- a/fs/ceph/file.c
>>>> +++ b/fs/ceph/file.c
>>>> @@ -740,6 +740,10 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
>>>>    	if (dentry->d_name.len > NAME_MAX)
>>>>    		return -ENAMETOOLONG;
>>>>    
>>>> +	err = ceph_wait_on_conflict_unlink(dentry);
>>>> +	if (err)
>>>> +		return err;
>>>> +
>>> What might be nice here eventually is to not block an async create here,
>>> but instead queue the request so that it gets transmitted after the
>>> async unlink reply comes in.
>>>
>>> That'll be hard to get right though, so this is fine for now.
>> Sure.
>>
>>>>    	if (flags & O_CREAT) {
>>>>    		if (ceph_quota_is_max_files_exceeded(dir))
>>>>    			return -EDQUOT;
>>>> @@ -757,6 +761,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
>>>>    		/* If it's not being looked up, it's negative */
>>>>    		return -ENOENT;
>>>>    	}
>>>> +
>>>>    retry:
>>>>    	/* do the open */
>>>>    	req = prepare_open_request(dir->i_sb, flags, mode);
>>>> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
>>>> index e8c87dea0551..0ae0e0110eb4 100644
>>>> --- a/fs/ceph/mds_client.c
>>>> +++ b/fs/ceph/mds_client.c
>>>> @@ -468,6 +468,75 @@ static int ceph_parse_deleg_inos(void **p, void *end,
>>>>    	return -EIO;
>>>>    }
>>>>    
>>>> +/*
>>>> + * In the async unlink case the kclient won't wait for the first reply
>>>> + * from the MDS; it just drops all the links, unhashes the dentry and
>>>> + * then succeeds immediately.
>>>> + *
>>>> + * For any new create/link/rename, etc. requests that use the same
>>>> + * file names, we must wait for the first reply of the inflight unlink
>>>> + * request, or the MDS may fail the subsequent requests with -EEXIST
>>>> + * if the inflight async unlink request was delayed for some reason.
>>>> + *
>>>> + * And the worst case is that for a non-async openc request it will
>>>> + * successfully open the file if the CDentry hasn't been unlinked yet,
>>>> + * but later the previously delayed async unlink request will remove
>>>> + * the CDentry. That means the just-created file may be deleted later
>>>> + * by accident.
>>>> + *
>>>> + * We need to wait for the inflight async unlink requests to finish
>>>> + * when creating new files/directories that use the same file names.
>>>> + */
>>>> +int ceph_wait_on_conflict_unlink(struct dentry *dentry)
>>>> +{
>>>> +	struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
>>>> +	struct dentry *pdentry = dentry->d_parent;
>>>> +	struct dentry *udentry, *found = NULL;
>>>> +	struct ceph_dentry_info *di;
>>>> +	struct qstr dname;
>>>> +	u32 hash = dentry->d_name.hash;
>>>> +	int err;
>>>> +
>>>> +	dname.name = dentry->d_name.name;
>>>> +	dname.len = dentry->d_name.len;
>>>> +
>>>> +	rcu_read_lock();
>>>> +	hash_for_each_possible_rcu(fsc->async_unlink_conflict, di,
>>>> +				   hnode, hash) {
>>>> +		udentry = di->dentry;
>>>> +
>>>> +		spin_lock(&udentry->d_lock);
>>>> +		if (udentry->d_name.hash != hash)
>>>> +			goto next;
>>>> +		if (unlikely(udentry->d_parent != pdentry))
>>>> +			goto next;
>>>> +		if (!hash_hashed(&di->hnode))
>>>> +			goto next;
>>>> +
>>>> +		if (!test_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags))
>>>> +			goto next;
>>>> +
>>> Maybe this should be a warning? Will we ever have entries in this
>>> hashtable that don't have this bit set?
>> Just before we take "spin_lock(&udentry->d_lock)" the udentry could
>> already have been removed from the hashtable and the bit cleared?
>>
>>
>
> The point is that you're removing the dentry from the hash before you
> clear the flag, so there should never be a dentry in the hash that has
> the flag cleared.
>
Yeah, makes sense. Will make it a warning here.
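
I.e. turn the silent skip into something like (sketch):

	if (!test_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags)) {
		/* hashed entry should always have the flag set */
		pr_warn("%s: dentry %p hashed without ASYNC_UNLINK\n",
			__func__, udentry);
		goto next;
	}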

-- Xiubo


>>>> +		if (d_compare(pdentry, udentry, &dname))
>>>> +			goto next;
>>>> +
>>>> +		spin_unlock(&udentry->d_lock);
>>>> +		found = dget(udentry);
>>>> +		break;
>>>> +next:
>>>> +		spin_unlock(&udentry->d_lock);
>>>> +	}
>>>> +	rcu_read_unlock();
>>>> +
>>>> +	if (likely(!found))
>>>> +		return 0;
>>>> +
>>>> +	err = wait_on_bit(&di->flags, CEPH_DENTRY_ASYNC_UNLINK_BIT,
>>>> +			  TASK_INTERRUPTIBLE);
>>>> +	dput(found);
>>>> +	return err;
>>>> +}
>>>> +
>>>>    u64 ceph_get_deleg_ino(struct ceph_mds_session *s)
>>>>    {
>>>>    	unsigned long ino;
>>>> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
>>>> index 33497846e47e..d1ae679c52c3 100644
>>>> --- a/fs/ceph/mds_client.h
>>>> +++ b/fs/ceph/mds_client.h
>>>> @@ -582,6 +582,7 @@ static inline int ceph_wait_on_async_create(struct inode *inode)
>>>>    			   TASK_INTERRUPTIBLE);
>>>>    }
>>>>    
>>>> +extern int ceph_wait_on_conflict_unlink(struct dentry *dentry);
>>>>    extern u64 ceph_get_deleg_ino(struct ceph_mds_session *session);
>>>>    extern int ceph_restore_deleg_ino(struct ceph_mds_session *session, u64 ino);
>>>>    #endif
>>>> diff --git a/fs/ceph/super.c b/fs/ceph/super.c
>>>> index b73b4f75462c..7ae65001f04c 100644
>>>> --- a/fs/ceph/super.c
>>>> +++ b/fs/ceph/super.c
>>>> @@ -816,6 +816,8 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
>>>>    	if (!fsc->cap_wq)
>>>>    		goto fail_inode_wq;
>>>>    
>>>> +	hash_init(fsc->async_unlink_conflict);
>>>> +
>>>>    	spin_lock(&ceph_fsc_lock);
>>>>    	list_add_tail(&fsc->metric_wakeup, &ceph_fsc_list);
>>>>    	spin_unlock(&ceph_fsc_lock);
>>>> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
>>>> index 506d52633627..58bbb5df42da 100644
>>>> --- a/fs/ceph/super.h
>>>> +++ b/fs/ceph/super.h
>>>> @@ -19,6 +19,7 @@
>>>>    #include <linux/security.h>
>>>>    #include <linux/netfs.h>
>>>>    #include <linux/fscache.h>
>>>> +#include <linux/hashtable.h>
>>>>    
>>>>    #include <linux/ceph/libceph.h>
>>>>    
>>>> @@ -99,6 +100,8 @@ struct ceph_mount_options {
>>>>    	char *mon_addr;
>>>>    };
>>>>    
>>>> +#define CEPH_ASYNC_CREATE_CONFLICT_BITS 12
>>>> +
>>>>    struct ceph_fs_client {
>>>>    	struct super_block *sb;
>>>>    
>>>> @@ -124,6 +127,8 @@ struct ceph_fs_client {
>>>>    	struct workqueue_struct *inode_wq;
>>>>    	struct workqueue_struct *cap_wq;
>>>>    
>>>> +	DECLARE_HASHTABLE(async_unlink_conflict, CEPH_ASYNC_CREATE_CONFLICT_BITS);
>>>> +
>>>>    #ifdef CONFIG_DEBUG_FS
>>>>    	struct dentry *debugfs_dentry_lru, *debugfs_caps;
>>>>    	struct dentry *debugfs_congestion_kb;
>>>> @@ -281,7 +286,8 @@ struct ceph_dentry_info {
>>>>    	struct dentry *dentry;
>>>>    	struct ceph_mds_session *lease_session;
>>>>    	struct list_head lease_list;
>>>> -	unsigned flags;
>>>> +	struct hlist_node hnode;
>>>> +	unsigned long flags;
>>>>    	int lease_shared_gen;
>>>>    	u32 lease_gen;
>>>>    	u32 lease_seq;
>>>> @@ -290,10 +296,12 @@ struct ceph_dentry_info {
>>>>    	u64 offset;
>>>>    };
>>>>    
>>>> -#define CEPH_DENTRY_REFERENCED		1
>>>> -#define CEPH_DENTRY_LEASE_LIST		2
>>>> -#define CEPH_DENTRY_SHRINK_LIST		4
>>>> -#define CEPH_DENTRY_PRIMARY_LINK	8
>>>> +#define CEPH_DENTRY_REFERENCED		(1 << 0)
>>>> +#define CEPH_DENTRY_LEASE_LIST		(1 << 1)
>>>> +#define CEPH_DENTRY_SHRINK_LIST		(1 << 2)
>>>> +#define CEPH_DENTRY_PRIMARY_LINK	(1 << 3)
>>>> +#define CEPH_DENTRY_ASYNC_UNLINK_BIT	(4)
>>>> +#define CEPH_DENTRY_ASYNC_UNLINK	(1 << CEPH_DENTRY_ASYNC_UNLINK_BIT)
>>>>    
>>>>    struct ceph_inode_xattrs_info {
>>>>    	/*


^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2022-05-17  0:48 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-05-16 12:20 [PATCH 0/2] ceph: wait async unlink to finish Xiubo Li
2022-05-16 12:20 ` [PATCH 1/2] fs/dcache: add d_compare() helper support Xiubo Li
2022-05-16 13:12   ` Jeff Layton
2022-05-16 12:20 ` [PATCH 2/2] ceph: wait the first reply of inflight unlink/rmdir Xiubo Li
2022-05-16 13:23   ` Jeff Layton
2022-05-16 14:56     ` Xiubo Li
2022-05-16 16:58       ` Jeff Layton
2022-05-17  0:48         ` Xiubo Li
2022-05-16 21:05   ` kernel test robot
2022-05-16 23:18   ` kernel test robot
