ceph-devel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 0/6] ceph: better error handling for failed async creates
@ 2021-09-08 13:03 Jeff Layton
  2021-09-08 13:03 ` [PATCH 1/6] ceph: print inode numbers instead of pointer values Jeff Layton
                   ` (5 more replies)
  0 siblings, 6 replies; 7+ messages in thread
From: Jeff Layton @ 2021-09-08 13:03 UTC (permalink / raw)
  To: ceph-devel; +Cc: idryomov, pdonnell

We've had an ongoing problem with hung umounts in teuthology, the start
of which seems to coincide with us starting to test "-o nowsync" more
regularly. Recently, we were able to get better debug output, and saw
that in at least one case, the issue was that the MDS started returning
-ENOSPC on an async create attempt, which left the client trying to
flush caps for an inode that never existed.

This patchset adds a new mechanism for "shutting down" inodes after the
fact. If an async create fails, we already d_drop the dentry associated
with it. This situation in this case is somewhat similar to the case
where we have dirty inodes at forced umount time, so the idea is to
extend the infrastructure that handles that case to also handle inodes
that failed create too.

There are also some cleanup/bugfix patches in here. This was tested
using a fault injection patch on the MDS that makes it start rejecting
openc calls with -ENOSPC.

Jeff Layton (6):
  ceph: print inode numbers instead of pointer values
  ceph: don't use -ESTALE as special return code in try_get_cap_refs
  ceph: drop private list from remove_session_caps_cb
  ceph: fix auth cap handling logic in remove_session_caps_cb
  ceph: refactor remove_session_caps_cb
  ceph: shut down access to inode when async create fails

 fs/ceph/addr.c       |  16 +++--
 fs/ceph/caps.c       | 150 ++++++++++++++++++++++++++++++++++++++-----
 fs/ceph/export.c     |  12 +++-
 fs/ceph/file.c       |  12 +++-
 fs/ceph/inode.c      |  39 +++++++++--
 fs/ceph/locks.c      |   6 ++
 fs/ceph/mds_client.c | 112 +-------------------------------
 fs/ceph/super.h      |  12 ++++
 8 files changed, 220 insertions(+), 139 deletions(-)

-- 
2.31.1


^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH 1/6] ceph: print inode numbers instead of pointer values
  2021-09-08 13:03 [PATCH 0/6] ceph: better error handling for failed async creates Jeff Layton
@ 2021-09-08 13:03 ` Jeff Layton
  2021-09-08 13:03 ` [PATCH 2/6] ceph: don't use -ESTALE as special return code in try_get_cap_refs Jeff Layton
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: Jeff Layton @ 2021-09-08 13:03 UTC (permalink / raw)
  To: ceph-devel; +Cc: idryomov, pdonnell

We have a lot of log messages that print inode pointer values. This is
of dubious utility. Switch a random assortment of the ones I've found
most useful to use ceph_vinop to print the snap:inum tuple instead.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/ceph/caps.c  | 8 ++++----
 fs/ceph/file.c  | 2 +-
 fs/ceph/inode.c | 6 +++---
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 26be19d23ed6..b89c23a0b440 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1969,8 +1969,8 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 		}
 	}
 
-	dout("check_caps %p file_want %s used %s dirty %s flushing %s"
-	     " issued %s revoking %s retain %s %s%s\n", inode,
+	dout("check_caps %llx:%llx file_want %s used %s dirty %s flushing %s"
+	     " issued %s revoking %s retain %s %s%s\n", ceph_vinop(inode),
 	     ceph_cap_string(file_wanted),
 	     ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
 	     ceph_cap_string(ci->i_flushing_caps),
@@ -1991,7 +1991,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 	    (revoking & (CEPH_CAP_FILE_CACHE|
 			 CEPH_CAP_FILE_LAZYIO)) && /*  or revoking cache */
 	    !tried_invalidate) {
-		dout("check_caps trying to invalidate on %p\n", inode);
+		dout("check_caps trying to invalidate on %llx:%llx\n", ceph_vinop(inode));
 		if (try_nonblocking_invalidate(inode) < 0) {
 			dout("check_caps queuing invalidate\n");
 			queue_invalidate = true;
@@ -4334,7 +4334,7 @@ static void flush_dirty_session_caps(struct ceph_mds_session *s)
 				      i_dirty_item);
 		inode = &ci->vfs_inode;
 		ihold(inode);
-		dout("flush_dirty_caps %p\n", inode);
+		dout("flush_dirty_caps %llx:%llx\n", ceph_vinop(inode));
 		spin_unlock(&mdsc->cap_dirty_lock);
 		ceph_check_caps(ci, CHECK_CAPS_FLUSH, NULL);
 		iput(inode);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 3daebfaec8c6..175366eede23 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -557,7 +557,7 @@ static void ceph_async_create_cb(struct ceph_mds_client *mdsc,
 		}
 		ceph_kick_flushing_inode_caps(req->r_session, ci);
 		spin_unlock(&ci->i_ceph_lock);
-	} else {
+	} else if (!result) {
 		pr_warn("%s: no req->r_target_inode for 0x%llx\n", __func__,
 			req->r_deleg_ino);
 	}
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 61ecf81ed875..43c02c4b2631 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1851,8 +1851,8 @@ static void ceph_do_invalidate_pages(struct inode *inode)
 	mutex_lock(&ci->i_truncate_mutex);
 
 	if (READ_ONCE(fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) {
-		pr_warn_ratelimited("invalidate_pages %p %lld forced umount\n",
-				    inode, ceph_ino(inode));
+		pr_warn_ratelimited("%s: inode %llx:%llx is shut down\n",
+				    __func__, ceph_vinop(inode));
 		mapping_set_error(inode->i_mapping, -EIO);
 		truncate_pagecache(inode, 0);
 		mutex_unlock(&ci->i_truncate_mutex);
@@ -1874,7 +1874,7 @@ static void ceph_do_invalidate_pages(struct inode *inode)
 
 	ceph_fscache_invalidate(inode);
 	if (invalidate_inode_pages2(inode->i_mapping) < 0) {
-		pr_err("invalidate_pages %p fails\n", inode);
+		pr_err("invalidate_pages %llx:%llx failed\n", ceph_vinop(inode));
 	}
 
 	spin_lock(&ci->i_ceph_lock);
-- 
2.31.1


^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH 2/6] ceph: don't use -ESTALE as special return code in try_get_cap_refs
  2021-09-08 13:03 [PATCH 0/6] ceph: better error handling for failed async creates Jeff Layton
  2021-09-08 13:03 ` [PATCH 1/6] ceph: print inode numbers instead of pointer values Jeff Layton
@ 2021-09-08 13:03 ` Jeff Layton
  2021-09-08 13:03 ` [PATCH 3/6] ceph: drop private list from remove_session_caps_cb Jeff Layton
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: Jeff Layton @ 2021-09-08 13:03 UTC (permalink / raw)
  To: ceph-devel; +Cc: idryomov, pdonnell

In some cases, we may want to return -ESTALE if it ends up that we're
dealing with an inode that no longer exists. Switch to using -EUCLEAN as
the "special" error return.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/ceph/caps.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index b89c23a0b440..f8307d70674b 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2636,9 +2636,9 @@ void ceph_take_cap_refs(struct ceph_inode_info *ci, int got,
  *
  * Returns 0 if caps were not able to be acquired (yet), 1 if succeed,
  * or a negative error code. There are 3 speical error codes:
- *  -EAGAIN: need to sleep but non-blocking is specified
- *  -EFBIG:  ask caller to call check_max_size() and try again.
- *  -ESTALE: ask caller to call ceph_renew_caps() and try again.
+ *  -EAGAIN:  need to sleep but non-blocking is specified
+ *  -EFBIG:   ask caller to call check_max_size() and try again.
+ *  -EUCLEAN: ask caller to call ceph_renew_caps() and try again.
  */
 enum {
 	/* first 8 bits are reserved for CEPH_FILE_MODE_FOO */
@@ -2686,7 +2686,7 @@ static int try_get_cap_refs(struct inode *inode, int need, int want,
 			dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
 			     inode, endoff, ci->i_max_size);
 			if (endoff > ci->i_requested_max_size)
-				ret = ci->i_auth_cap ? -EFBIG : -ESTALE;
+				ret = ci->i_auth_cap ? -EFBIG : -EUCLEAN;
 			goto out_unlock;
 		}
 		/*
@@ -2766,7 +2766,7 @@ static int try_get_cap_refs(struct inode *inode, int need, int want,
 			dout("get_cap_refs %p need %s > mds_wanted %s\n",
 			     inode, ceph_cap_string(need),
 			     ceph_cap_string(mds_wanted));
-			ret = -ESTALE;
+			ret = -EUCLEAN;
 			goto out_unlock;
 		}
 
@@ -2850,7 +2850,7 @@ int ceph_try_get_caps(struct inode *inode, int need, int want,
 
 	ret = try_get_cap_refs(inode, need, want, 0, flags, got);
 	/* three special error codes */
-	if (ret == -EAGAIN || ret == -EFBIG || ret == -ESTALE)
+	if (ret == -EAGAIN || ret == -EFBIG || ret == -EUCLEAN)
 		ret = 0;
 	return ret;
 }
@@ -2933,7 +2933,7 @@ int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got
 		}
 
 		if (ret < 0) {
-			if (ret == -EFBIG || ret == -ESTALE) {
+			if (ret == -EFBIG || ret == -EUCLEAN) {
 				int ret2 = ceph_wait_on_async_create(inode);
 				if (ret2 < 0)
 					return ret2;
@@ -2942,7 +2942,7 @@ int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got
 				check_max_size(inode, endoff);
 				continue;
 			}
-			if (ret == -ESTALE) {
+			if (ret == -EUCLEAN) {
 				/* session was killed, try renew caps */
 				ret = ceph_renew_caps(inode, flags);
 				if (ret == 0)
-- 
2.31.1


^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH 3/6] ceph: drop private list from remove_session_caps_cb
  2021-09-08 13:03 [PATCH 0/6] ceph: better error handling for failed async creates Jeff Layton
  2021-09-08 13:03 ` [PATCH 1/6] ceph: print inode numbers instead of pointer values Jeff Layton
  2021-09-08 13:03 ` [PATCH 2/6] ceph: don't use -ESTALE as special return code in try_get_cap_refs Jeff Layton
@ 2021-09-08 13:03 ` Jeff Layton
  2021-09-08 13:03 ` [PATCH 4/6] ceph: fix auth cap handling logic in remove_session_caps_cb Jeff Layton
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: Jeff Layton @ 2021-09-08 13:03 UTC (permalink / raw)
  To: ceph-devel; +Cc: idryomov, pdonnell

This function does a lot of list-shuffling with cap flushes, all to
avoid possibly freeing a slab allocation under spinlock (which is
totally ok).  Simplify the code by just detaching and freeing the cap
flushes in place.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/ceph/mds_client.c | 26 ++++++++++----------------
 1 file changed, 10 insertions(+), 16 deletions(-)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index b41d62f08c0a..2ff4e6481d09 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1626,7 +1626,6 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
 	struct ceph_fs_client *fsc = (struct ceph_fs_client *)arg;
 	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	LIST_HEAD(to_remove);
 	bool dirty_dropped = false;
 	bool invalidate = false;
 	int capsnap_release = 0;
@@ -1645,16 +1644,17 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
 				mapping_set_error(&inode->i_data, -EIO);
 		}
 
+		spin_lock(&mdsc->cap_dirty_lock);
+
+		/* trash all of the cap flushes for this inode */
 		while (!list_empty(&ci->i_cap_flush_list)) {
 			cf = list_first_entry(&ci->i_cap_flush_list,
 					      struct ceph_cap_flush, i_list);
-			list_move(&cf->i_list, &to_remove);
-		}
-
-		spin_lock(&mdsc->cap_dirty_lock);
-
-		list_for_each_entry(cf, &to_remove, i_list)
 			list_del_init(&cf->g_list);
+			list_del_init(&cf->i_list);
+			if (!cf->is_capsnap)
+				ceph_free_cap_flush(cf);
+		}
 
 		if (!list_empty(&ci->i_dirty_item)) {
 			pr_warn_ratelimited(
@@ -1697,22 +1697,16 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
 		}
 
 		if (!ci->i_dirty_caps && ci->i_prealloc_cap_flush) {
-			list_add(&ci->i_prealloc_cap_flush->i_list, &to_remove);
+			cf = ci->i_prealloc_cap_flush;
 			ci->i_prealloc_cap_flush = NULL;
+			if (!cf->is_capsnap)
+				ceph_free_cap_flush(cf);
 		}
 
 		if (!list_empty(&ci->i_cap_snaps))
 			capsnap_release = remove_capsnaps(mdsc, inode);
 	}
 	spin_unlock(&ci->i_ceph_lock);
-	while (!list_empty(&to_remove)) {
-		struct ceph_cap_flush *cf;
-		cf = list_first_entry(&to_remove,
-				      struct ceph_cap_flush, i_list);
-		list_del_init(&cf->i_list);
-		if (!cf->is_capsnap)
-			ceph_free_cap_flush(cf);
-	}
 
 	wake_up_all(&ci->i_cap_wq);
 	if (invalidate)
-- 
2.31.1


^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH 4/6] ceph: fix auth cap handling logic in remove_session_caps_cb
  2021-09-08 13:03 [PATCH 0/6] ceph: better error handling for failed async creates Jeff Layton
                   ` (2 preceding siblings ...)
  2021-09-08 13:03 ` [PATCH 3/6] ceph: drop private list from remove_session_caps_cb Jeff Layton
@ 2021-09-08 13:03 ` Jeff Layton
  2021-09-08 13:03 ` [PATCH 5/6] ceph: refactor remove_session_caps_cb Jeff Layton
  2021-09-08 13:03 ` [PATCH 6/6] ceph: shut down access to inode when async create fails Jeff Layton
  5 siblings, 0 replies; 7+ messages in thread
From: Jeff Layton @ 2021-09-08 13:03 UTC (permalink / raw)
  To: ceph-devel; +Cc: idryomov, pdonnell

The existing logic relies on ci->i_auth_cap being NULL, but if we end up
removing the auth cap early, then we'll do a lot of useless work and
lock-taking on the remaining caps. Ensure that we only do the auth cap
removal when we're _actually_ removing the auth cap.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/ceph/mds_client.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 2ff4e6481d09..4b3ef0412539 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1626,6 +1626,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
 	struct ceph_fs_client *fsc = (struct ceph_fs_client *)arg;
 	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_inode_info *ci = ceph_inode(inode);
+	bool is_auth;
 	bool dirty_dropped = false;
 	bool invalidate = false;
 	int capsnap_release = 0;
@@ -1633,8 +1634,9 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
 	dout("removing cap %p, ci is %p, inode is %p\n",
 	     cap, ci, &ci->vfs_inode);
 	spin_lock(&ci->i_ceph_lock);
+	is_auth = (cap == ci->i_auth_cap);
 	__ceph_remove_cap(cap, false);
-	if (!ci->i_auth_cap) {
+	if (is_auth) {
 		struct ceph_cap_flush *cf;
 
 		if (READ_ONCE(fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) {
-- 
2.31.1


^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH 5/6] ceph: refactor remove_session_caps_cb
  2021-09-08 13:03 [PATCH 0/6] ceph: better error handling for failed async creates Jeff Layton
                   ` (3 preceding siblings ...)
  2021-09-08 13:03 ` [PATCH 4/6] ceph: fix auth cap handling logic in remove_session_caps_cb Jeff Layton
@ 2021-09-08 13:03 ` Jeff Layton
  2021-09-08 13:03 ` [PATCH 6/6] ceph: shut down access to inode when async create fails Jeff Layton
  5 siblings, 0 replies; 7+ messages in thread
From: Jeff Layton @ 2021-09-08 13:03 UTC (permalink / raw)
  To: ceph-devel; +Cc: idryomov, pdonnell

Move remove_capsnaps to caps.c. Move the part of remove_session_caps_cb
under i_ceph_lock into a separate function that lives in caps.c. Have
remove_session_caps_cb call the new helper after taking the lock.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/ceph/caps.c       | 116 +++++++++++++++++++++++++++++++++++++++++++
 fs/ceph/mds_client.c | 108 ++--------------------------------------
 fs/ceph/super.h      |   1 +
 3 files changed, 120 insertions(+), 105 deletions(-)

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index f8307d70674b..bc17515a11c9 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -4579,3 +4579,119 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry,
 	spin_unlock(&dentry->d_lock);
 	return ret;
 }
+
+static int remove_capsnaps(struct ceph_mds_client *mdsc, struct inode *inode)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_cap_snap *capsnap;
+	int capsnap_release = 0;
+
+	lockdep_assert_held(&ci->i_ceph_lock);
+
+	dout("removing capsnaps, ci is %p, inode is %p\n", ci, inode);
+
+	while (!list_empty(&ci->i_cap_snaps)) {
+		capsnap = list_first_entry(&ci->i_cap_snaps,
+					   struct ceph_cap_snap, ci_item);
+		__ceph_remove_capsnap(inode, capsnap, NULL, NULL);
+		ceph_put_snap_context(capsnap->context);
+		ceph_put_cap_snap(capsnap);
+		capsnap_release++;
+	}
+	wake_up_all(&ci->i_cap_wq);
+	wake_up_all(&mdsc->cap_flushing_wq);
+	return capsnap_release;
+}
+
+int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invalidate)
+{
+	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	bool is_auth;
+	bool dirty_dropped = false;
+	int iputs = 0;
+
+	lockdep_assert_held(&ci->i_ceph_lock);
+
+	dout("removing cap %p, ci is %p, inode is %p\n",
+	     cap, ci, &ci->vfs_inode);
+
+	is_auth = (cap == ci->i_auth_cap);
+	__ceph_remove_cap(cap, false);
+	if (is_auth) {
+		struct ceph_cap_flush *cf;
+
+		if (READ_ONCE(fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) {
+			if (inode->i_data.nrpages > 0)
+				*invalidate = true;
+			if (ci->i_wrbuffer_ref > 0)
+				mapping_set_error(&inode->i_data, -EIO);
+		}
+
+		spin_lock(&mdsc->cap_dirty_lock);
+
+		/* trash all of the cap flushes for this inode */
+		while (!list_empty(&ci->i_cap_flush_list)) {
+			cf = list_first_entry(&ci->i_cap_flush_list,
+					      struct ceph_cap_flush, i_list);
+			list_del_init(&cf->g_list);
+			list_del_init(&cf->i_list);
+			if (!cf->is_capsnap)
+				ceph_free_cap_flush(cf);
+		}
+
+		if (!list_empty(&ci->i_dirty_item)) {
+			pr_warn_ratelimited(
+				" dropping dirty %s state for %p %lld\n",
+				ceph_cap_string(ci->i_dirty_caps),
+				inode, ceph_ino(inode));
+			ci->i_dirty_caps = 0;
+			list_del_init(&ci->i_dirty_item);
+			dirty_dropped = true;
+		}
+		if (!list_empty(&ci->i_flushing_item)) {
+			pr_warn_ratelimited(
+				" dropping dirty+flushing %s state for %p %lld\n",
+				ceph_cap_string(ci->i_flushing_caps),
+				inode, ceph_ino(inode));
+			ci->i_flushing_caps = 0;
+			list_del_init(&ci->i_flushing_item);
+			mdsc->num_cap_flushing--;
+			dirty_dropped = true;
+		}
+		spin_unlock(&mdsc->cap_dirty_lock);
+
+		if (dirty_dropped) {
+			errseq_set(&ci->i_meta_err, -EIO);
+
+			if (ci->i_wrbuffer_ref_head == 0 &&
+			    ci->i_wr_ref == 0 &&
+			    ci->i_dirty_caps == 0 &&
+			    ci->i_flushing_caps == 0) {
+				ceph_put_snap_context(ci->i_head_snapc);
+				ci->i_head_snapc = NULL;
+			}
+		}
+
+		if (atomic_read(&ci->i_filelock_ref) > 0) {
+			/* make further file lock syscall return -EIO */
+			ci->i_ceph_flags |= CEPH_I_ERROR_FILELOCK;
+			pr_warn_ratelimited(" dropping file locks for %p %lld\n",
+					    inode, ceph_ino(inode));
+		}
+
+		if (!ci->i_dirty_caps && ci->i_prealloc_cap_flush) {
+			cf = ci->i_prealloc_cap_flush;
+			ci->i_prealloc_cap_flush = NULL;
+			if (!cf->is_capsnap)
+				ceph_free_cap_flush(cf);
+		}
+
+		if (!list_empty(&ci->i_cap_snaps))
+			iputs = remove_capsnaps(mdsc, inode);
+	}
+	if (dirty_dropped)
+		++iputs;
+	return iputs;
+}
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 4b3ef0412539..44bc780b2b0e 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1597,125 +1597,23 @@ int ceph_iterate_session_caps(struct ceph_mds_session *session,
 	return ret;
 }
 
-static int remove_capsnaps(struct ceph_mds_client *mdsc, struct inode *inode)
-{
-	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_cap_snap *capsnap;
-	int capsnap_release = 0;
-
-	lockdep_assert_held(&ci->i_ceph_lock);
-
-	dout("removing capsnaps, ci is %p, inode is %p\n", ci, inode);
-
-	while (!list_empty(&ci->i_cap_snaps)) {
-		capsnap = list_first_entry(&ci->i_cap_snaps,
-					   struct ceph_cap_snap, ci_item);
-		__ceph_remove_capsnap(inode, capsnap, NULL, NULL);
-		ceph_put_snap_context(capsnap->context);
-		ceph_put_cap_snap(capsnap);
-		capsnap_release++;
-	}
-	wake_up_all(&ci->i_cap_wq);
-	wake_up_all(&mdsc->cap_flushing_wq);
-	return capsnap_release;
-}
-
 static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
 				  void *arg)
 {
-	struct ceph_fs_client *fsc = (struct ceph_fs_client *)arg;
-	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	bool is_auth;
-	bool dirty_dropped = false;
 	bool invalidate = false;
-	int capsnap_release = 0;
+	int iputs;
 
 	dout("removing cap %p, ci is %p, inode is %p\n",
 	     cap, ci, &ci->vfs_inode);
 	spin_lock(&ci->i_ceph_lock);
-	is_auth = (cap == ci->i_auth_cap);
-	__ceph_remove_cap(cap, false);
-	if (is_auth) {
-		struct ceph_cap_flush *cf;
-
-		if (READ_ONCE(fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) {
-			if (inode->i_data.nrpages > 0)
-				invalidate = true;
-			if (ci->i_wrbuffer_ref > 0)
-				mapping_set_error(&inode->i_data, -EIO);
-		}
-
-		spin_lock(&mdsc->cap_dirty_lock);
-
-		/* trash all of the cap flushes for this inode */
-		while (!list_empty(&ci->i_cap_flush_list)) {
-			cf = list_first_entry(&ci->i_cap_flush_list,
-					      struct ceph_cap_flush, i_list);
-			list_del_init(&cf->g_list);
-			list_del_init(&cf->i_list);
-			if (!cf->is_capsnap)
-				ceph_free_cap_flush(cf);
-		}
-
-		if (!list_empty(&ci->i_dirty_item)) {
-			pr_warn_ratelimited(
-				" dropping dirty %s state for %p %lld\n",
-				ceph_cap_string(ci->i_dirty_caps),
-				inode, ceph_ino(inode));
-			ci->i_dirty_caps = 0;
-			list_del_init(&ci->i_dirty_item);
-			dirty_dropped = true;
-		}
-		if (!list_empty(&ci->i_flushing_item)) {
-			pr_warn_ratelimited(
-				" dropping dirty+flushing %s state for %p %lld\n",
-				ceph_cap_string(ci->i_flushing_caps),
-				inode, ceph_ino(inode));
-			ci->i_flushing_caps = 0;
-			list_del_init(&ci->i_flushing_item);
-			mdsc->num_cap_flushing--;
-			dirty_dropped = true;
-		}
-		spin_unlock(&mdsc->cap_dirty_lock);
-
-		if (dirty_dropped) {
-			errseq_set(&ci->i_meta_err, -EIO);
-
-			if (ci->i_wrbuffer_ref_head == 0 &&
-			    ci->i_wr_ref == 0 &&
-			    ci->i_dirty_caps == 0 &&
-			    ci->i_flushing_caps == 0) {
-				ceph_put_snap_context(ci->i_head_snapc);
-				ci->i_head_snapc = NULL;
-			}
-		}
-
-		if (atomic_read(&ci->i_filelock_ref) > 0) {
-			/* make further file lock syscall return -EIO */
-			ci->i_ceph_flags |= CEPH_I_ERROR_FILELOCK;
-			pr_warn_ratelimited(" dropping file locks for %p %lld\n",
-					    inode, ceph_ino(inode));
-		}
-
-		if (!ci->i_dirty_caps && ci->i_prealloc_cap_flush) {
-			cf = ci->i_prealloc_cap_flush;
-			ci->i_prealloc_cap_flush = NULL;
-			if (!cf->is_capsnap)
-				ceph_free_cap_flush(cf);
-		}
-
-		if (!list_empty(&ci->i_cap_snaps))
-			capsnap_release = remove_capsnaps(mdsc, inode);
-	}
+	iputs = ceph_purge_inode_cap(inode, cap, &invalidate);
 	spin_unlock(&ci->i_ceph_lock);
 
 	wake_up_all(&ci->i_cap_wq);
 	if (invalidate)
 		ceph_queue_invalidate(inode);
-	if (dirty_dropped)
-		iput(inode);
-	while (capsnap_release--)
+	while (iputs--)
 		iput(inode);
 	return 0;
 }
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index c1add4ed59d2..67a0e1d2ecbb 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -1207,6 +1207,7 @@ extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
 extern int ceph_uninline_data(struct file *filp, struct page *locked_page);
 extern int ceph_pool_perm_check(struct inode *inode, int need);
 extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc);
+int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invalidate);
 
 /* file.c */
 extern const struct file_operations ceph_file_fops;
-- 
2.31.1


^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH 6/6] ceph: shut down access to inode when async create fails
  2021-09-08 13:03 [PATCH 0/6] ceph: better error handling for failed async creates Jeff Layton
                   ` (4 preceding siblings ...)
  2021-09-08 13:03 ` [PATCH 5/6] ceph: refactor remove_session_caps_cb Jeff Layton
@ 2021-09-08 13:03 ` Jeff Layton
  5 siblings, 0 replies; 7+ messages in thread
From: Jeff Layton @ 2021-09-08 13:03 UTC (permalink / raw)
  To: ceph-devel; +Cc: idryomov, pdonnell

Add proper error handling for when an async create fails. The inode
never existed, so any dirty caps or data are now toast. We already
d_drop the dentry in that case, but the now-stale inode may still be
around. We want to shut down access to these inodes, and ensure that
they can't harbor any more dirty data, which can cause problems at
umount time.

When this occurs, flag such inodes as being SHUTDOWN, and trash any caps
and cap flushes that may be in flight for them, and invalidate the
pagecache for the inode. Add a new helper that can check whether an
inode or an entire mount is now shut down, and call it instead of
accessing the mount_state directly in places where we test that now.

URL: https://tracker.ceph.com/issues/51279
Signed-off-by: Jeff Layton  <jlayton@kernel.org>
---
 fs/ceph/addr.c   | 16 +++++++++++-----
 fs/ceph/caps.c   | 12 ++++++------
 fs/ceph/export.c | 12 +++++++++++-
 fs/ceph/file.c   | 10 +++++++++-
 fs/ceph/inode.c  | 33 +++++++++++++++++++++++++++++++--
 fs/ceph/locks.c  |  6 ++++++
 fs/ceph/super.h  | 11 +++++++++++
 7 files changed, 85 insertions(+), 15 deletions(-)

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 6d3f74d46e5b..c4658f6db8d1 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -724,7 +724,7 @@ static int ceph_writepages_start(struct address_space *mapping,
 	     wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
 	     (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
 
-	if (READ_ONCE(fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) {
+	if (ceph_inode_is_shutdown(inode)) {
 		if (ci->i_wrbuffer_ref > 0) {
 			pr_warn_ratelimited(
 				"writepage_start %p %lld forced umount\n",
@@ -1145,12 +1145,12 @@ static struct ceph_snap_context *
 ceph_find_incompatible(struct page *page)
 {
 	struct inode *inode = page->mapping->host;
-	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
 	struct ceph_inode_info *ci = ceph_inode(inode);
 
-	if (READ_ONCE(fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) {
-		dout(" page %p forced umount\n", page);
-		return ERR_PTR(-EIO);
+	if (ceph_inode_is_shutdown(inode)) {
+		dout(" page %p %llx:%llx is shutdown\n", page,
+		     ceph_vinop(inode));
+		return ERR_PTR(-ESTALE);
 	}
 
 	for (;;) {
@@ -1356,6 +1356,9 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
 	sigset_t oldset;
 	vm_fault_t ret = VM_FAULT_SIGBUS;
 
+	if (ceph_inode_is_shutdown(inode))
+		return ret;
+
 	ceph_block_sigs(&oldset);
 
 	dout("filemap_fault %p %llx.%llx %llu trying to get caps\n",
@@ -1444,6 +1447,9 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
 	sigset_t oldset;
 	vm_fault_t ret = VM_FAULT_SIGBUS;
 
+	if (ceph_inode_is_shutdown(inode))
+		return ret;
+
 	prealloc_cf = ceph_alloc_cap_flush();
 	if (!prealloc_cf)
 		return VM_FAULT_OOM;
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index bc17515a11c9..cdeb5b2d7920 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1188,11 +1188,11 @@ void ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
 
 	lockdep_assert_held(&ci->i_ceph_lock);
 
-	fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
+	fsc = ceph_inode_to_client(&ci->vfs_inode);
 	WARN_ON_ONCE(ci->i_auth_cap == cap &&
 		     !list_empty(&ci->i_dirty_item) &&
 		     !fsc->blocklisted &&
-		     READ_ONCE(fsc->mount_state) != CEPH_MOUNT_SHUTDOWN);
+		     !ceph_inode_is_shutdown(&ci->vfs_inode));
 
 	__ceph_remove_cap(cap, queue_release);
 }
@@ -2756,9 +2756,9 @@ static int try_get_cap_refs(struct inode *inode, int need, int want,
 			goto out_unlock;
 		}
 
-		if (READ_ONCE(mdsc->fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) {
-			dout("get_cap_refs %p forced umount\n", inode);
-			ret = -EIO;
+		if (ceph_inode_is_shutdown(inode)) {
+			dout("get_cap_refs %p inode is shutdown\n", inode);
+			ret = -ESTALE;
 			goto out_unlock;
 		}
 		mds_wanted = __ceph_caps_mds_wanted(ci, false);
@@ -4622,7 +4622,7 @@ int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invali
 	if (is_auth) {
 		struct ceph_cap_flush *cf;
 
-		if (READ_ONCE(fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) {
+		if (ceph_inode_is_shutdown(inode)) {
 			if (inode->i_data.nrpages > 0)
 				*invalidate = true;
 			if (ci->i_wrbuffer_ref > 0)
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 1d65934c1262..e0fa66ac8b9f 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -157,6 +157,11 @@ static struct inode *__lookup_inode(struct super_block *sb, u64 ino)
 		ceph_mdsc_put_request(req);
 		if (!inode)
 			return err < 0 ? ERR_PTR(err) : ERR_PTR(-ESTALE);
+	} else {
+		if (ceph_inode_is_shutdown(inode)) {
+			iput(inode);
+			return ERR_PTR(-ESTALE);
+		}
 	}
 	return inode;
 }
@@ -223,8 +228,13 @@ static struct dentry *__snapfh_to_dentry(struct super_block *sb,
 		return ERR_PTR(-ESTALE);
 
 	inode = ceph_find_inode(sb, vino);
-	if (inode)
+	if (inode) {
+		if (ceph_inode_is_shutdown(inode)) {
+			iput(inode);
+			return ERR_PTR(-ESTALE);
+		}
 		return d_obtain_alias(inode);
+	}
 
 	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPINO,
 				       USE_ANY_MDS);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 175366eede23..950e1b44f0aa 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -526,6 +526,7 @@ static void ceph_async_create_cb(struct ceph_mds_client *mdsc,
 
 	if (result) {
 		struct dentry *dentry = req->r_dentry;
+		struct inode *inode = d_inode(dentry);
 		int pathlen = 0;
 		u64 base = 0;
 		char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
@@ -535,7 +536,8 @@ static void ceph_async_create_cb(struct ceph_mds_client *mdsc,
 		if (!d_unhashed(dentry))
 			d_drop(dentry);
 
-		/* FIXME: start returning I/O errors on all accesses? */
+		ceph_inode_shutdown(inode);
+
 		pr_warn("ceph: async create failure path=(%llx)%s result=%d!\n",
 			base, IS_ERR(path) ? "<<bad>>" : path, result);
 		ceph_mdsc_free_path(path, pathlen);
@@ -1527,6 +1529,9 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
 	dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
 	     inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, inode);
 
+	if (ceph_inode_is_shutdown(inode))
+		return -ESTALE;
+
 	if (direct_lock)
 		ceph_start_io_direct(inode);
 	else
@@ -1679,6 +1684,9 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	loff_t pos;
 	loff_t limit = max(i_size_read(inode), fsc->max_file_size);
 
+	if (ceph_inode_is_shutdown(inode))
+		return -ESTALE;
+
 	if (ceph_snap(inode) != CEPH_NOSNAP)
 		return -EROFS;
 
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 43c02c4b2631..23b5a0867e3a 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1844,13 +1844,12 @@ void ceph_queue_inode_work(struct inode *inode, int work_bit)
 static void ceph_do_invalidate_pages(struct inode *inode)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
 	u32 orig_gen;
 	int check = 0;
 
 	mutex_lock(&ci->i_truncate_mutex);
 
-	if (READ_ONCE(fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) {
+	if (ceph_inode_is_shutdown(inode)) {
 		pr_warn_ratelimited("%s: inode %llx:%llx is shut down\n",
 				    __func__, ceph_vinop(inode));
 		mapping_set_error(inode->i_mapping, -EIO);
@@ -2220,6 +2219,9 @@ int ceph_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
 	if (ceph_snap(inode) != CEPH_NOSNAP)
 		return -EROFS;
 
+	if (ceph_inode_is_shutdown(inode))
+		return -ESTALE;
+
 	err = setattr_prepare(&init_user_ns, dentry, attr);
 	if (err != 0)
 		return err;
@@ -2350,6 +2352,9 @@ int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path,
 	u32 valid_mask = STATX_BASIC_STATS;
 	int err = 0;
 
+	if (ceph_inode_is_shutdown(inode))
+		return -ESTALE;
+
 	/* Skip the getattr altogether if we're asked not to sync */
 	if (!(flags & AT_STATX_DONT_SYNC)) {
 		err = ceph_do_getattr(inode,
@@ -2397,3 +2402,27 @@ int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path,
 	stat->result_mask = request_mask & valid_mask;
 	return err;
 }
+
+void ceph_inode_shutdown(struct inode *inode)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct rb_node *p;
+	int iputs = 0;
+	bool invalidate = false;
+
+	spin_lock(&ci->i_ceph_lock);
+	ci->i_ceph_flags |= CEPH_I_SHUTDOWN;
+	p = rb_first(&ci->i_caps);
+	while (p) {
+		struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
+
+		p = rb_next(p);
+		iputs += ceph_purge_inode_cap(inode, cap, &invalidate);
+	}
+	spin_unlock(&ci->i_ceph_lock);
+
+	if (invalidate)
+		ceph_queue_invalidate(inode);
+	while (iputs--)
+		iput(inode);
+}
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index fa8a847743d0..bdc8bd04055b 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -244,6 +244,9 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
 	if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK)
 		return -ENOLCK;
 
+	if (ceph_inode_is_shutdown(inode))
+		return -ESTALE;
+
 	dout("ceph_lock, fl_owner: %p\n", fl->fl_owner);
 
 	/* set wait bit as appropriate, then make command as Ceph expects it*/
@@ -309,6 +312,9 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
 	if (fl->fl_type & LOCK_MAND)
 		return -EOPNOTSUPP;
 
+	if (ceph_inode_is_shutdown(inode))
+		return -ESTALE;
+
 	dout("ceph_flock, fl_file: %p\n", fl->fl_file);
 
 	spin_lock(&ci->i_ceph_lock);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 67a0e1d2ecbb..13dd3c71c95b 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -588,6 +588,7 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
 #define CEPH_I_ODIRECT		(1 << 11) /* inode in direct I/O mode */
 #define CEPH_ASYNC_CREATE_BIT	(12)	  /* async create in flight for this */
 #define CEPH_I_ASYNC_CREATE	(1 << CEPH_ASYNC_CREATE_BIT)
+#define CEPH_I_SHUTDOWN		(1 << 13) /* inode is no longer usable */
 
 /*
  * Masks of ceph inode work.
@@ -1036,6 +1037,16 @@ extern int ceph_setattr(struct user_namespace *mnt_userns,
 extern int ceph_getattr(struct user_namespace *mnt_userns,
 			const struct path *path, struct kstat *stat,
 			u32 request_mask, unsigned int flags);
+void ceph_inode_shutdown(struct inode *inode);
+
+static inline bool ceph_inode_is_shutdown(struct inode *inode)
+{
+	unsigned long flags = READ_ONCE(ceph_inode(inode)->i_ceph_flags);
+	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+	int state = READ_ONCE(fsc->mount_state);
+
+	return (flags & CEPH_I_SHUTDOWN) || state >= CEPH_MOUNT_SHUTDOWN;
+}
 
 /* xattr.c */
 int __ceph_setxattr(struct inode *, const char *, const void *, size_t, int);
-- 
2.31.1


^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2021-09-08 13:04 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-09-08 13:03 [PATCH 0/6] ceph: better error handling for failed async creates Jeff Layton
2021-09-08 13:03 ` [PATCH 1/6] ceph: print inode numbers instead of pointer values Jeff Layton
2021-09-08 13:03 ` [PATCH 2/6] ceph: don't use -ESTALE as special return code in try_get_cap_refs Jeff Layton
2021-09-08 13:03 ` [PATCH 3/6] ceph: drop private list from remove_session_caps_cb Jeff Layton
2021-09-08 13:03 ` [PATCH 4/6] ceph: fix auth cap handling logic in remove_session_caps_cb Jeff Layton
2021-09-08 13:03 ` [PATCH 5/6] ceph: refactor remove_session_caps_cb Jeff Layton
2021-09-08 13:03 ` [PATCH 6/6] ceph: shut down access to inode when async create fails Jeff Layton

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).