ceph-devel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: xiubli@redhat.com
To: idryomov@gmail.com, ceph-devel@vger.kernel.org
Cc: jlayton@kernel.org, lhenriques@suse.de, vshankar@redhat.com,
	mchangir@redhat.com, Xiubo Li <xiubli@redhat.com>
Subject: [PATCH v16 68/68] ceph: drop the messages from MDS when unmounting
Date: Mon, 27 Feb 2023 11:28:13 +0800	[thread overview]
Message-ID: <20230227032813.337906-69-xiubli@redhat.com> (raw)
In-Reply-To: <20230227032813.337906-1-xiubli@redhat.com>

From: Xiubo Li <xiubli@redhat.com>

When unmounting, all the dirty buffers will be flushed, and after
the last osd request is finished the last reference of the i_count
will be released. Then it will flush the dirty cap/snap to the MDSs,
but the unmounting won't wait for the possible acks, which will ihold
the inodes when updating the metadata locally, which makes no sense
any more at this point. This will make evict_inodes() skip these
inodes.

If encryption is enabled the kernel generates a warning when removing
the encryption keys while the skipped inodes still hold the keyring:

WARNING: CPU: 4 PID: 168846 at fs/crypto/keyring.c:242 fscrypt_destroy_keyring+0x7e/0xd0
CPU: 4 PID: 168846 Comm: umount Tainted: G S  6.1.0-rc5-ceph-g72ead199864c #1
Hardware name: Supermicro SYS-5018R-WR/X10SRW-F, BIOS 2.0 12/17/2015
RIP: 0010:fscrypt_destroy_keyring+0x7e/0xd0
RSP: 0018:ffffc9000b277e28 EFLAGS: 00010202
RAX: 0000000000000002 RBX: ffff88810d52ac00 RCX: ffff88810b56aa00
RDX: 0000000080000000 RSI: ffffffff822f3a09 RDI: ffff888108f59000
RBP: ffff8881d394fb88 R08: 0000000000000028 R09: 0000000000000000
R10: 0000000000000001 R11: 11ff4fe6834fcd91 R12: ffff8881d394fc40
R13: ffff888108f59000 R14: ffff8881d394f800 R15: 0000000000000000
FS:  00007fd83f6f1080(0000) GS:ffff88885fd00000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f918d417000 CR3: 000000017f89a005 CR4: 00000000003706e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
<TASK>
generic_shutdown_super+0x47/0x120
kill_anon_super+0x14/0x30
ceph_kill_sb+0x36/0x90 [ceph]
deactivate_locked_super+0x29/0x60
cleanup_mnt+0xb8/0x140
task_work_run+0x67/0xb0
exit_to_user_mode_prepare+0x23d/0x240
syscall_exit_to_user_mode+0x25/0x60
do_syscall_64+0x40/0x80
entry_SYSCALL_64_after_hwframe+0x63/0xcd
RIP: 0033:0x7fd83dc39e9b

Later the kernel will crash when iput() is called on the inodes and
dereferences "sb->s_master_keys", which has already been released by
generic_shutdown_super().

URL: https://tracker.ceph.com/issues/58126
Signed-off-by: Xiubo Li <xiubli@redhat.com>
---
 fs/ceph/caps.c       |  5 +++++
 fs/ceph/mds_client.c | 12 +++++++++-
 fs/ceph/mds_client.h | 11 +++++++++-
 fs/ceph/quota.c      |  4 ++++
 fs/ceph/snap.c       |  6 +++++
 fs/ceph/super.c      | 52 ++++++++++++++++++++++++++++++++++++++++++++
 fs/ceph/super.h      |  2 ++
 7 files changed, 90 insertions(+), 2 deletions(-)

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 424b2d54fe71..9b04c2d930b2 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -4221,6 +4221,9 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 
 	dout("handle_caps from mds%d\n", session->s_mds);
 
+	if (!ceph_inc_stopping_blocker(mdsc))
+		return;
+
 	/* decode */
 	end = msg->front.iov_base + msg->front.iov_len;
 	if (msg->front.iov_len < sizeof(*h))
@@ -4434,6 +4437,8 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 done_unlocked:
 	iput(inode);
 out:
+	ceph_dec_stopping_blocker(mdsc);
+
 	ceph_put_string(extra_info.pool_ns);
 
 	/* Defer closing the sessions after s_mutex lock being released */
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 85d639f75ea1..b8d6cca16005 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -4877,6 +4877,9 @@ static void handle_lease(struct ceph_mds_client *mdsc,
 
 	dout("handle_lease from mds%d\n", mds);
 
+	if (!ceph_inc_stopping_blocker(mdsc))
+		return;
+
 	/* decode */
 	if (msg->front.iov_len < sizeof(*h) + sizeof(u32))
 		goto bad;
@@ -4958,9 +4961,13 @@ static void handle_lease(struct ceph_mds_client *mdsc,
 out:
 	mutex_unlock(&session->s_mutex);
 	iput(inode);
+
+	ceph_dec_stopping_blocker(mdsc);
 	return;
 
 bad:
+	ceph_dec_stopping_blocker(mdsc);
+
 	pr_err("corrupt lease message\n");
 	ceph_msg_dump(msg);
 }
@@ -5156,6 +5163,9 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
 	}
 
 	init_completion(&mdsc->safe_umount_waiters);
+	spin_lock_init(&mdsc->stopping_lock);
+	atomic_set(&mdsc->stopping_blockers, 0);
+	init_completion(&mdsc->stopping_waiter);
 	init_waitqueue_head(&mdsc->session_close_wq);
 	INIT_LIST_HEAD(&mdsc->waiting_for_map);
 	mdsc->quotarealms_inodes = RB_ROOT;
@@ -5270,7 +5280,7 @@ void send_flush_mdlog(struct ceph_mds_session *s)
 void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
 {
 	dout("pre_umount\n");
-	mdsc->stopping = 1;
+	mdsc->stopping = CEPH_MDSC_STOPPING_BEGAIN;
 
 	ceph_mdsc_iterate_sessions(mdsc, send_flush_mdlog, true);
 	ceph_mdsc_iterate_sessions(mdsc, lock_unlock_session, false);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 81a1f9a4ac3b..5bf32701c84c 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -398,6 +398,11 @@ struct cap_wait {
 	int			want;
 };
 
+enum {
+	CEPH_MDSC_STOPPING_BEGAIN = 1,
+	CEPH_MDSC_STOPPING_FLUSHED = 2,
+};
+
 /*
  * mds client state
  */
@@ -414,7 +419,11 @@ struct ceph_mds_client {
 	struct ceph_mds_session **sessions;    /* NULL for mds if no session */
 	atomic_t		num_sessions;
 	int                     max_sessions;  /* len of sessions array */
-	int                     stopping;      /* true if shutting down */
+
+	spinlock_t              stopping_lock;  /* protect stopping state */
+	int                     stopping;      /* the stage of shutting down */
+	atomic_t                stopping_blockers;
+	struct completion	stopping_waiter;
 
 	atomic64_t		quotarealms_count; /* # realms with quota */
 	/*
diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c
index 64592adfe48f..3309ae071739 100644
--- a/fs/ceph/quota.c
+++ b/fs/ceph/quota.c
@@ -47,6 +47,9 @@ void ceph_handle_quota(struct ceph_mds_client *mdsc,
 	struct inode *inode;
 	struct ceph_inode_info *ci;
 
+	if (!ceph_inc_stopping_blocker(mdsc))
+		return;
+
 	if (msg->front.iov_len < sizeof(*h)) {
 		pr_err("%s corrupt message mds%d len %d\n", __func__,
 		       session->s_mds, (int)msg->front.iov_len);
@@ -78,6 +81,7 @@ void ceph_handle_quota(struct ceph_mds_client *mdsc,
 	spin_unlock(&ci->i_ceph_lock);
 
 	iput(inode);
+	ceph_dec_stopping_blocker(mdsc);
 }
 
 static struct ceph_quotarealm_inode *
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index aa8e0657fc03..2775d526a6e0 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -1011,6 +1011,9 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
 	int locked_rwsem = 0;
 	bool close_sessions = false;
 
+	if (!ceph_inc_stopping_blocker(mdsc))
+		return;
+
 	/* decode */
 	if (msg->front.iov_len < sizeof(*h))
 		goto bad;
@@ -1134,12 +1137,15 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
 	up_write(&mdsc->snap_rwsem);
 
 	flush_snaps(mdsc);
+	ceph_dec_stopping_blocker(mdsc);
 	return;
 
 bad:
 	pr_err("%s corrupt snap message from mds%d\n", __func__, mds);
 	ceph_msg_dump(msg);
 out:
+	ceph_dec_stopping_blocker(mdsc);
+
 	if (locked_rwsem)
 		up_write(&mdsc->snap_rwsem);
 
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index f10a076f47e5..fa111c2a3732 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -1474,15 +1474,67 @@ static int ceph_init_fs_context(struct fs_context *fc)
 	return -ENOMEM;
 }
 
+/*
+ * Return true if the mdsc blocker counter was successfully increased,
+ * or false if the mdsc is in the stopping and flushed state.
+ */
+bool ceph_inc_stopping_blocker(struct ceph_mds_client *mdsc)
+{
+	spin_lock(&mdsc->stopping_lock);
+	if (mdsc->stopping >= CEPH_MDSC_STOPPING_FLUSHED) {
+		spin_unlock(&mdsc->stopping_lock);
+		return false;
+	}
+	atomic_inc(&mdsc->stopping_blockers);
+	spin_unlock(&mdsc->stopping_lock);
+	return true;
+}
+
+void ceph_dec_stopping_blocker(struct ceph_mds_client *mdsc)
+{
+	spin_lock(&mdsc->stopping_lock);
+	if (!atomic_dec_return(&mdsc->stopping_blockers) &&
+	    mdsc->stopping >= CEPH_MDSC_STOPPING_FLUSHED)
+		complete_all(&mdsc->stopping_waiter);
+	spin_unlock(&mdsc->stopping_lock);
+}
+
 static void ceph_kill_sb(struct super_block *s)
 {
 	struct ceph_fs_client *fsc = ceph_sb_to_client(s);
+	bool wait;
 
 	dout("kill_sb %p\n", s);
 
 	ceph_mdsc_pre_umount(fsc->mdsc);
 	flush_fs_workqueues(fsc);
 
+	/*
+	 * Though the kill_anon_super() will finally trigger the
+	 * sync_filesystem() anyway, we still need to do it here and
+	 * then bump the stage of shutdown. This will allow us to
+	 * drop any further messages from the MDSs, which would only
+	 * increase the inodes' i_count reference counters and make
+	 * no sense any more.
+	 *
+	 * Without this when evicting the inodes it may fail in the
+	 * kill_anon_super(), which will trigger a warning when
+	 * destroying the fscrypt keyring and then possibly trigger
+	 * a further crash in ceph module when the iput() tries to
+	 * evict the inodes later.
+	 */
+	sync_filesystem(s);
+
+	spin_lock(&fsc->mdsc->stopping_lock);
+	fsc->mdsc->stopping = CEPH_MDSC_STOPPING_FLUSHED;
+	wait = !!atomic_read(&fsc->mdsc->stopping_blockers);
+	spin_unlock(&fsc->mdsc->stopping_lock);
+
+	while (wait || atomic_read(&fsc->mdsc->stopping_blockers)) {
+		wait = false;
+		wait_for_completion(&fsc->mdsc->stopping_waiter);
+	}
+
 	kill_anon_super(s);
 
 	fsc->client->extra_mon_dispatch = NULL;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 3df88811c7df..9bc958c5fc20 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -1398,4 +1398,6 @@ extern bool ceph_quota_update_statfs(struct ceph_fs_client *fsc,
 				     struct kstatfs *buf);
 extern void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc);
 
+bool ceph_inc_stopping_blocker(struct ceph_mds_client *mdsc);
+void ceph_dec_stopping_blocker(struct ceph_mds_client *mdsc);
 #endif /* _FS_CEPH_SUPER_H */
-- 
2.31.1


  parent reply	other threads:[~2023-02-27  3:34 UTC|newest]

Thread overview: 84+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-02-27  3:27 [PATCH v16 00/68] ceph+fscrypt: full support xiubli
2023-02-27  3:27 ` [PATCH v16 01/68] libceph: add spinlock around osd->o_requests xiubli
2023-02-27  3:27 ` [PATCH v16 02/68] libceph: define struct ceph_sparse_extent and add some helpers xiubli
2023-02-27  3:27 ` [PATCH v16 03/68] libceph: add sparse read support to msgr2 crc state machine xiubli
2023-02-27  3:27 ` [PATCH v16 04/68] libceph: add sparse read support to OSD client xiubli
2023-02-27  3:27 ` [PATCH v16 05/68] libceph: support sparse reads on msgr2 secure codepath xiubli
2023-02-27  3:27 ` [PATCH v16 06/68] libceph: add sparse read support to msgr1 xiubli
2023-02-27  3:27 ` [PATCH v16 07/68] ceph: add new mount option to enable sparse reads xiubli
2023-02-27  3:27 ` [PATCH v16 08/68] ceph: preallocate inode for ops that may create one xiubli
2023-02-27  3:27 ` [PATCH v16 09/68] ceph: make ceph_msdc_build_path use ref-walk xiubli
2023-02-27  3:27 ` [PATCH v16 10/68] libceph: add new iov_iter-based ceph_msg_data_type and ceph_osd_data_type xiubli
2023-02-27  3:27 ` [PATCH v16 11/68] ceph: use osd_req_op_extent_osd_iter for netfs reads xiubli
2023-02-27  3:27 ` [PATCH v16 12/68] ceph: fscrypt_auth handling for ceph xiubli
2023-02-27  3:27 ` [PATCH v16 13/68] ceph: ensure that we accept a new context from MDS for new inodes xiubli
2023-02-27  3:27 ` [PATCH v16 14/68] ceph: add support for fscrypt_auth/fscrypt_file to cap messages xiubli
2023-02-27  3:27 ` [PATCH v16 15/68] ceph: implement -o test_dummy_encryption mount option xiubli
2023-02-27  3:27 ` [PATCH v16 16/68] ceph: decode alternate_name in lease info xiubli
2023-02-27  3:27 ` [PATCH v16 17/68] ceph: add fscrypt ioctls xiubli
2023-02-27  3:27 ` [PATCH v16 18/68] ceph: make the ioctl cmd more readable in debug log xiubli
2023-02-27  3:27 ` [PATCH v16 19/68] ceph: add base64 endcoding routines for encrypted names xiubli
2023-02-27  3:27 ` [PATCH v16 20/68] ceph: add encrypted fname handling to ceph_mdsc_build_path xiubli
2023-02-27  3:27 ` [PATCH v16 21/68] ceph: send altname in MClientRequest xiubli
2023-02-27  3:27 ` [PATCH v16 22/68] ceph: encode encrypted name in dentry release xiubli
2023-02-27  3:27 ` [PATCH v16 23/68] ceph: properly set DCACHE_NOKEY_NAME flag in lookup xiubli
2023-02-27  3:27 ` [PATCH v16 24/68] ceph: set DCACHE_NOKEY_NAME in atomic open xiubli
2023-02-27  3:27 ` [PATCH v16 25/68] ceph: make d_revalidate call fscrypt revalidator for encrypted dentries xiubli
2023-03-07 18:53   ` Luís Henriques
2023-03-08  1:50     ` Xiubo Li
2023-03-08  9:29       ` Luís Henriques
2023-03-08 10:42         ` Xiubo Li
2023-03-08 17:14           ` Luís Henriques
2023-03-08 17:54             ` Jeff Layton
2023-03-08 18:30               ` Luís Henriques
2023-03-08 19:32                 ` Jeff Layton
2023-03-09  9:52                   ` Luís Henriques
2023-03-09  7:06             ` Xiubo Li
2023-03-09  9:55               ` Luís Henriques
2023-03-09 11:41                 ` Xiubo Li
2023-02-27  3:27 ` [PATCH v16 26/68] ceph: add helpers for converting names for userland presentation xiubli
2023-02-27  3:27 ` [PATCH v16 27/68] ceph: fix base64 encoded name's length check in ceph_fname_to_usr() xiubli
2023-02-27  3:27 ` [PATCH v16 28/68] ceph: add fscrypt support to ceph_fill_trace xiubli
2023-02-27  3:27 ` [PATCH v16 29/68] ceph: pass the request to parse_reply_info_readdir() xiubli
2023-02-27  3:27 ` [PATCH v16 30/68] ceph: add ceph_encode_encrypted_dname() helper xiubli
2023-02-27  3:27 ` [PATCH v16 31/68] ceph: add support to readdir for encrypted filenames xiubli
2023-02-27  3:27 ` [PATCH v16 32/68] ceph: create symlinks with encrypted and base64-encoded targets xiubli
2023-02-27  3:27 ` [PATCH v16 33/68] ceph: make ceph_get_name decrypt filenames xiubli
2023-02-27  3:27 ` [PATCH v16 34/68] ceph: add a new ceph.fscrypt.auth vxattr xiubli
2023-02-27  3:27 ` [PATCH v16 35/68] ceph: add some fscrypt guardrails xiubli
2023-02-27  3:27 ` [PATCH v16 36/68] ceph: allow encrypting a directory while not having Ax caps xiubli
2023-02-27  3:27 ` [PATCH v16 37/68] ceph: mark directory as non-complete after loading key xiubli
2023-02-27  3:27 ` [PATCH v16 38/68] ceph: don't allow changing layout on encrypted files/directories xiubli
2023-02-27  3:27 ` [PATCH v16 39/68] libceph: add CEPH_OSD_OP_ASSERT_VER support xiubli
2023-02-27  3:27 ` [PATCH v16 40/68] ceph: size handling for encrypted inodes in cap updates xiubli
2023-02-27  3:27 ` [PATCH v16 41/68] ceph: fscrypt_file field handling in MClientRequest messages xiubli
2023-02-27  3:27 ` [PATCH v16 42/68] ceph: get file size from fscrypt_file when present in inode traces xiubli
2023-02-27  3:27 ` [PATCH v16 43/68] ceph: handle fscrypt fields in cap messages from MDS xiubli
2023-02-27  3:27 ` [PATCH v16 44/68] ceph: update WARN_ON message to pr_warn xiubli
2023-02-27  3:27 ` [PATCH v16 45/68] ceph: add __ceph_get_caps helper support xiubli
2023-02-27  3:27 ` [PATCH v16 46/68] ceph: add __ceph_sync_read " xiubli
2023-02-27  3:27 ` [PATCH v16 47/68] ceph: add object version support for sync read xiubli
2023-02-27  3:27 ` [PATCH v16 48/68] ceph: add infrastructure for file encryption and decryption xiubli
2023-02-27  3:27 ` [PATCH v16 49/68] ceph: add truncate size handling support for fscrypt xiubli
2023-02-27  3:27 ` [PATCH v16 50/68] libceph: allow ceph_osdc_new_request to accept a multi-op read xiubli
2023-02-27  3:27 ` [PATCH v16 51/68] ceph: disable fallocate for encrypted inodes xiubli
2023-02-27  3:27 ` [PATCH v16 52/68] ceph: disable copy offload on " xiubli
2023-02-27  3:27 ` [PATCH v16 53/68] ceph: don't use special DIO path for " xiubli
2023-02-27  3:27 ` [PATCH v16 54/68] ceph: align data in pages in ceph_sync_write xiubli
2023-02-27  3:28 ` [PATCH v16 55/68] ceph: add read/modify/write to ceph_sync_write xiubli
2023-02-27  3:28 ` [PATCH v16 56/68] ceph: plumb in decryption during sync reads xiubli
2023-02-27  3:28 ` [PATCH v16 57/68] ceph: add fscrypt decryption support to ceph_netfs_issue_op xiubli
2023-02-27  3:28 ` [PATCH v16 58/68] ceph: set i_blkbits to crypto block size for encrypted inodes xiubli
2023-02-27  3:28 ` [PATCH v16 59/68] ceph: add encryption support to writepage xiubli
2023-02-27  3:28 ` [PATCH v16 60/68] ceph: fscrypt support for writepages xiubli
2023-02-27  3:28 ` [PATCH v16 61/68] ceph: invalidate pages when doing direct/sync writes xiubli
2023-02-27  3:28 ` [PATCH v16 62/68] ceph: add support for encrypted snapshot names xiubli
2023-02-27  3:28 ` [PATCH v16 63/68] ceph: add support for handling " xiubli
2023-02-27  3:28 ` [PATCH v16 64/68] ceph: update documentation regarding snapshot naming limitations xiubli
2023-02-27  3:28 ` [PATCH v16 65/68] ceph: prevent snapshots to be created in encrypted locked directories xiubli
2023-02-27  3:28 ` [PATCH v16 66/68] ceph: report STATX_ATTR_ENCRYPTED on encrypted inodes xiubli
2023-02-27  3:28 ` [PATCH v16 67/68] libceph: defer removing the req from osdc just after req->r_callback xiubli
2023-02-27  3:28 ` xiubli [this message]
2023-02-27  9:27 ` [PATCH v16 00/68] ceph+fscrypt: full support Luís Henriques
2023-02-27  9:58   ` Xiubo Li
2023-02-27 10:30     ` Luís Henriques

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230227032813.337906-69-xiubli@redhat.com \
    --to=xiubli@redhat.com \
    --cc=ceph-devel@vger.kernel.org \
    --cc=idryomov@gmail.com \
    --cc=jlayton@kernel.org \
    --cc=lhenriques@suse.de \
    --cc=mchangir@redhat.com \
    --cc=vshankar@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).