All of lore.kernel.org
 help / color / mirror / Atom feed
From: Heming Zhao via Ocfs2-devel <ocfs2-devel@oss.oracle.com>
To: ocfs2-devel@oss.oracle.com, joseph.qi@linux.alibaba.com
Subject: [Ocfs2-devel] [PATCH 4/4] ocfs2: introduce ext4 MMP feature
Date: Sat, 30 Jul 2022 09:14:11 +0800	[thread overview]
Message-ID: <20220730011411.11214-5-heming.zhao@suse.com> (raw)
In-Reply-To: <20220730011411.11214-1-heming.zhao@suse.com>

MMP (multiple mount protection) gives a filesystem the ability to prevent
itself from being mounted multiple times.

To avoid data corruption when non-clustered and/or clustered mounts
happen at the same time, this commit introduces the MMP feature. The MMP
idea comes from the ext4 MMP code (fs/ext4/mmp.c). Because ocfs2 is a
clustered fs, and to stay compatible with the existing slotmap feature,
I made some optimizations and modifications when porting it from ext4
to ocfs2.

As an optimization:
MMP has a kthread, kmmpd-<dev>, which is only created in non-clustered
mode.

We set a rule:
If the last mount was not unmounted (e.g. after a crash), the next mount
MUST be of the same mount type.

Finally, this commit also fixes the issue mentioned in commit
c80af0c250c8 ("Revert "ocfs2: mount shared volume without ha stack"").

Signed-off-by: Heming Zhao <heming.zhao@suse.com>
---
 fs/ocfs2/ocfs2.h    |   2 +
 fs/ocfs2/ocfs2_fs.h |  13 +-
 fs/ocfs2/slot_map.c | 459 ++++++++++++++++++++++++++++++++++++++++++--
 fs/ocfs2/slot_map.h |   3 +
 fs/ocfs2/super.c    |  23 ++-
 5 files changed, 479 insertions(+), 21 deletions(-)

diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 337527571461..37a7c5855d07 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -337,6 +337,8 @@ struct ocfs2_super
 	unsigned int node_num;
 	int slot_num;
 	int preferred_slot;
+	u16 mmp_update_interval;
+	struct task_struct *mmp_task;
 	int s_sectsize_bits;
 	int s_clustersize;
 	int s_clustersize_bits;
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 638d875eccc7..015672f75563 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -87,7 +87,8 @@
 					 | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE \
 					 | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG	\
 					 | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO \
-					 | OCFS2_FEATURE_INCOMPAT_APPEND_DIO)
+					 | OCFS2_FEATURE_INCOMPAT_APPEND_DIO \
+					 | OCFS2_FEATURE_INCOMPAT_MMP)
 #define OCFS2_FEATURE_RO_COMPAT_SUPP	(OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
 					 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
 					 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
@@ -167,6 +168,11 @@
  */
 #define OCFS2_FEATURE_INCOMPAT_APPEND_DIO	0x8000
 
+/*
+ * Multiple mount protection
+ */
+#define OCFS2_FEATURE_INCOMPAT_MMP	0x10000
+
 /*
  * backup superblock flag is used to indicate that this volume
  * has backup superblocks.
@@ -535,8 +541,7 @@ struct ocfs2_slot_map {
 };
 
 struct ocfs2_extended_slot {
-/*00*/	__u8	es_valid;
-	__u8	es_reserved1[3];
+/*00*/	__le32	es_valid;
 	__le32	es_node_num;
 /*08*/
 };
@@ -611,7 +616,7 @@ struct ocfs2_super_block {
 						     INCOMPAT flag set. */
 /*B8*/	__le16 s_xattr_inline_size;	/* extended attribute inline size
 					   for this fs*/
-	__le16 s_reserved0;
+	__le16 s_mmp_update_interval; /* # seconds to wait in MMP checking */
 	__le32 s_dx_seed[3];		/* seed[0-2] for dx dir hash.
 					 * s_uuid_hash serves as seed[3]. */
 /*C0*/  __le64 s_reserved2[15];		/* Fill out superblock */
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index 0b0ae3ebb0cf..86a21140ead6 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -8,6 +8,8 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/highmem.h>
+#include <linux/random.h>
+#include <linux/kthread.h>
 
 #include <cluster/masklog.h>
 
@@ -24,9 +26,48 @@
 
 #include "buffer_head_io.h"
 
+/*
+ * This structure will be used for multiple mount protection. It will be
+ * written into the '//slot_map' file in the system dir.
+ * Programs that check MMP should assume that if SEQ_FSCK (or any unknown
+ * code above SEQ_MAX) is present then it is NOT safe to use the filesystem.
+ */
+#define OCFS2_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */
+#define OCFS2_MMP_SEQ_FSCK  0xE24D4D50U /* mmp_seq value when being fscked */
+#define OCFS2_MMP_SEQ_MAX   0xE24D4D4FU /* maximum valid mmp_seq value */
+#define OCFS2_MMP_SEQ_INIT  0x0         /* mmp_seq init value */
+#define OCFS2_VALID_CLUSTER   0xE24D4D55U /* value for clustered mount
+											   under MMP disabled */
+#define OCFS2_VALID_NOCLUSTER 0xE24D4D5AU /* value for noclustered mount
+											   under MMP disabled */
+
+#define OCFS2_SLOT_INFO_OLD_VALID   1 /* use for old slot info */
+
+/*
+ * Check interval multiplier
+ * The MMP block is written every update interval and initially checked every
+ * update interval x the multiplier (the value is then adapted based on the
+ * write latency). The reason is that writes can be delayed under load and we
+ * don't want readers to incorrectly assume that the filesystem is no longer
+ * in use.
+ */
+#define OCFS2_MMP_CHECK_MULT		2UL
+
+/*
+ * Minimum interval for MMP checking in seconds.
+ */
+#define OCFS2_MMP_MIN_CHECK_INTERVAL	5UL
+
+/*
+ * Maximum interval for MMP checking in seconds.
+ */
+#define OCFS2_MMP_MAX_CHECK_INTERVAL	300UL
 
 struct ocfs2_slot {
-	int sl_valid;
+	union {
+		unsigned int sl_valid;
+		unsigned int mmp_seq;
+	};
 	unsigned int sl_node_num;
 };
 
@@ -52,11 +93,11 @@ static void ocfs2_invalidate_slot(struct ocfs2_slot_info *si,
 }
 
 static void ocfs2_set_slot(struct ocfs2_slot_info *si,
-			   int slot_num, unsigned int node_num)
+			   int slot_num, unsigned int node_num, unsigned int valid)
 {
 	BUG_ON((slot_num < 0) || (slot_num >= si->si_num_slots));
 
-	si->si_slots[slot_num].sl_valid = 1;
+	si->si_slots[slot_num].sl_valid = valid;
 	si->si_slots[slot_num].sl_node_num = node_num;
 }
 
@@ -75,7 +116,8 @@ static void ocfs2_update_slot_info_extended(struct ocfs2_slot_info *si)
 		     i++, slotno++) {
 			if (se->se_slots[i].es_valid)
 				ocfs2_set_slot(si, slotno,
-					       le32_to_cpu(se->se_slots[i].es_node_num));
+					       le32_to_cpu(se->se_slots[i].es_node_num),
+					       le32_to_cpu(se->se_slots[i].es_valid));
 			else
 				ocfs2_invalidate_slot(si, slotno);
 		}
@@ -97,7 +139,8 @@ static void ocfs2_update_slot_info_old(struct ocfs2_slot_info *si)
 		if (le16_to_cpu(sm->sm_slots[i]) == (u16)OCFS2_INVALID_SLOT)
 			ocfs2_invalidate_slot(si, i);
 		else
-			ocfs2_set_slot(si, i, le16_to_cpu(sm->sm_slots[i]));
+			ocfs2_set_slot(si, i, le16_to_cpu(sm->sm_slots[i]),
+						OCFS2_SLOT_INFO_OLD_VALID);
 	}
 }
 
@@ -252,16 +295,14 @@ static int __ocfs2_find_empty_slot(struct ocfs2_slot_info *si,
 	int i, ret = -ENOSPC;
 
 	if ((preferred >= 0) && (preferred < si->si_num_slots)) {
-		if (!si->si_slots[preferred].sl_valid ||
-		    !si->si_slots[preferred].sl_node_num) {
+		if (!si->si_slots[preferred].sl_valid) {
 			ret = preferred;
 			goto out;
 		}
 	}
 
 	for(i = 0; i < si->si_num_slots; i++) {
-		if (!si->si_slots[i].sl_valid ||
-		    !si->si_slots[i].sl_node_num) {
+		if (!si->si_slots[i].sl_valid) {
 			ret = i;
 			break;
 		}
@@ -270,6 +311,43 @@ static int __ocfs2_find_empty_slot(struct ocfs2_slot_info *si,
 	return ret;
 }
 
+/* Return the index of the first used slot.
+ * -ENOENT means no used slot was found; a clean slot has ->sl_valid
+ * equal to OCFS2_MMP_SEQ_CLEAN or zero. */
+static int __ocfs2_find_used_slot(struct ocfs2_slot_info *si)
+{
+	int i, ret = -ENOENT, valid;
+
+	for (i = 0; i < si->si_num_slots; i++) {
+		valid = si->si_slots[i].sl_valid;
+		if (valid == 0 || valid == OCFS2_MMP_SEQ_CLEAN)
+			continue;
+		if (valid <= OCFS2_MMP_SEQ_MAX ||
+			valid == OCFS2_MMP_SEQ_FSCK ||
+			valid == OCFS2_VALID_CLUSTER ||
+			valid == OCFS2_VALID_NOCLUSTER) {
+			ret = i;
+			break;
+		}
+	}
+
+	return ret;
+}
+
+static int __ocfs2_find_expected_slot(struct ocfs2_slot_info *si,
+								unsigned int expected)
+{
+	int i;
+
+	for (i = 0; i < si->si_num_slots; i++) {
+		if (si->si_slots[i].sl_valid == expected) {
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
 int ocfs2_node_num_to_slot(struct ocfs2_super *osb, unsigned int node_num)
 {
 	int slot;
@@ -445,21 +523,357 @@ void ocfs2_free_slot_info(struct ocfs2_super *osb)
 	__ocfs2_free_slot_info(si);
 }
 
+/*
+ * Get a random new sequence number but make sure it is not greater than
+ * OCFS2_MMP_SEQ_MAX.
+ */
+static unsigned int mmp_new_seq(void)
+{
+	u32 new_seq;
+
+	do {
+		new_seq = prandom_u32();
+	} while (new_seq > OCFS2_MMP_SEQ_MAX);
+
+	if (new_seq == 0)
+		return 1;
+	else
+		return new_seq;
+}
+
+/*
+ * kmmpd will update the MMP sequence every mmp_update_interval seconds
+ */
+static int kmmpd(void *data)
+{
+	struct ocfs2_super *osb = data;
+	struct super_block *sb = osb->sb;
+	struct ocfs2_slot_info *si = osb->slot_info;
+	int slot = osb->slot_num;
+	u32 seq, mmp_seq;
+	unsigned long failed_writes = 0;
+	u16 mmp_update_interval = osb->mmp_update_interval;
+	unsigned int mmp_check_interval;
+	unsigned long last_update_time;
+	unsigned long diff;
+	int retval = 0;
+
+	if (!ocfs2_mount_local(osb)) {
+		mlog(ML_ERROR, "kmmpd thread only works for local mount mode.\n");
+		goto wait_to_exit;
+	}
+
+	retval = ocfs2_refresh_slot_info(osb);
+	seq = si->si_slots[slot].mmp_seq;
+
+	/*
+	 * Start with the higher mmp_check_interval and reduce it if
+	 * the MMP block is being updated on time.
+	 */
+	mmp_check_interval = max(OCFS2_MMP_CHECK_MULT * mmp_update_interval,
+				 OCFS2_MMP_MIN_CHECK_INTERVAL);
+
+	while (!kthread_should_stop() && !sb_rdonly(sb)) {
+		if (!OCFS2_HAS_INCOMPAT_FEATURE(sb, OCFS2_FEATURE_INCOMPAT_MMP)) {
+			mlog(ML_WARNING, "kmmpd being stopped since MMP feature"
+				     " has been disabled.");
+			goto wait_to_exit;
+		}
+		if (++seq > OCFS2_MMP_SEQ_MAX)
+			seq = 1;
+
+		spin_lock(&osb->osb_lock);
+		si->si_slots[slot].mmp_seq = mmp_seq = seq;
+		spin_unlock(&osb->osb_lock);
+
+		last_update_time = jiffies;
+		retval = ocfs2_update_disk_slot(osb, si, slot);
+
+		/*
+		 * Don't spew too many error messages. Print one every
+		 * (s_mmp_update_interval * 60) seconds.
+		 */
+		if (retval) {
+			if ((failed_writes % 60) == 0) {
+				ocfs2_error(sb, "Error writing to MMP block");
+			}
+			failed_writes++;
+		}
+
+		diff = jiffies - last_update_time;
+		if (diff < mmp_update_interval * HZ)
+			schedule_timeout_interruptible(mmp_update_interval *
+						       HZ - diff);
+
+		/*
+		 * We need to make sure that more than mmp_check_interval
+		 * seconds have not passed since writing. If that has happened
+		 * we need to check if the MMP block is as we left it.
+		 */
+		diff = jiffies - last_update_time;
+		if (diff > mmp_check_interval * HZ) {
+			retval = ocfs2_refresh_slot_info(osb);
+			if (retval) {
+				ocfs2_error(sb, "error reading MMP data: %d", retval);
+				goto wait_to_exit;
+			}
+
+			if (si->si_slots[slot].mmp_seq != mmp_seq) {
+				ocfs2_error(sb, "Error while updating MMP info. "
+					     "The filesystem seems to have been"
+					     " multiply mounted.");
+				retval = -EBUSY;
+				goto wait_to_exit;
+			}
+		}
+
+		 /*
+		 * Adjust the mmp_check_interval depending on how much time
+		 * it took for the MMP block to be written.
+		 */
+		mmp_check_interval = max(min(OCFS2_MMP_CHECK_MULT * diff / HZ,
+					     OCFS2_MMP_MAX_CHECK_INTERVAL),
+					     OCFS2_MMP_MIN_CHECK_INTERVAL);
+	}
+
+	/*
+	 * Unmount seems to be clean.
+	 */
+	spin_lock(&osb->osb_lock);
+	si->si_slots[slot].mmp_seq = OCFS2_MMP_SEQ_CLEAN;
+	spin_unlock(&osb->osb_lock);
+
+	retval = ocfs2_update_disk_slot(osb, si, 0);
+
+wait_to_exit:
+	while (!kthread_should_stop()) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (!kthread_should_stop())
+			schedule();
+	}
+	set_current_state(TASK_RUNNING);
+	return retval;
+}
+
+void ocfs2_stop_mmpd(struct ocfs2_super *osb)
+{
+	if (osb->mmp_task) {
+		kthread_stop(osb->mmp_task);
+		osb->mmp_task = NULL;
+	}
+}
+
+/*
+ * Protect the filesystem from being mounted more than once.
+ *
+ * This function was inspired by the ext4 MMP feature. Because the HA
+ * stack helps ocfs2 manage node join/leave, we only focus on MMP
+ * under nocluster mode.
+ * Note also that ocfs2 only uses slot 0 in nocluster mode.
+ *
+ * es_valid:
+ *  0: not available
+ *  OCFS2_VALID_CLUSTER: valid, cluster mode
+ *  OCFS2_VALID_NOCLUSTER or an MMP sequence value: valid, nocluster mode
+ *
+ * parameters:
+ *  osb: the struct ocfs2_super
+ *  noclustered: non-zero when mounting in noclustered mode
+ *  (the slot to protect is taken from osb->slot_num, not passed in)
+ */
+int ocfs2_multi_mount_protect(struct ocfs2_super *osb, int noclustered)
+{
+	struct buffer_head *bh = NULL;
+	u32 seq;
+	struct ocfs2_slot_info *si = osb->slot_info;
+	unsigned int mmp_check_interval = osb->mmp_update_interval;
+	unsigned int wait_time = 0;
+	int retval = 0;
+	int slot = osb->slot_num;
+
+	if (!ocfs2_uses_extended_slot_map(osb)) {
+		mlog(ML_WARNING, "MMP only works on extended slot map.\n");
+		retval = -EINVAL;
+		goto bail;
+	}
+
+	retval = ocfs2_refresh_slot_info(osb);
+	if (retval)
+		goto bail;
+
+	if (mmp_check_interval < OCFS2_MMP_MIN_CHECK_INTERVAL)
+		mmp_check_interval = OCFS2_MMP_MIN_CHECK_INTERVAL;
+
+	spin_lock(&osb->osb_lock);
+	seq = si->si_slots[slot].mmp_seq;
+
+	if (__ocfs2_find_used_slot(si) == -ENOENT)
+		goto skip;
+
+	/* TODO ocfs2-tools need to support this flag */
+	if (__ocfs2_find_expected_slot(si, OCFS2_MMP_SEQ_FSCK)) {
+		mlog(ML_NOTICE, "fsck is running on the filesystem");
+		spin_unlock(&osb->osb_lock);
+		retval = -EBUSY;
+		goto bail;
+	}
+	spin_unlock(&osb->osb_lock);
+
+	wait_time = min(mmp_check_interval * 2 + 1, mmp_check_interval + 60);
+
+	/* Print MMP interval if more than 20 secs. */
+	if (wait_time > OCFS2_MMP_MIN_CHECK_INTERVAL * 4)
+		mlog(ML_WARNING, "MMP interval %u higher than expected, please"
+			     " wait.\n", wait_time * 2);
+
+	if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
+		mlog(ML_WARNING, "MMP startup interrupted, failing mount.\n");
+		retval = -EPERM;
+		goto bail;
+	}
+
+	retval = ocfs2_refresh_slot_info(osb);
+	if (retval)
+		goto bail;
+	if (seq != si->si_slots[slot].mmp_seq) {
+		mlog(ML_ERROR, "Device is already active on another node.\n");
+		retval = -EPERM;
+		goto bail;
+	}
+
+	spin_lock(&osb->osb_lock);
+skip:
+	/*
+	 * write a new random sequence number.
+	 */
+	seq = mmp_new_seq();
+	mlog(ML_ERROR, "seq: 0x%x mmp_seq: 0x%x\n", seq, si->si_slots[slot].mmp_seq);
+	ocfs2_set_slot(si, slot, osb->node_num, seq);
+	spin_unlock(&osb->osb_lock);
+
+	ocfs2_update_disk_slot_extended(si, slot, &bh);
+	mlog(ML_ERROR, "seq: 0x%x mmp_seq: 0x%x\n", seq, si->si_slots[slot].mmp_seq);
+	retval = ocfs2_write_block(osb, bh, INODE_CACHE(si->si_inode));
+	if (retval < 0) {
+		mlog_errno(retval);
+		goto bail;
+	}
+	mlog(ML_ERROR, "seq: 0x%x mmp_seq: 0x%x wait_time: %u\n", seq, si->si_slots[slot].mmp_seq, wait_time);
+
+	/*
+	 * wait for MMP interval and check mmp_seq.
+	 */
+	if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
+		mlog(ML_WARNING, "MMP startup interrupted, failing mount.\n");
+		retval = -EPERM;
+		goto bail;
+	}
+
+	retval = ocfs2_refresh_slot_info(osb);
+	if (retval)
+		goto bail;
+
+	mlog(ML_ERROR, "seq: 0x%x mmp_seq: 0x%x\n", seq, si->si_slots[slot].mmp_seq);
+	if (seq != si->si_slots[slot].mmp_seq) {
+		mlog(ML_ERROR, "Update seq failed, device is already active on another node.\n");
+		retval = -EPERM;
+		goto bail;
+	}
+
+	/*
+	 * There are two reasons we don't create kmmpd on clustered mount:
+	 * - ocfs2 needs to grab osb->osb_lock to modify/access osb->si.
+	 * - In a cluster with a huge number of nodes, all nodes updating
+	 *   the same sector of '//slot_map' would cause IO performance issues.
+	 *
+	 * This raises another question:
+	 * On clustered mount the MMP seq is not updated, so how does MMP
+	 * handle a noclustered mount when a clustered mount already
+	 * exists?
+	 * The answer is the rule mentioned in ocfs2_find_slot().
+	 */
+	if (!noclustered) {
+		spin_lock(&osb->osb_lock);
+		ocfs2_set_slot(si, slot, osb->node_num, OCFS2_VALID_CLUSTER);
+		spin_unlock(&osb->osb_lock);
+
+		ocfs2_update_disk_slot_extended(si, slot, &bh);
+		retval = ocfs2_write_block(osb, bh, INODE_CACHE(si->si_inode));
+		goto bail;
+	}
+
+	/*
+	 * Start a kernel thread to update the MMP block periodically.
+	 */
+	osb->mmp_task = kthread_run(kmmpd, osb, "kmmpd-%s", osb->sb->s_id);
+	if (IS_ERR(osb->mmp_task)) {
+		osb->mmp_task = NULL;
+		mlog(ML_WARNING, "Unable to create kmmpd thread for %s.",
+			     osb->sb->s_id);
+		retval = -EPERM;
+		goto bail;
+	}
+
+bail:
+	return retval;
+}
+
+static void show_conflict_mnt_msg(int clustered)
+{
+	const char *exist = clustered ? "non-clustered" : "clustered";
+
+	mlog(ML_ERROR, "Found %s mount info!", exist);
+	mlog(ML_ERROR, "Please clean %s slotmap info for mounting.\n", exist);
+	mlog(ML_ERROR, "eg. remount then unmount with %s mode\n", exist);
+}
+
+/*
+ * Even in readonly mode, we write slot info to disk.
+ * This is intentional: if the slot info were not updated on a
+ * readonly mount, then in a cluster env a later mount from another
+ * node could reuse the same slot and cause a deadlock!
+ */
 int ocfs2_find_slot(struct ocfs2_super *osb)
 {
-	int status;
+	int status = -EPERM;
 	int slot;
+	int noclustered = 0;
 	struct ocfs2_slot_info *si;
 
 	si = osb->slot_info;
 
 	spin_lock(&osb->osb_lock);
 	ocfs2_update_slot_info(si);
+	slot = __ocfs2_find_used_slot(si);
+	if (slot == 0 &&
+		((si->si_slots[0].sl_valid == OCFS2_VALID_NOCLUSTER) ||
+		 (si->si_slots[0].sl_valid < OCFS2_MMP_SEQ_MAX)))
+		noclustered = 1;
 
-	if (ocfs2_mount_local(osb))
-		/* use slot 0 directly in local mode */
-		slot = 0;
-	else {
+	/*
+	 * We set a rule:
+	 * If the last mount was not unmounted (e.g. after a crash), the
+	 * next mount MUST be of the same mount type.
+	 */
+	if (ocfs2_mount_local(osb)) {
+		/* empty slotmap, or device didn't unmount from last time */
+		if ((slot == -ENOENT) || noclustered) {
+			/* use slot 0 directly in local mode */
+			slot = 0;
+			noclustered = 1;
+		} else {
+			spin_unlock(&osb->osb_lock);
+			show_conflict_mnt_msg(0);
+			status = -EINVAL;
+			goto bail;
+		}
+	} else {
+		if (noclustered) {
+			spin_unlock(&osb->osb_lock);
+			show_conflict_mnt_msg(1);
+			status = -EINVAL;
+			goto bail;
+		}
 		/* search for ourselves first and take the slot if it already
 		 * exists. Perhaps we need to mark this in a variable for our
 		 * own journal recovery? Possibly not, though we certainly
@@ -481,7 +895,21 @@ int ocfs2_find_slot(struct ocfs2_super *osb)
 			       slot, osb->dev_str);
 	}
 
-	ocfs2_set_slot(si, slot, osb->node_num);
+	if (OCFS2_HAS_INCOMPAT_FEATURE(osb->sb, OCFS2_FEATURE_INCOMPAT_MMP)) {
+		osb->slot_num = slot;
+		spin_unlock(&osb->osb_lock);
+		status = ocfs2_multi_mount_protect(osb, noclustered);
+		if (status < 0) {
+			mlog(ML_ERROR, "MMP failed to start.\n");
+			goto mmp_fail;
+		}
+
+		trace_ocfs2_find_slot(osb->slot_num);
+		return status;
+	}
+
+	ocfs2_set_slot(si, slot, osb->node_num, noclustered ?
+			OCFS2_VALID_NOCLUSTER : OCFS2_VALID_CLUSTER);
 	osb->slot_num = slot;
 	spin_unlock(&osb->osb_lock);
 
@@ -490,6 +918,7 @@ int ocfs2_find_slot(struct ocfs2_super *osb)
 	status = ocfs2_update_disk_slot(osb, si, osb->slot_num);
 	if (status < 0) {
 		mlog_errno(status);
+mmp_fail:
 		/*
 		 * if write block failed, invalidate slot to avoid overwrite
 		 * slot during dismount in case another node rightly has mounted
diff --git a/fs/ocfs2/slot_map.h b/fs/ocfs2/slot_map.h
index a43644570b53..d4d147b0c190 100644
--- a/fs/ocfs2/slot_map.h
+++ b/fs/ocfs2/slot_map.h
@@ -25,4 +25,7 @@ int ocfs2_slot_to_node_num_locked(struct ocfs2_super *osb, int slot_num,
 
 int ocfs2_clear_slot(struct ocfs2_super *osb, int slot_num);
 
+int ocfs2_multi_mount_protect(struct ocfs2_super *osb, int noclustered);
+void ocfs2_stop_mmpd(struct ocfs2_super *osb);
+
 #endif
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index f7298816d8d9..b0e76b06efc3 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -609,6 +609,7 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
 	struct mount_options parsed_options;
 	struct ocfs2_super *osb = OCFS2_SB(sb);
 	u32 tmp;
+	int noclustered;
 
 	sync_filesystem(sb);
 
@@ -619,7 +620,8 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
 	}
 
 	tmp = OCFS2_MOUNT_NOCLUSTER;
-	if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) {
+	noclustered = osb->s_mount_opt & tmp;
+	if (noclustered != (parsed_options.mount_opt & tmp)) {
 		ret = -EINVAL;
 		mlog(ML_ERROR, "Cannot change nocluster option on remount\n");
 		goto out;
@@ -686,10 +688,20 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
 			}
 			sb->s_flags &= ~SB_RDONLY;
 			osb->osb_flags &= ~OCFS2_OSB_SOFT_RO;
+			if (OCFS2_HAS_INCOMPAT_FEATURE(sb, OCFS2_FEATURE_INCOMPAT_MMP)) {
+				spin_unlock(&osb->osb_lock);
+				if (ocfs2_multi_mount_protect(osb, noclustered)) {
+					mlog(ML_ERROR, "started MMP failed.\n");
+					ocfs2_stop_mmpd(osb);
+					ret = -EROFS;
+					goto unlocked_osb;
+				}
+			}
 		}
 		trace_ocfs2_remount(sb->s_flags, osb->osb_flags, *flags);
 unlock_osb:
 		spin_unlock(&osb->osb_lock);
+unlocked_osb:
 		/* Enable quota accounting after remounting RW */
 		if (!ret && !(*flags & SB_RDONLY)) {
 			if (sb_any_quota_suspended(sb))
@@ -722,6 +734,8 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
 		sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
 			((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ?
 							SB_POSIXACL : 0);
+		if (sb_rdonly(osb->sb))
+			ocfs2_stop_mmpd(osb);
 	}
 out:
 	return ret;
@@ -1833,7 +1847,7 @@ static int ocfs2_mount_volume(struct super_block *sb)
 	status = ocfs2_init_local_system_inodes(osb);
 	if (status < 0) {
 		mlog_errno(status);
-		goto out_super_lock;
+		goto out_find_slot;
 	}
 
 	status = ocfs2_check_volume(osb);
@@ -1858,6 +1872,8 @@ static int ocfs2_mount_volume(struct super_block *sb)
 	/* before journal shutdown, we should release slot_info */
 	ocfs2_free_slot_info(osb);
 	ocfs2_journal_shutdown(osb);
+out_find_slot:
+	ocfs2_stop_mmpd(osb);
 out_super_lock:
 	ocfs2_super_unlock(osb, 1);
 out_dlm:
@@ -1878,6 +1894,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
 	osb = OCFS2_SB(sb);
 	BUG_ON(!osb);
 
+	ocfs2_stop_mmpd(osb);
+
 	/* Remove file check sysfs related directores/files,
 	 * and wait for the pending file check operations */
 	ocfs2_filecheck_remove_sysfs(osb);
@@ -2086,6 +2104,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u",
 		 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
 
+	osb->mmp_update_interval = le16_to_cpu(di->id2.i_super.s_mmp_update_interval);
 	osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots);
 	if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) {
 		mlog(ML_ERROR, "Invalid number of node slots (%u)\n",
-- 
2.37.1


_______________________________________________
Ocfs2-devel mailing list
Ocfs2-devel@oss.oracle.com
https://oss.oracle.com/mailman/listinfo/ocfs2-devel

  parent reply	other threads:[~2022-07-30  1:16 UTC|newest]

Thread overview: 23+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-07-30  1:14 [Ocfs2-devel] [PATCH 0/4] re-enable non-clustered mount & add MMP support Heming Zhao via Ocfs2-devel
2022-07-30  1:14 ` [Ocfs2-devel] [PATCH 1/4] ocfs2: Fix freeing uninitialized resource on ocfs2_dlm_shutdown Heming Zhao via Ocfs2-devel
2022-08-08  6:51   ` Joseph Qi via Ocfs2-devel
2022-08-08 12:09     ` Heming Zhao via Ocfs2-devel
2022-08-10  1:31       ` Joseph Qi via Ocfs2-devel
2022-08-10 23:52         ` heming.zhao--- via Ocfs2-devel
2022-07-30  1:14 ` [Ocfs2-devel] [PATCH 2/4] ocfs2: add mlog ML_WARNING support Heming Zhao via Ocfs2-devel
2022-07-30  1:14 ` [Ocfs2-devel] [PATCH 3/4] re-enable "ocfs2: mount shared volume without ha stack" Heming Zhao via Ocfs2-devel
2022-07-31 17:42   ` Mark Fasheh via Ocfs2-devel
2022-08-01  1:01     ` heming.zhao--- via Ocfs2-devel
2022-08-01  2:25       ` heming.zhao--- via Ocfs2-devel
2022-08-04 23:53       ` Mark Fasheh via Ocfs2-devel
2022-08-05  4:11         ` Mark Fasheh via Ocfs2-devel
2022-08-06 15:53           ` heming.zhao--- via Ocfs2-devel
2022-08-06 16:20           ` Heming Zhao via Ocfs2-devel
2022-08-06 15:44         ` heming.zhao--- via Ocfs2-devel
2022-08-06 16:15         ` Heming Zhao via Ocfs2-devel
2022-07-30  1:14 ` Heming Zhao via Ocfs2-devel [this message]
2022-07-31  9:13   ` [Ocfs2-devel] [PATCH 4/4] ocfs2: introduce ext4 MMP feature heming.zhao--- via Ocfs2-devel
2022-08-08  8:19   ` Joseph Qi via Ocfs2-devel
2022-08-08  9:07     ` Heming Zhao via Ocfs2-devel
2022-08-08  9:26       ` Heming Zhao via Ocfs2-devel
2022-08-08  9:29       ` Joseph Qi via Ocfs2-devel

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20220730011411.11214-5-heming.zhao@suse.com \
    --to=ocfs2-devel@oss.oracle.com \
    --cc=heming.zhao@suse.com \
    --cc=joseph.qi@linux.alibaba.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.