All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/6] md-cluster: Protect communication with mutexes
@ 2015-11-06  3:50 rgoldwyn
  2015-11-06  3:50 ` [PATCH 2/6] md-cluster: Avoid the resync ping-pong rgoldwyn
                   ` (5 more replies)
  0 siblings, 6 replies; 17+ messages in thread
From: rgoldwyn @ 2015-11-06  3:50 UTC (permalink / raw)
  To: neilb, linux-raid; +Cc: Goldwyn Rodrigues

From: Goldwyn Rodrigues <rgoldwyn@suse.com>

Communication can happen through multiple threads. It is possible that
one thread steps over another threads sequence. So, we use mutexes to
protect both the send and receive sequences.

We use a new flag CLUSTER_MD_COMM_LOCKED which is used to detect
if communication is already locked. This is useful for cases where we need to
take and hold the token DLM lock for a while. This bit is set only
after locking communication.

Also, Remove stray/ununsed sb_mutex.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
---
 drivers/md/md-cluster.c | 26 +++++++++++++++++++++-----
 1 file changed, 21 insertions(+), 5 deletions(-)

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index b73729b..a93734e 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -47,6 +47,7 @@ struct resync_info {
 #define		MD_CLUSTER_WAITING_FOR_NEWDISK		1
 #define		MD_CLUSTER_SUSPEND_READ_BALANCING	2
 #define		MD_CLUSTER_BEGIN_JOIN_CLUSTER		3
+#define		MD_CLUSTER_COMM_LOCKED			4
 
 
 struct md_cluster_info {
@@ -54,7 +55,8 @@ struct md_cluster_info {
 	dlm_lockspace_t *lockspace;
 	int slot_number;
 	struct completion completion;
-	struct mutex sb_mutex;
+	struct mutex recv_mutex;
+	struct mutex send_mutex;
 	struct dlm_lock_resource *bitmap_lockres;
 	struct dlm_lock_resource *resync_lockres;
 	struct list_head suspend_list;
@@ -503,6 +505,7 @@ static void recv_daemon(struct md_thread *thread)
 	struct cluster_msg msg;
 	int ret;
 
+	mutex_lock(&cinfo->recv_mutex);
 	/*get CR on Message*/
 	if (dlm_lock_sync(message_lockres, DLM_LOCK_CR)) {
 		pr_err("md/raid1:failed to get CR on MESSAGE\n");
@@ -529,6 +532,7 @@ static void recv_daemon(struct md_thread *thread)
 	ret = dlm_unlock_sync(message_lockres);
 	if (unlikely(ret != 0))
 		pr_info("unlock msg failed return %d\n", ret);
+	mutex_unlock(&cinfo->recv_mutex);
 }
 
 /* lock_comm()
@@ -542,20 +546,22 @@ static int lock_comm(struct md_cluster_info *cinfo)
 {
 	int error;
 
-	if (cinfo->token_lockres->mode == DLM_LOCK_EX)
-		return 0;
+	mutex_lock(&cinfo->send_mutex);
 
 	error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX);
 	if (error)
 		pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n",
 				__func__, __LINE__, error);
+	mutex_lock(&cinfo->recv_mutex);
 	return error;
 }
 
 static void unlock_comm(struct md_cluster_info *cinfo)
 {
 	WARN_ON(cinfo->token_lockres->mode != DLM_LOCK_EX);
+	mutex_unlock(&cinfo->recv_mutex);
 	dlm_unlock_sync(cinfo->token_lockres);
+	mutex_unlock(&cinfo->send_mutex);
 }
 
 /* __sendmsg()
@@ -709,7 +715,8 @@ static int join(struct mddev *mddev, int nodes)
 	init_completion(&cinfo->completion);
 	set_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state);
 
-	mutex_init(&cinfo->sb_mutex);
+	mutex_init(&cinfo->send_mutex);
+	mutex_init(&cinfo->recv_mutex);
 	mddev->cluster_info = cinfo;
 
 	memset(str, 0, 64);
@@ -839,7 +846,12 @@ static int slot_number(struct mddev *mddev)
 
 static int metadata_update_start(struct mddev *mddev)
 {
-	return lock_comm(mddev->cluster_info);
+	struct md_cluster_info *cinfo = mddev->cluster_info;
+	int ret;
+	if (test_and_clear_bit(MD_CLUSTER_COMM_LOCKED, &cinfo->state))
+		return 0;
+	ret = lock_comm(cinfo);
+	return ret;
 }
 
 static int metadata_update_finish(struct mddev *mddev)
@@ -864,6 +876,7 @@ static int metadata_update_finish(struct mddev *mddev)
 		ret = __sendmsg(cinfo, &cmsg);
 	} else
 		pr_warn("md-cluster: No good device id found to send\n");
+	clear_bit(MD_CLUSTER_COMM_LOCKED, &cinfo->state);
 	unlock_comm(cinfo);
 	return ret;
 }
@@ -871,6 +884,7 @@ static int metadata_update_finish(struct mddev *mddev)
 static void metadata_update_cancel(struct mddev *mddev)
 {
 	struct md_cluster_info *cinfo = mddev->cluster_info;
+	clear_bit(MD_CLUSTER_COMM_LOCKED, &cinfo->state);
 	unlock_comm(cinfo);
 }
 
@@ -945,6 +959,7 @@ static int add_new_disk(struct mddev *mddev, struct md_rdev *rdev)
 	memcpy(cmsg.uuid, uuid, 16);
 	cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
 	lock_comm(cinfo);
+	set_bit(MD_CLUSTER_COMM_LOCKED, &cinfo->state);
 	ret = __sendmsg(cinfo, &cmsg);
 	if (ret)
 		return ret;
@@ -964,6 +979,7 @@ static int add_new_disk(struct mddev *mddev, struct md_rdev *rdev)
 static void add_new_disk_cancel(struct mddev *mddev)
 {
 	struct md_cluster_info *cinfo = mddev->cluster_info;
+	clear_bit(MD_CLUSTER_COMM_LOCKED, &cinfo->state);
 	unlock_comm(cinfo);
 }
 
-- 
1.8.5.6


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 2/6] md-cluster: Avoid the resync ping-pong
  2015-11-06  3:50 [PATCH 1/6] md-cluster: Protect communication with mutexes rgoldwyn
@ 2015-11-06  3:50 ` rgoldwyn
  2015-11-09 23:39   ` NeilBrown
  2015-11-06  3:50 ` [PATCH 3/6] md-cluster: remove a disk asynchronously from cluster environment rgoldwyn
                   ` (4 subsequent siblings)
  5 siblings, 1 reply; 17+ messages in thread
From: rgoldwyn @ 2015-11-06  3:50 UTC (permalink / raw)
  To: neilb, linux-raid; +Cc: Goldwyn Rodrigues

From: Goldwyn Rodrigues <rgoldwyn@suse.com>

If a RESYNCING message with (0,0) has been sent before, do not send it
again. This avoids a resync ping pong between the nodes. We read
the bitmap lockresource's LVB to figure out the previous value
of the RESYNCING message.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
---
 drivers/md/md-cluster.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index a93734e..3daa464 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -898,8 +898,16 @@ static int resync_start(struct mddev *mddev)
 static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi)
 {
 	struct md_cluster_info *cinfo = mddev->cluster_info;
+	struct resync_info ri;
 	struct cluster_msg cmsg = {0};
 
+	/* do not send zero again, if we have sent before */
+	if (hi == 0) {
+		memcpy(&ri, cinfo->bitmap_lockres->lksb.sb_lvbptr, sizeof(struct resync_info));
+		if (le64_to_cpu(ri.hi) == 0)
+			return 0;
+	}
+
 	add_resync_info(mddev, cinfo->bitmap_lockres, lo, hi);
 	/* Re-acquire the lock to refresh LVB */
 	dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW);
-- 
1.8.5.6


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 3/6] md-cluster: remove a disk asynchronously from cluster environment
  2015-11-06  3:50 [PATCH 1/6] md-cluster: Protect communication with mutexes rgoldwyn
  2015-11-06  3:50 ` [PATCH 2/6] md-cluster: Avoid the resync ping-pong rgoldwyn
@ 2015-11-06  3:50 ` rgoldwyn
  2015-11-09 23:43   ` NeilBrown
  2015-11-06  3:50 ` [PATCH 4/6] md-cluster: Defer MD reloading to mddev->thread rgoldwyn
                   ` (3 subsequent siblings)
  5 siblings, 1 reply; 17+ messages in thread
From: rgoldwyn @ 2015-11-06  3:50 UTC (permalink / raw)
  To: neilb, linux-raid; +Cc: Guoqing Jiang, Goldwyn Rodrigues

From: Guoqing Jiang <gqjiang@suse.com>

For cluster raid, if one disk couldn't be reach in one node, then
other nodes would receive the REMOVE message for the disk.

In receiving node, we can't call md_kick_rdev_from_array to remove
the disk from array synchronously since the disk might still be busy
in this node. So let's set a ClusterRemove flag on the disk, then
let the thread to do the removal job eventually.

Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
---
 drivers/md/md-cluster.c |  7 +++++--
 drivers/md/md.c         | 12 ++++++++++++
 drivers/md/md.h         |  1 +
 3 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 3daa464..a681706 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -443,8 +443,11 @@ static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg)
 	struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev,
 						   le32_to_cpu(msg->raid_slot));
 
-	if (rdev)
-		md_kick_rdev_from_array(rdev);
+	if (rdev) {
+		set_bit(ClusterRemove, &rdev->flags);
+		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+		md_wakeup_thread(mddev->thread);
+	}
 	else
 		pr_warn("%s: %d Could not find disk(%d) to REMOVE\n",
 			__func__, __LINE__, le32_to_cpu(msg->raid_slot));
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 44d0342..32ca592 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -8222,6 +8222,18 @@ void md_check_recovery(struct mddev *mddev)
 			goto unlock;
 		}
 
+		if (mddev_is_clustered(mddev)) {
+			struct md_rdev *rdev;
+			/* kick the device if another node issued a
+			 * remove disk.
+			 */
+			rdev_for_each(rdev, mddev) {
+				if (test_and_clear_bit(ClusterRemove, &rdev->flags) &&
+						rdev->raid_disk < 0)
+					md_kick_rdev_from_array(rdev);
+			}
+		}
+
 		if (!mddev->external) {
 			int did_change = 0;
 			spin_lock(&mddev->lock);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 2ea0035..db54341 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -172,6 +172,7 @@ enum flag_bits {
 				 * This device is seen locally but not
 				 * by the whole cluster
 				 */
+	ClusterRemove,
 };
 
 #define BB_LEN_MASK	(0x00000000000001FFULL)
-- 
1.8.5.6


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 4/6] md-cluster: Defer MD reloading to mddev->thread
  2015-11-06  3:50 [PATCH 1/6] md-cluster: Protect communication with mutexes rgoldwyn
  2015-11-06  3:50 ` [PATCH 2/6] md-cluster: Avoid the resync ping-pong rgoldwyn
  2015-11-06  3:50 ` [PATCH 3/6] md-cluster: remove a disk asynchronously from cluster environment rgoldwyn
@ 2015-11-06  3:50 ` rgoldwyn
  2015-11-09 23:48   ` NeilBrown
  2015-11-06  3:50 ` [PATCH 5/6] md-cluster: Fix the remove sequence with the new MD reload code rgoldwyn
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 17+ messages in thread
From: rgoldwyn @ 2015-11-06  3:50 UTC (permalink / raw)
  To: neilb, linux-raid; +Cc: Goldwyn Rodrigues

From: Goldwyn Rodrigues <rgoldwyn@suse.com>

Reloading of superblock must be performed under reconfig_mutex. However,
this cannot be done with md_reload_sb because it would deadlock with
the message DLM lock. So, we defer it in md_check_recovery() which is
executed by mddev->thread.

This introduces a new flag, MD_RELOAD_SB, which if set, will reload the
superblock.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
---
 drivers/md/md-cluster.c | 12 +++++++++++-
 drivers/md/md-cluster.h |  1 +
 drivers/md/md.c         |  3 +++
 drivers/md/md.h         |  3 +++
 4 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index a681706..9a36ad6 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -71,6 +71,7 @@ struct md_cluster_info {
 	struct md_thread *recv_thread;
 	struct completion newdisk_completion;
 	unsigned long state;
+	atomic_t good_device_nr;
 };
 
 enum msg_type {
@@ -434,8 +435,10 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
 static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg)
 {
 	struct md_cluster_info *cinfo = mddev->cluster_info;
-	md_reload_sb(mddev, le32_to_cpu(msg->raid_slot));
+	atomic_set(&cinfo->good_device_nr, le32_to_cpu(msg->raid_slot));
+	set_bit(MD_RELOAD_SB, &mddev->flags);
 	dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
+	md_wakeup_thread(mddev->thread);
 }
 
 static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg)
@@ -1047,6 +1050,12 @@ out:
 	return err;
 }
 
+static int good_device_nr(struct mddev *mddev)
+{
+	struct md_cluster_info *cinfo = mddev->cluster_info;
+	return atomic_read(&cinfo->good_device_nr);
+}
+
 static struct md_cluster_operations cluster_ops = {
 	.join   = join,
 	.leave  = leave,
@@ -1063,6 +1072,7 @@ static struct md_cluster_operations cluster_ops = {
 	.new_disk_ack = new_disk_ack,
 	.remove_disk = remove_disk,
 	.gather_bitmaps = gather_bitmaps,
+	.good_device_nr = good_device_nr,
 };
 
 static int __init cluster_init(void)
diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h
index e75ea26..c699c6c 100644
--- a/drivers/md/md-cluster.h
+++ b/drivers/md/md-cluster.h
@@ -24,6 +24,7 @@ struct md_cluster_operations {
 	int (*new_disk_ack)(struct mddev *mddev, bool ack);
 	int (*remove_disk)(struct mddev *mddev, struct md_rdev *rdev);
 	int (*gather_bitmaps)(struct md_rdev *rdev);
+	int (*good_device_nr)(struct mddev *mddev);
 };
 
 #endif /* _MD_CLUSTER_H */
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 32ca592..65b6326 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -8184,6 +8184,7 @@ void md_check_recovery(struct mddev *mddev)
 		(mddev->flags & MD_UPDATE_SB_FLAGS & ~ (1<<MD_CHANGE_PENDING)) ||
 		test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
 		test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
+		test_bit(MD_RELOAD_SB, &mddev->flags) ||
 		(mddev->external == 0 && mddev->safemode == 1) ||
 		(mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
 		 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
@@ -8232,6 +8233,8 @@ void md_check_recovery(struct mddev *mddev)
 						rdev->raid_disk < 0)
 					md_kick_rdev_from_array(rdev);
 			}
+			if (test_and_clear_bit(MD_RELOAD_SB, &mddev->flags))
+				md_reload_sb(mddev, md_cluster_ops->good_device_nr(mddev));
 		}
 
 		if (!mddev->external) {
diff --git a/drivers/md/md.h b/drivers/md/md.h
index db54341..f89866d 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -222,6 +222,9 @@ struct mddev {
 #define MD_STILL_CLOSED	4	/* If set, then array has not been opened since
 				 * md_ioctl checked on it.
 				 */
+#define MD_RELOAD_SB	5	/* Reload the superblock because another node
+				 * updated it.
+				 */
 
 	int				suspended;
 	atomic_t			active_io;
-- 
1.8.5.6


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 5/6] md-cluster: Fix the remove sequence with the new MD reload code
  2015-11-06  3:50 [PATCH 1/6] md-cluster: Protect communication with mutexes rgoldwyn
                   ` (2 preceding siblings ...)
  2015-11-06  3:50 ` [PATCH 4/6] md-cluster: Defer MD reloading to mddev->thread rgoldwyn
@ 2015-11-06  3:50 ` rgoldwyn
  2015-11-09 23:49   ` NeilBrown
  2015-11-06  3:50 ` [PATCH 6/6] md-cluster: Allow spare devices to be marked as faulty rgoldwyn
  2015-11-09 23:31 ` [PATCH 1/6] md-cluster: Protect communication with mutexes NeilBrown
  5 siblings, 1 reply; 17+ messages in thread
From: rgoldwyn @ 2015-11-06  3:50 UTC (permalink / raw)
  To: neilb, linux-raid; +Cc: Goldwyn Rodrigues, Guoqing Jiang

From: Goldwyn Rodrigues <rgoldwyn@suse.com>

The remove disk message does not need metadata_update_start(), but
can be an independent message.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
---
 drivers/md/md-cluster.c | 2 +-
 drivers/md/md.c         | 9 +--------
 2 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 9a36ad6..33f5d7a 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -1018,7 +1018,7 @@ static int remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 	struct md_cluster_info *cinfo = mddev->cluster_info;
 	cmsg.type = cpu_to_le32(REMOVE);
 	cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
-	return __sendmsg(cinfo, &cmsg);
+	return sendmsg(cinfo, &cmsg);
 }
 
 static int gather_bitmaps(struct md_rdev *rdev)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 65b6326..4222285 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -6045,15 +6045,11 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev)
 {
 	char b[BDEVNAME_SIZE];
 	struct md_rdev *rdev;
-	int ret = -1;
 
 	rdev = find_rdev(mddev, dev);
 	if (!rdev)
 		return -ENXIO;
 
-	if (mddev_is_clustered(mddev))
-		ret = md_cluster_ops->metadata_update_start(mddev);
-
 	if (rdev->raid_disk < 0)
 		goto kick_rdev;
 
@@ -6064,7 +6060,7 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev)
 		goto busy;
 
 kick_rdev:
-	if (mddev_is_clustered(mddev) && ret == 0)
+	if (mddev_is_clustered(mddev))
 		md_cluster_ops->remove_disk(mddev, rdev);
 
 	md_kick_rdev_from_array(rdev);
@@ -6073,9 +6069,6 @@ kick_rdev:
 
 	return 0;
 busy:
-	if (mddev_is_clustered(mddev) && ret == 0)
-		md_cluster_ops->metadata_update_cancel(mddev);
-
 	printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n",
 		bdevname(rdev->bdev,b), mdname(mddev));
 	return -EBUSY;
-- 
1.8.5.6


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 6/6] md-cluster: Allow spare devices to be marked as faulty
  2015-11-06  3:50 [PATCH 1/6] md-cluster: Protect communication with mutexes rgoldwyn
                   ` (3 preceding siblings ...)
  2015-11-06  3:50 ` [PATCH 5/6] md-cluster: Fix the remove sequence with the new MD reload code rgoldwyn
@ 2015-11-06  3:50 ` rgoldwyn
  2015-11-09 23:51   ` NeilBrown
  2015-11-09 23:31 ` [PATCH 1/6] md-cluster: Protect communication with mutexes NeilBrown
  5 siblings, 1 reply; 17+ messages in thread
From: rgoldwyn @ 2015-11-06  3:50 UTC (permalink / raw)
  To: neilb, linux-raid; +Cc: Goldwyn Rodrigues

From: Goldwyn Rodrigues <rgoldwyn@suse.com>

If a spare device was marked faulty, it would not be reflected
in receiving nodes because it would mark it as activated and continue.
Continue the operation, so it may be set as faulty.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
---
 drivers/md/md.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 4222285..b98cf04 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -9013,7 +9013,6 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
 				ret = remove_and_add_spares(mddev, rdev2);
 				pr_info("Activated spare: %s\n",
 						bdevname(rdev2->bdev,b));
-				continue;
 			}
 			/* device faulty
 			 * We just want to do the minimum to mark the disk
-- 
1.8.5.6


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH 1/6] md-cluster: Protect communication with mutexes
  2015-11-06  3:50 [PATCH 1/6] md-cluster: Protect communication with mutexes rgoldwyn
                   ` (4 preceding siblings ...)
  2015-11-06  3:50 ` [PATCH 6/6] md-cluster: Allow spare devices to be marked as faulty rgoldwyn
@ 2015-11-09 23:31 ` NeilBrown
  2015-11-10  3:23   ` Goldwyn Rodrigues
  5 siblings, 1 reply; 17+ messages in thread
From: NeilBrown @ 2015-11-09 23:31 UTC (permalink / raw)
  To: rgoldwyn, linux-raid; +Cc: Goldwyn Rodrigues

[-- Attachment #1: Type: text/plain, Size: 5333 bytes --]

On Fri, Nov 06 2015, rgoldwyn@suse.de wrote:

> From: Goldwyn Rodrigues <rgoldwyn@suse.com>
>
> Communication can happen through multiple threads. It is possible that
> one thread steps over another threads sequence. So, we use mutexes to
> protect both the send and receive sequences.
>
> We use a new flag CLUSTER_MD_COMM_LOCKED which is used to detect
> if communication is already locked. This is useful for cases where we need to
> take and hold the token DLM lock for a while. This bit is set only
> after locking communication.

I don't understand what this flag is trying to achieve, but I'm fairly
sure it doesn't achieve it.

Maybe if it was test_and_set_bit in metadata_update_start, it might make
sense.  But then I would suggest that clearing the bit be moved to
unlock_comm()

Do you just want to use mutex_try_lock() to detect if communication is
already locked?

>
> Also, Remove stray/ununsed sb_mutex.

I already removed that it mainline - I should have mentioned.

Thanks,
NeilBrown


>
> Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
> ---
>  drivers/md/md-cluster.c | 26 +++++++++++++++++++++-----
>  1 file changed, 21 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
> index b73729b..a93734e 100644
> --- a/drivers/md/md-cluster.c
> +++ b/drivers/md/md-cluster.c
> @@ -47,6 +47,7 @@ struct resync_info {
>  #define		MD_CLUSTER_WAITING_FOR_NEWDISK		1
>  #define		MD_CLUSTER_SUSPEND_READ_BALANCING	2
>  #define		MD_CLUSTER_BEGIN_JOIN_CLUSTER		3
> +#define		MD_CLUSTER_COMM_LOCKED			4
>  
>  
>  struct md_cluster_info {
> @@ -54,7 +55,8 @@ struct md_cluster_info {
>  	dlm_lockspace_t *lockspace;
>  	int slot_number;
>  	struct completion completion;
> -	struct mutex sb_mutex;
> +	struct mutex recv_mutex;
> +	struct mutex send_mutex;
>  	struct dlm_lock_resource *bitmap_lockres;
>  	struct dlm_lock_resource *resync_lockres;
>  	struct list_head suspend_list;
> @@ -503,6 +505,7 @@ static void recv_daemon(struct md_thread *thread)
>  	struct cluster_msg msg;
>  	int ret;
>  
> +	mutex_lock(&cinfo->recv_mutex);
>  	/*get CR on Message*/
>  	if (dlm_lock_sync(message_lockres, DLM_LOCK_CR)) {
>  		pr_err("md/raid1:failed to get CR on MESSAGE\n");
> @@ -529,6 +532,7 @@ static void recv_daemon(struct md_thread *thread)
>  	ret = dlm_unlock_sync(message_lockres);
>  	if (unlikely(ret != 0))
>  		pr_info("unlock msg failed return %d\n", ret);
> +	mutex_unlock(&cinfo->recv_mutex);
>  }
>  
>  /* lock_comm()
> @@ -542,20 +546,22 @@ static int lock_comm(struct md_cluster_info *cinfo)
>  {
>  	int error;
>  
> -	if (cinfo->token_lockres->mode == DLM_LOCK_EX)
> -		return 0;
> +	mutex_lock(&cinfo->send_mutex);
>  
>  	error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX);
>  	if (error)
>  		pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n",
>  				__func__, __LINE__, error);
> +	mutex_lock(&cinfo->recv_mutex);
>  	return error;
>  }
>  
>  static void unlock_comm(struct md_cluster_info *cinfo)
>  {
>  	WARN_ON(cinfo->token_lockres->mode != DLM_LOCK_EX);
> +	mutex_unlock(&cinfo->recv_mutex);
>  	dlm_unlock_sync(cinfo->token_lockres);
> +	mutex_unlock(&cinfo->send_mutex);
>  }
>  
>  /* __sendmsg()
> @@ -709,7 +715,8 @@ static int join(struct mddev *mddev, int nodes)
>  	init_completion(&cinfo->completion);
>  	set_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state);
>  
> -	mutex_init(&cinfo->sb_mutex);
> +	mutex_init(&cinfo->send_mutex);
> +	mutex_init(&cinfo->recv_mutex);
>  	mddev->cluster_info = cinfo;
>  
>  	memset(str, 0, 64);
> @@ -839,7 +846,12 @@ static int slot_number(struct mddev *mddev)
>  
>  static int metadata_update_start(struct mddev *mddev)
>  {
> -	return lock_comm(mddev->cluster_info);
> +	struct md_cluster_info *cinfo = mddev->cluster_info;
> +	int ret;
> +	if (test_and_clear_bit(MD_CLUSTER_COMM_LOCKED, &cinfo->state))
> +		return 0;
> +	ret = lock_comm(cinfo);
> +	return ret;
>  }
>  
>  static int metadata_update_finish(struct mddev *mddev)
> @@ -864,6 +876,7 @@ static int metadata_update_finish(struct mddev *mddev)
>  		ret = __sendmsg(cinfo, &cmsg);
>  	} else
>  		pr_warn("md-cluster: No good device id found to send\n");
> +	clear_bit(MD_CLUSTER_COMM_LOCKED, &cinfo->state);
>  	unlock_comm(cinfo);
>  	return ret;
>  }
> @@ -871,6 +884,7 @@ static int metadata_update_finish(struct mddev *mddev)
>  static void metadata_update_cancel(struct mddev *mddev)
>  {
>  	struct md_cluster_info *cinfo = mddev->cluster_info;
> +	clear_bit(MD_CLUSTER_COMM_LOCKED, &cinfo->state);
>  	unlock_comm(cinfo);
>  }
>  
> @@ -945,6 +959,7 @@ static int add_new_disk(struct mddev *mddev, struct md_rdev *rdev)
>  	memcpy(cmsg.uuid, uuid, 16);
>  	cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
>  	lock_comm(cinfo);
> +	set_bit(MD_CLUSTER_COMM_LOCKED, &cinfo->state);
>  	ret = __sendmsg(cinfo, &cmsg);
>  	if (ret)
>  		return ret;
> @@ -964,6 +979,7 @@ static int add_new_disk(struct mddev *mddev, struct md_rdev *rdev)
>  static void add_new_disk_cancel(struct mddev *mddev)
>  {
>  	struct md_cluster_info *cinfo = mddev->cluster_info;
> +	clear_bit(MD_CLUSTER_COMM_LOCKED, &cinfo->state);
>  	unlock_comm(cinfo);
>  }
>  
> -- 
> 1.8.5.6

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 818 bytes --]

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/6] md-cluster: Avoid the resync ping-pong
  2015-11-06  3:50 ` [PATCH 2/6] md-cluster: Avoid the resync ping-pong rgoldwyn
@ 2015-11-09 23:39   ` NeilBrown
  0 siblings, 0 replies; 17+ messages in thread
From: NeilBrown @ 2015-11-09 23:39 UTC (permalink / raw)
  To: rgoldwyn, linux-raid; +Cc: Goldwyn Rodrigues

[-- Attachment #1: Type: text/plain, Size: 1969 bytes --]

On Fri, Nov 06 2015, rgoldwyn@suse.de wrote:

> From: Goldwyn Rodrigues <rgoldwyn@suse.com>
>
> If a RESYNCING message with (0,0) has been sent before, do not send it
> again. This avoids a resync ping pong between the nodes. We read
> the bitmap lockresource's LVB to figure out the previous value
> of the RESYNCING message.

This seems like a bandaid more than a real fix, but maybe I'm wrong.
I'll apply it for now.  If I end of figuring out that there is some
other issue I discuss it then.

>
> Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
> ---
>  drivers/md/md-cluster.c | 8 ++++++++
>  1 file changed, 8 insertions(+)
>
> diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
> index a93734e..3daa464 100644
> --- a/drivers/md/md-cluster.c
> +++ b/drivers/md/md-cluster.c
> @@ -898,8 +898,16 @@ static int resync_start(struct mddev *mddev)
>  static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi)
>  {
>  	struct md_cluster_info *cinfo = mddev->cluster_info;
> +	struct resync_info ri;
>  	struct cluster_msg cmsg = {0};
>  
> +	/* do not send zero again, if we have sent before */
> +	if (hi == 0) {
> +		memcpy(&ri, cinfo->bitmap_lockres->lksb.sb_lvbptr, sizeof(struct resync_info));
> +		if (le64_to_cpu(ri.hi) == 0)
> +			return 0;

I feel this would look so much better as:
        if (hi == 0) { 
               struct resync_info ri;
               memcpy(&ri, cinfo->bitmap_lockres->lksb.sb_lvbptr,
                      sizeof(ri));
        }

so if you happened to resend it like that I'd probably remove what I
have already applied and use this instead.
(make variable more local, use sizeof(var) instead of sizeof(type), and
wrap the long line)

Thanks,
NeilBrown

> +	}
> +
>  	add_resync_info(mddev, cinfo->bitmap_lockres, lo, hi);
>  	/* Re-acquire the lock to refresh LVB */
>  	dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW);
> -- 
> 1.8.5.6

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 818 bytes --]

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 3/6] md-cluster: remove a disk asynchronously from cluster environment
  2015-11-06  3:50 ` [PATCH 3/6] md-cluster: remove a disk asynchronously from cluster environment rgoldwyn
@ 2015-11-09 23:43   ` NeilBrown
  0 siblings, 0 replies; 17+ messages in thread
From: NeilBrown @ 2015-11-09 23:43 UTC (permalink / raw)
  To: rgoldwyn, linux-raid; +Cc: Guoqing Jiang, Goldwyn Rodrigues

[-- Attachment #1: Type: text/plain, Size: 2965 bytes --]

On Fri, Nov 06 2015, rgoldwyn@suse.de wrote:

> From: Guoqing Jiang <gqjiang@suse.com>
>
> For cluster raid, if one disk couldn't be reach in one node, then
> other nodes would receive the REMOVE message for the disk.
>
> In receiving node, we can't call md_kick_rdev_from_array to remove
> the disk from array synchronously since the disk might still be busy
> in this node. So let's set a ClusterRemove flag on the disk, then
> let the thread to do the removal job eventually.

Thanks.
I've applied this patch.
However
1/ it isn't against mainline.
2/ While the ClusterRemove flag is (currently) only used in a cluster
   configuration, the functionality that it represents isn't necessarily
   cluster specific.  So I would prefer a more generic name (like
   AutoRemove).
3/ similarly the test on mddev_is_cluster() in md_check_recovery()
   doesn't really search much purpose.

Thanks,
NeilBrown

>
> Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
> Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
> ---
>  drivers/md/md-cluster.c |  7 +++++--
>  drivers/md/md.c         | 12 ++++++++++++
>  drivers/md/md.h         |  1 +
>  3 files changed, 18 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
> index 3daa464..a681706 100644
> --- a/drivers/md/md-cluster.c
> +++ b/drivers/md/md-cluster.c
> @@ -443,8 +443,11 @@ static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg)
>  	struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev,
>  						   le32_to_cpu(msg->raid_slot));
>  
> -	if (rdev)
> -		md_kick_rdev_from_array(rdev);
> +	if (rdev) {
> +		set_bit(ClusterRemove, &rdev->flags);
> +		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
> +		md_wakeup_thread(mddev->thread);
> +	}
>  	else
>  		pr_warn("%s: %d Could not find disk(%d) to REMOVE\n",
>  			__func__, __LINE__, le32_to_cpu(msg->raid_slot));
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 44d0342..32ca592 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -8222,6 +8222,18 @@ void md_check_recovery(struct mddev *mddev)
>  			goto unlock;
>  		}
>  
> +		if (mddev_is_clustered(mddev)) {
> +			struct md_rdev *rdev;
> +			/* kick the device if another node issued a
> +			 * remove disk.
> +			 */
> +			rdev_for_each(rdev, mddev) {
> +				if (test_and_clear_bit(ClusterRemove, &rdev->flags) &&
> +						rdev->raid_disk < 0)
> +					md_kick_rdev_from_array(rdev);
> +			}
> +		}
> +
>  		if (!mddev->external) {
>  			int did_change = 0;
>  			spin_lock(&mddev->lock);
> diff --git a/drivers/md/md.h b/drivers/md/md.h
> index 2ea0035..db54341 100644
> --- a/drivers/md/md.h
> +++ b/drivers/md/md.h
> @@ -172,6 +172,7 @@ enum flag_bits {
>  				 * This device is seen locally but not
>  				 * by the whole cluster
>  				 */
> +	ClusterRemove,
>  };
>  
>  #define BB_LEN_MASK	(0x00000000000001FFULL)
> -- 
> 1.8.5.6

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 818 bytes --]

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 4/6] md-cluster: Defer MD reloading to mddev->thread
  2015-11-06  3:50 ` [PATCH 4/6] md-cluster: Defer MD reloading to mddev->thread rgoldwyn
@ 2015-11-09 23:48   ` NeilBrown
  2015-11-10  3:26     ` Goldwyn Rodrigues
  0 siblings, 1 reply; 17+ messages in thread
From: NeilBrown @ 2015-11-09 23:48 UTC (permalink / raw)
  To: rgoldwyn, linux-raid; +Cc: Goldwyn Rodrigues

[-- Attachment #1: Type: text/plain, Size: 4619 bytes --]

On Fri, Nov 06 2015, rgoldwyn@suse.de wrote:

> From: Goldwyn Rodrigues <rgoldwyn@suse.com>
>
> Reloading of superblock must be performed under reconfig_mutex. However,
> this cannot be done with md_reload_sb because it would deadlock with
> the message DLM lock. So, we defer it in md_check_recovery() which is
> executed by mddev->thread.
>
> This introduces a new flag, MD_RELOAD_SB, which if set, will reload the
> superblock.

I can see no justification for good_device_nr being atomic_t - if you
can explain what you were trying to achieve I could possible suggest why
it isn't needed.

Also good_device_nr is directly related to MD_RELOAD_SB, so it makes
sense to put them both in 'struct mddev' - that would save creating a
new cluster_operation which does very little.

so: not applied.

Thanks,
NeilBrown


>
> Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
> ---
>  drivers/md/md-cluster.c | 12 +++++++++++-
>  drivers/md/md-cluster.h |  1 +
>  drivers/md/md.c         |  3 +++
>  drivers/md/md.h         |  3 +++
>  4 files changed, 18 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
> index a681706..9a36ad6 100644
> --- a/drivers/md/md-cluster.c
> +++ b/drivers/md/md-cluster.c
> @@ -71,6 +71,7 @@ struct md_cluster_info {
>  	struct md_thread *recv_thread;
>  	struct completion newdisk_completion;
>  	unsigned long state;
> +	atomic_t good_device_nr;
>  };
>  
>  enum msg_type {
> @@ -434,8 +435,10 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
>  static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg)
>  {
>  	struct md_cluster_info *cinfo = mddev->cluster_info;
> -	md_reload_sb(mddev, le32_to_cpu(msg->raid_slot));
> +	atomic_set(&cinfo->good_device_nr, le32_to_cpu(msg->raid_slot));
> +	set_bit(MD_RELOAD_SB, &mddev->flags);
>  	dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
> +	md_wakeup_thread(mddev->thread);
>  }
>  
>  static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg)
> @@ -1047,6 +1050,12 @@ out:
>  	return err;
>  }
>  
> +static int good_device_nr(struct mddev *mddev)
> +{
> +	struct md_cluster_info *cinfo = mddev->cluster_info;
> +	return atomic_read(&cinfo->good_device_nr);
> +}
> +
>  static struct md_cluster_operations cluster_ops = {
>  	.join   = join,
>  	.leave  = leave,
> @@ -1063,6 +1072,7 @@ static struct md_cluster_operations cluster_ops = {
>  	.new_disk_ack = new_disk_ack,
>  	.remove_disk = remove_disk,
>  	.gather_bitmaps = gather_bitmaps,
> +	.good_device_nr = good_device_nr,
>  };
>  
>  static int __init cluster_init(void)
> diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h
> index e75ea26..c699c6c 100644
> --- a/drivers/md/md-cluster.h
> +++ b/drivers/md/md-cluster.h
> @@ -24,6 +24,7 @@ struct md_cluster_operations {
>  	int (*new_disk_ack)(struct mddev *mddev, bool ack);
>  	int (*remove_disk)(struct mddev *mddev, struct md_rdev *rdev);
>  	int (*gather_bitmaps)(struct md_rdev *rdev);
> +	int (*good_device_nr)(struct mddev *mddev);
>  };
>  
>  #endif /* _MD_CLUSTER_H */
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 32ca592..65b6326 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -8184,6 +8184,7 @@ void md_check_recovery(struct mddev *mddev)
>  		(mddev->flags & MD_UPDATE_SB_FLAGS & ~ (1<<MD_CHANGE_PENDING)) ||
>  		test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
>  		test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
> +		test_bit(MD_RELOAD_SB, &mddev->flags) ||
>  		(mddev->external == 0 && mddev->safemode == 1) ||
>  		(mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
>  		 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
> @@ -8232,6 +8233,8 @@ void md_check_recovery(struct mddev *mddev)
>  						rdev->raid_disk < 0)
>  					md_kick_rdev_from_array(rdev);
>  			}
> +			if (test_and_clear_bit(MD_RELOAD_SB, &mddev->flags))
> +				md_reload_sb(mddev, md_cluster_ops->good_device_nr(mddev));
>  		}
>  
>  		if (!mddev->external) {
> diff --git a/drivers/md/md.h b/drivers/md/md.h
> index db54341..f89866d 100644
> --- a/drivers/md/md.h
> +++ b/drivers/md/md.h
> @@ -222,6 +222,9 @@ struct mddev {
>  #define MD_STILL_CLOSED	4	/* If set, then array has not been opened since
>  				 * md_ioctl checked on it.
>  				 */
> +#define MD_RELOAD_SB	5	/* Reload the superblock because another node
> +				 * updated it.
> +				 */
>  
>  	int				suspended;
>  	atomic_t			active_io;
> -- 
> 1.8.5.6

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 818 bytes --]

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 5/6] md-cluster: Fix the remove sequence with the new MD reload code
  2015-11-06  3:50 ` [PATCH 5/6] md-cluster: Fix the remove sequence with the new MD reload code rgoldwyn
@ 2015-11-09 23:49   ` NeilBrown
  0 siblings, 0 replies; 17+ messages in thread
From: NeilBrown @ 2015-11-09 23:49 UTC (permalink / raw)
  To: rgoldwyn, linux-raid; +Cc: Goldwyn Rodrigues, Guoqing Jiang

[-- Attachment #1: Type: text/plain, Size: 2207 bytes --]

On Fri, Nov 06 2015, rgoldwyn@suse.de wrote:

> From: Goldwyn Rodrigues <rgoldwyn@suse.com>
>
> The remove disk message does not need metadata_update_start(), but
> can be an independent message.

This looks good.
Applied - thanks.

NeilBrown


>
> Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
> Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
> ---
>  drivers/md/md-cluster.c | 2 +-
>  drivers/md/md.c         | 9 +--------
>  2 files changed, 2 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
> index 9a36ad6..33f5d7a 100644
> --- a/drivers/md/md-cluster.c
> +++ b/drivers/md/md-cluster.c
> @@ -1018,7 +1018,7 @@ static int remove_disk(struct mddev *mddev, struct md_rdev *rdev)
>  	struct md_cluster_info *cinfo = mddev->cluster_info;
>  	cmsg.type = cpu_to_le32(REMOVE);
>  	cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
> -	return __sendmsg(cinfo, &cmsg);
> +	return sendmsg(cinfo, &cmsg);
>  }
>  
>  static int gather_bitmaps(struct md_rdev *rdev)
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 65b6326..4222285 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -6045,15 +6045,11 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev)
>  {
>  	char b[BDEVNAME_SIZE];
>  	struct md_rdev *rdev;
> -	int ret = -1;
>  
>  	rdev = find_rdev(mddev, dev);
>  	if (!rdev)
>  		return -ENXIO;
>  
> -	if (mddev_is_clustered(mddev))
> -		ret = md_cluster_ops->metadata_update_start(mddev);
> -
>  	if (rdev->raid_disk < 0)
>  		goto kick_rdev;
>  
> @@ -6064,7 +6060,7 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev)
>  		goto busy;
>  
>  kick_rdev:
> -	if (mddev_is_clustered(mddev) && ret == 0)
> +	if (mddev_is_clustered(mddev))
>  		md_cluster_ops->remove_disk(mddev, rdev);
>  
>  	md_kick_rdev_from_array(rdev);
> @@ -6073,9 +6069,6 @@ kick_rdev:
>  
>  	return 0;
>  busy:
> -	if (mddev_is_clustered(mddev) && ret == 0)
> -		md_cluster_ops->metadata_update_cancel(mddev);
> -
>  	printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n",
>  		bdevname(rdev->bdev,b), mdname(mddev));
>  	return -EBUSY;
> -- 
> 1.8.5.6

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 818 bytes --]

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 6/6] md-cluster: Allow spare devices to be marked as faulty
  2015-11-06  3:50 ` [PATCH 6/6] md-cluster: Allow spare devices to be marked as faulty rgoldwyn
@ 2015-11-09 23:51   ` NeilBrown
  0 siblings, 0 replies; 17+ messages in thread
From: NeilBrown @ 2015-11-09 23:51 UTC (permalink / raw)
  To: rgoldwyn, linux-raid; +Cc: Goldwyn Rodrigues

[-- Attachment #1: Type: text/plain, Size: 966 bytes --]

On Fri, Nov 06 2015, rgoldwyn@suse.de wrote:

> From: Goldwyn Rodrigues <rgoldwyn@suse.com>
>
> If a spare device was marked faulty, it would not be reflected
> in receiving nodes because it would mark it as activated and continue.
> Continue the operation, so it may be set as faulty.

Yes, that makes sense.
Applied, thanks.

NeilBrown

>
> Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
> ---
>  drivers/md/md.c | 1 -
>  1 file changed, 1 deletion(-)
>
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 4222285..b98cf04 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -9013,7 +9013,6 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
>  				ret = remove_and_add_spares(mddev, rdev2);
>  				pr_info("Activated spare: %s\n",
>  						bdevname(rdev2->bdev,b));
> -				continue;
>  			}
>  			/* device faulty
>  			 * We just want to do the minimum to mark the disk
> -- 
> 1.8.5.6

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 818 bytes --]

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 1/6] md-cluster: Protect communication with mutexes
  2015-11-09 23:31 ` [PATCH 1/6] md-cluster: Protect communication with mutexes NeilBrown
@ 2015-11-10  3:23   ` Goldwyn Rodrigues
  2015-11-12 21:59     ` NeilBrown
  0 siblings, 1 reply; 17+ messages in thread
From: Goldwyn Rodrigues @ 2015-11-10  3:23 UTC (permalink / raw)
  To: NeilBrown, linux-raid; +Cc: Goldwyn Rodrigues



On 11/09/2015 05:31 PM, NeilBrown wrote:
> On Fri, Nov 06 2015, rgoldwyn@suse.de wrote:
>
>> From: Goldwyn Rodrigues <rgoldwyn@suse.com>
>>
>> Communication can happen through multiple threads. It is possible that
>> one thread steps over another threads sequence. So, we use mutexes to
>> protect both the send and receive sequences.
>>
>> We use a new flag CLUSTER_MD_COMM_LOCKED which is used to detect
>> if communication is already locked. This is useful for cases where we need to
>> take and hold the token DLM lock for a while. This bit is set only
>> after locking communication.
>
> I don't understand what this flag is trying to achieve, but I'm fairly
> sure it doesn't achieve it.

Lets consider three specific cases of locking communication channels to 
show the conflict:

1. resync_info_update(): communication is locked and release for sending 
a message.
2. A regular md_update_sb(): communication is locked in 
metadata_update_start() and unlocked in metadata_update_finish() after 
writing to disk. In metadata_update_finish(), the sendmsg is called to 
send METADATA_UPDATED.
3. An add operation which culminates in a md_update_sb(): Here the 
communication is locked before initiating add. If the add is successful, 
it results in md_update_sb(). In md_update_sb(), metadata_update_start() 
has to check if the communication is already locked. If locked, it 
should not lock again.

The flag is used only for case 3. If the communication is already 
locked, it should not lock again. This flag is set only after 
lock_comm() has executed, but is checked for in metadata_update_start(). 
This should insure that any of the operations 1, 2 or 3 do not interfere 
with each other.

I am not sure if I have made the best effort to explain this. I had a 
tough time getting it right (which may or may not be complete).

>
> Maybe if it was test_and_set_bit in metadata_update_start, it might make
> sense.  But then I would suggest that clearing the bit be moved to
> unlock_comm()
>
> Do you just want to use mutex_try_lock() to detect if communication is
> already locked?

Consider a race between md_update_sb() and sendmsg() [Case 1. and 2]

mutex_try_lock() may not work in this situation because lock_comm() 
could have been called by sendmsg(), which will release it as soon as 
the message is sent. In the meantime (while the lock is locked), if a 
metadata_update_sb() operation executes. It will not relock the 
communication. This will result in the WARN_ON in unlock_comm() since 
sendmsg() sequence had already unlocked it.


>
>>
>> Also, Remove stray/ununsed sb_mutex.
>
> I already removed that it mainline - I should have mentioned.
>
> Thanks,
> NeilBrown
>
>
>>
>> Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
>> ---
>>   drivers/md/md-cluster.c | 26 +++++++++++++++++++++-----
>>   1 file changed, 21 insertions(+), 5 deletions(-)
>>
>> diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
>> index b73729b..a93734e 100644
>> --- a/drivers/md/md-cluster.c
>> +++ b/drivers/md/md-cluster.c
>> @@ -47,6 +47,7 @@ struct resync_info {
>>   #define		MD_CLUSTER_WAITING_FOR_NEWDISK		1
>>   #define		MD_CLUSTER_SUSPEND_READ_BALANCING	2
>>   #define		MD_CLUSTER_BEGIN_JOIN_CLUSTER		3
>> +#define		MD_CLUSTER_COMM_LOCKED			4
>>
>>
>>   struct md_cluster_info {
>> @@ -54,7 +55,8 @@ struct md_cluster_info {
>>   	dlm_lockspace_t *lockspace;
>>   	int slot_number;
>>   	struct completion completion;
>> -	struct mutex sb_mutex;
>> +	struct mutex recv_mutex;
>> +	struct mutex send_mutex;
>>   	struct dlm_lock_resource *bitmap_lockres;
>>   	struct dlm_lock_resource *resync_lockres;
>>   	struct list_head suspend_list;
>> @@ -503,6 +505,7 @@ static void recv_daemon(struct md_thread *thread)
>>   	struct cluster_msg msg;
>>   	int ret;
>>
>> +	mutex_lock(&cinfo->recv_mutex);
>>   	/*get CR on Message*/
>>   	if (dlm_lock_sync(message_lockres, DLM_LOCK_CR)) {
>>   		pr_err("md/raid1:failed to get CR on MESSAGE\n");
>> @@ -529,6 +532,7 @@ static void recv_daemon(struct md_thread *thread)
>>   	ret = dlm_unlock_sync(message_lockres);
>>   	if (unlikely(ret != 0))
>>   		pr_info("unlock msg failed return %d\n", ret);
>> +	mutex_unlock(&cinfo->recv_mutex);
>>   }
>>
>>   /* lock_comm()
>> @@ -542,20 +546,22 @@ static int lock_comm(struct md_cluster_info *cinfo)
>>   {
>>   	int error;
>>
>> -	if (cinfo->token_lockres->mode == DLM_LOCK_EX)
>> -		return 0;
>> +	mutex_lock(&cinfo->send_mutex);
>>
>>   	error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX);
>>   	if (error)
>>   		pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n",
>>   				__func__, __LINE__, error);
>> +	mutex_lock(&cinfo->recv_mutex);
>>   	return error;
>>   }
>>
>>   static void unlock_comm(struct md_cluster_info *cinfo)
>>   {
>>   	WARN_ON(cinfo->token_lockres->mode != DLM_LOCK_EX);
>> +	mutex_unlock(&cinfo->recv_mutex);
>>   	dlm_unlock_sync(cinfo->token_lockres);
>> +	mutex_unlock(&cinfo->send_mutex);
>>   }
>>
>>   /* __sendmsg()
>> @@ -709,7 +715,8 @@ static int join(struct mddev *mddev, int nodes)
>>   	init_completion(&cinfo->completion);
>>   	set_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state);
>>
>> -	mutex_init(&cinfo->sb_mutex);
>> +	mutex_init(&cinfo->send_mutex);
>> +	mutex_init(&cinfo->recv_mutex);
>>   	mddev->cluster_info = cinfo;
>>
>>   	memset(str, 0, 64);
>> @@ -839,7 +846,12 @@ static int slot_number(struct mddev *mddev)
>>
>>   static int metadata_update_start(struct mddev *mddev)
>>   {
>> -	return lock_comm(mddev->cluster_info);
>> +	struct md_cluster_info *cinfo = mddev->cluster_info;
>> +	int ret;
>> +	if (test_and_clear_bit(MD_CLUSTER_COMM_LOCKED, &cinfo->state))
>> +		return 0;
>> +	ret = lock_comm(cinfo);
>> +	return ret;
>>   }
>>
>>   static int metadata_update_finish(struct mddev *mddev)
>> @@ -864,6 +876,7 @@ static int metadata_update_finish(struct mddev *mddev)
>>   		ret = __sendmsg(cinfo, &cmsg);
>>   	} else
>>   		pr_warn("md-cluster: No good device id found to send\n");
>> +	clear_bit(MD_CLUSTER_COMM_LOCKED, &cinfo->state);
>>   	unlock_comm(cinfo);
>>   	return ret;
>>   }
>> @@ -871,6 +884,7 @@ static int metadata_update_finish(struct mddev *mddev)
>>   static void metadata_update_cancel(struct mddev *mddev)
>>   {
>>   	struct md_cluster_info *cinfo = mddev->cluster_info;
>> +	clear_bit(MD_CLUSTER_COMM_LOCKED, &cinfo->state);
>>   	unlock_comm(cinfo);
>>   }
>>
>> @@ -945,6 +959,7 @@ static int add_new_disk(struct mddev *mddev, struct md_rdev *rdev)
>>   	memcpy(cmsg.uuid, uuid, 16);
>>   	cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
>>   	lock_comm(cinfo);
>> +	set_bit(MD_CLUSTER_COMM_LOCKED, &cinfo->state);
>>   	ret = __sendmsg(cinfo, &cmsg);
>>   	if (ret)
>>   		return ret;
>> @@ -964,6 +979,7 @@ static int add_new_disk(struct mddev *mddev, struct md_rdev *rdev)
>>   static void add_new_disk_cancel(struct mddev *mddev)
>>   {
>>   	struct md_cluster_info *cinfo = mddev->cluster_info;
>> +	clear_bit(MD_CLUSTER_COMM_LOCKED, &cinfo->state);
>>   	unlock_comm(cinfo);
>>   }
>>
>> --
>> 1.8.5.6

-- 
Goldwyn

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 4/6] md-cluster: Defer MD reloading to mddev->thread
  2015-11-09 23:48   ` NeilBrown
@ 2015-11-10  3:26     ` Goldwyn Rodrigues
  2015-11-20  8:25       ` [v2 PATCH] " Guoqing Jiang
  0 siblings, 1 reply; 17+ messages in thread
From: Goldwyn Rodrigues @ 2015-11-10  3:26 UTC (permalink / raw)
  To: NeilBrown, linux-raid; +Cc: Goldwyn Rodrigues



On 11/09/2015 05:48 PM, NeilBrown wrote:
> On Fri, Nov 06 2015, rgoldwyn@suse.de wrote:
>
>> From: Goldwyn Rodrigues <rgoldwyn@suse.com>
>>
>> Reloading of superblock must be performed under reconfig_mutex. However,
>> this cannot be done with md_reload_sb because it would deadlock with
>> the message DLM lock. So, we defer it in md_check_recovery() which is
>> executed by mddev->thread.
>>
>> This introduces a new flag, MD_RELOAD_SB, which if set, will reload the
>> superblock.
>
> I can see no justification for good_device_nr being atomic_t - if you
> can explain what you were trying to achieve I could possible suggest why
> it isn't needed.

Yes, I think it does not need to be atomic.

>
> Also good_device_nr is directly related to MD_RELOAD_SB, so it makes
> sense to put them both in 'struct mddev' - that would save creating a
> new cluster_operation which does very little.

Agree here as well. I got too carried away in keeping cluster-md as 
isolated as possible.


>
> so: not applied.
>
> Thanks,
> NeilBrown
>
>
>>
>> Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
>> ---
>>   drivers/md/md-cluster.c | 12 +++++++++++-
>>   drivers/md/md-cluster.h |  1 +
>>   drivers/md/md.c         |  3 +++
>>   drivers/md/md.h         |  3 +++
>>   4 files changed, 18 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
>> index a681706..9a36ad6 100644
>> --- a/drivers/md/md-cluster.c
>> +++ b/drivers/md/md-cluster.c
>> @@ -71,6 +71,7 @@ struct md_cluster_info {
>>   	struct md_thread *recv_thread;
>>   	struct completion newdisk_completion;
>>   	unsigned long state;
>> +	atomic_t good_device_nr;
>>   };
>>
>>   enum msg_type {
>> @@ -434,8 +435,10 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
>>   static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg)
>>   {
>>   	struct md_cluster_info *cinfo = mddev->cluster_info;
>> -	md_reload_sb(mddev, le32_to_cpu(msg->raid_slot));
>> +	atomic_set(&cinfo->good_device_nr, le32_to_cpu(msg->raid_slot));
>> +	set_bit(MD_RELOAD_SB, &mddev->flags);
>>   	dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
>> +	md_wakeup_thread(mddev->thread);
>>   }
>>
>>   static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg)
>> @@ -1047,6 +1050,12 @@ out:
>>   	return err;
>>   }
>>
>> +static int good_device_nr(struct mddev *mddev)
>> +{
>> +	struct md_cluster_info *cinfo = mddev->cluster_info;
>> +	return atomic_read(&cinfo->good_device_nr);
>> +}
>> +
>>   static struct md_cluster_operations cluster_ops = {
>>   	.join   = join,
>>   	.leave  = leave,
>> @@ -1063,6 +1072,7 @@ static struct md_cluster_operations cluster_ops = {
>>   	.new_disk_ack = new_disk_ack,
>>   	.remove_disk = remove_disk,
>>   	.gather_bitmaps = gather_bitmaps,
>> +	.good_device_nr = good_device_nr,
>>   };
>>
>>   static int __init cluster_init(void)
>> diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h
>> index e75ea26..c699c6c 100644
>> --- a/drivers/md/md-cluster.h
>> +++ b/drivers/md/md-cluster.h
>> @@ -24,6 +24,7 @@ struct md_cluster_operations {
>>   	int (*new_disk_ack)(struct mddev *mddev, bool ack);
>>   	int (*remove_disk)(struct mddev *mddev, struct md_rdev *rdev);
>>   	int (*gather_bitmaps)(struct md_rdev *rdev);
>> +	int (*good_device_nr)(struct mddev *mddev);
>>   };
>>
>>   #endif /* _MD_CLUSTER_H */
>> diff --git a/drivers/md/md.c b/drivers/md/md.c
>> index 32ca592..65b6326 100644
>> --- a/drivers/md/md.c
>> +++ b/drivers/md/md.c
>> @@ -8184,6 +8184,7 @@ void md_check_recovery(struct mddev *mddev)
>>   		(mddev->flags & MD_UPDATE_SB_FLAGS & ~ (1<<MD_CHANGE_PENDING)) ||
>>   		test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
>>   		test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
>> +		test_bit(MD_RELOAD_SB, &mddev->flags) ||
>>   		(mddev->external == 0 && mddev->safemode == 1) ||
>>   		(mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
>>   		 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
>> @@ -8232,6 +8233,8 @@ void md_check_recovery(struct mddev *mddev)
>>   						rdev->raid_disk < 0)
>>   					md_kick_rdev_from_array(rdev);
>>   			}
>> +			if (test_and_clear_bit(MD_RELOAD_SB, &mddev->flags))
>> +				md_reload_sb(mddev, md_cluster_ops->good_device_nr(mddev));
>>   		}
>>
>>   		if (!mddev->external) {
>> diff --git a/drivers/md/md.h b/drivers/md/md.h
>> index db54341..f89866d 100644
>> --- a/drivers/md/md.h
>> +++ b/drivers/md/md.h
>> @@ -222,6 +222,9 @@ struct mddev {
>>   #define MD_STILL_CLOSED	4	/* If set, then array has not been opened since
>>   				 * md_ioctl checked on it.
>>   				 */
>> +#define MD_RELOAD_SB	5	/* Reload the superblock because another node
>> +				 * updated it.
>> +				 */
>>
>>   	int				suspended;
>>   	atomic_t			active_io;
>> --
>> 1.8.5.6

-- 
Goldwyn

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 1/6] md-cluster: Protect communication with mutexes
  2015-11-10  3:23   ` Goldwyn Rodrigues
@ 2015-11-12 21:59     ` NeilBrown
  2015-11-20  8:27       ` [v2 PATCH 2/5] " Guoqing Jiang
  0 siblings, 1 reply; 17+ messages in thread
From: NeilBrown @ 2015-11-12 21:59 UTC (permalink / raw)
  To: Goldwyn Rodrigues, linux-raid; +Cc: Goldwyn Rodrigues

[-- Attachment #1: Type: text/plain, Size: 9182 bytes --]

On Tue, Nov 10 2015, Goldwyn Rodrigues wrote:

> On 11/09/2015 05:31 PM, NeilBrown wrote:
>> On Fri, Nov 06 2015, rgoldwyn@suse.de wrote:
>>
>>> From: Goldwyn Rodrigues <rgoldwyn@suse.com>
>>>
>>> Communication can happen through multiple threads. It is possible that
>>> one thread steps over another threads sequence. So, we use mutexes to
>>> protect both the send and receive sequences.
>>>
>>> We use a new flag CLUSTER_MD_COMM_LOCKED which is used to detect
>>> if communication is already locked. This is useful for cases where we need to
>>> take and hold the token DLM lock for a while. This bit is set only
>>> after locking communication.
>>
>> I don't understand what this flag is trying to achieve, but I'm fairly
>> sure it doesn't achieve it.
>
> Lets consider three specific cases of locking communication channels to 
> show the conflict:
>
> 1. resync_info_update(): communication is locked and release for sending 
> a message.
> 2. A regular md_update_sb(): communication is locked in 
> metadata_update_start() and unlocked in metadata_update_finish() after 
> writing to disk. In metadata_update_finish(), the sendmsg is called to 
> send METADATA_UPDATED.
> 3. An add operation which culminates in a md_update_sb(): Here the 
> communication is locked before initiating add. If the add is successful, 
> it results in md_update_sb(). In md_update_sb(), metadata_update_start() 
> has to check if the communication is already locked. If locked, it 
> should not lock again.

Oh, I get it now - thanks.

Going back and looking at the original commit I can now see that it does
say that, but I didn't understand the implication at the time.

I might be wrong again, but I think this approach is broken.
The 'add disk' sequence does:
  1/ ->add_new_disk which takes the lock
  2/ other stuff, protected by the lock
  3/ schedule a metadata update
  4/ metadata update is initiated, which doesn't take the lock
        because the flag is set
  5/ metadata update completes, lock is dropped.

What if some other event causes the metadata update to happen during
'2'?

Maybe if you delayed setting MD_CLUSTER_COMM_LOCKED until after '2'.
i.e. set it when scheduling the update.  So the flag means:
  "lock has been taken for metadata update"

One tricky bit there would be if metadata_update_start() found the bit
wasn't set, and then entered mutex_lock() in lock_comm().
When MD_CLUSTER_COMM_LOCKED gets set we want that code to stop waiting
and start doing useful things.  But it won't.


It might be easiest to make our own 'mutex' with a flag bit and a wait
queue.

Then calls to mutex_lock become
 wait_event(mddev->wait, !test_and_set_bit(bit, &cinfo->state));
mutex_unlock becoems
 clear_bit(bit, &cinfo->state);

and the mutex_lock used from metadata_update_start is
  wait_event(mddev->wait, !test_and_set_bit(bit, &cinfo->state) ||
                          test_and_clear_bit(MD_CLUSTER_COMM_LOCKED,
  ....));

That might work.... providing it is well documented.

(and you are right - mutex_trylock wouldn't have helped, I was
completely misunderstanding).

Thanks,
NeilBrown



>
> The flag is used only for case 3. If the communication is already 
> locked, it should not lock again. This flag is set only after 
> lock_comm() has executed, but is checked for in metadata_update_start(). 
> This should insure that any of the operations 1, 2 or 3 do not interfere 
> with each other.
>
> I am not sure if I have made the best effort to explain this. I had a 
> tough time getting it right (which may or may not be complete).
>
>>
>> Maybe if it was test_and_set_bit in metadata_update_start, it might make
>> sense.  But then I would suggest that clearing the bit be moved to
>> unlock_comm()
>>
>> Do you just want to use mutex_try_lock() to detect if communication is
>> already locked?
>
> Consider a race between md_update_sb() and sendmsg() [Case 1. and 2]
>
> mutex_try_lock() may not work in this situation because lock_comm() 
> could have been called by sendmsg(), which will release it as soon as 
> the message is sent. In the meantime (while the lock is locked), if a 
> metadata_update_sb() operation executes. It will not relock the 
> communication. This will result in the WARN_ON in unlock_comm() since 
> sendmsg() sequence had already unlocked it.
>
>
>>
>>>
>>> Also, Remove stray/ununsed sb_mutex.
>>
>> I already removed that it mainline - I should have mentioned.
>>
>> Thanks,
>> NeilBrown
>>
>>
>>>
>>> Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
>>> ---
>>>   drivers/md/md-cluster.c | 26 +++++++++++++++++++++-----
>>>   1 file changed, 21 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
>>> index b73729b..a93734e 100644
>>> --- a/drivers/md/md-cluster.c
>>> +++ b/drivers/md/md-cluster.c
>>> @@ -47,6 +47,7 @@ struct resync_info {
>>>   #define		MD_CLUSTER_WAITING_FOR_NEWDISK		1
>>>   #define		MD_CLUSTER_SUSPEND_READ_BALANCING	2
>>>   #define		MD_CLUSTER_BEGIN_JOIN_CLUSTER		3
>>> +#define		MD_CLUSTER_COMM_LOCKED			4
>>>
>>>
>>>   struct md_cluster_info {
>>> @@ -54,7 +55,8 @@ struct md_cluster_info {
>>>   	dlm_lockspace_t *lockspace;
>>>   	int slot_number;
>>>   	struct completion completion;
>>> -	struct mutex sb_mutex;
>>> +	struct mutex recv_mutex;
>>> +	struct mutex send_mutex;
>>>   	struct dlm_lock_resource *bitmap_lockres;
>>>   	struct dlm_lock_resource *resync_lockres;
>>>   	struct list_head suspend_list;
>>> @@ -503,6 +505,7 @@ static void recv_daemon(struct md_thread *thread)
>>>   	struct cluster_msg msg;
>>>   	int ret;
>>>
>>> +	mutex_lock(&cinfo->recv_mutex);
>>>   	/*get CR on Message*/
>>>   	if (dlm_lock_sync(message_lockres, DLM_LOCK_CR)) {
>>>   		pr_err("md/raid1:failed to get CR on MESSAGE\n");
>>> @@ -529,6 +532,7 @@ static void recv_daemon(struct md_thread *thread)
>>>   	ret = dlm_unlock_sync(message_lockres);
>>>   	if (unlikely(ret != 0))
>>>   		pr_info("unlock msg failed return %d\n", ret);
>>> +	mutex_unlock(&cinfo->recv_mutex);
>>>   }
>>>
>>>   /* lock_comm()
>>> @@ -542,20 +546,22 @@ static int lock_comm(struct md_cluster_info *cinfo)
>>>   {
>>>   	int error;
>>>
>>> -	if (cinfo->token_lockres->mode == DLM_LOCK_EX)
>>> -		return 0;
>>> +	mutex_lock(&cinfo->send_mutex);
>>>
>>>   	error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX);
>>>   	if (error)
>>>   		pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n",
>>>   				__func__, __LINE__, error);
>>> +	mutex_lock(&cinfo->recv_mutex);
>>>   	return error;
>>>   }
>>>
>>>   static void unlock_comm(struct md_cluster_info *cinfo)
>>>   {
>>>   	WARN_ON(cinfo->token_lockres->mode != DLM_LOCK_EX);
>>> +	mutex_unlock(&cinfo->recv_mutex);
>>>   	dlm_unlock_sync(cinfo->token_lockres);
>>> +	mutex_unlock(&cinfo->send_mutex);
>>>   }
>>>
>>>   /* __sendmsg()
>>> @@ -709,7 +715,8 @@ static int join(struct mddev *mddev, int nodes)
>>>   	init_completion(&cinfo->completion);
>>>   	set_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state);
>>>
>>> -	mutex_init(&cinfo->sb_mutex);
>>> +	mutex_init(&cinfo->send_mutex);
>>> +	mutex_init(&cinfo->recv_mutex);
>>>   	mddev->cluster_info = cinfo;
>>>
>>>   	memset(str, 0, 64);
>>> @@ -839,7 +846,12 @@ static int slot_number(struct mddev *mddev)
>>>
>>>   static int metadata_update_start(struct mddev *mddev)
>>>   {
>>> -	return lock_comm(mddev->cluster_info);
>>> +	struct md_cluster_info *cinfo = mddev->cluster_info;
>>> +	int ret;
>>> +	if (test_and_clear_bit(MD_CLUSTER_COMM_LOCKED, &cinfo->state))
>>> +		return 0;
>>> +	ret = lock_comm(cinfo);
>>> +	return ret;
>>>   }
>>>
>>>   static int metadata_update_finish(struct mddev *mddev)
>>> @@ -864,6 +876,7 @@ static int metadata_update_finish(struct mddev *mddev)
>>>   		ret = __sendmsg(cinfo, &cmsg);
>>>   	} else
>>>   		pr_warn("md-cluster: No good device id found to send\n");
>>> +	clear_bit(MD_CLUSTER_COMM_LOCKED, &cinfo->state);
>>>   	unlock_comm(cinfo);
>>>   	return ret;
>>>   }
>>> @@ -871,6 +884,7 @@ static int metadata_update_finish(struct mddev *mddev)
>>>   static void metadata_update_cancel(struct mddev *mddev)
>>>   {
>>>   	struct md_cluster_info *cinfo = mddev->cluster_info;
>>> +	clear_bit(MD_CLUSTER_COMM_LOCKED, &cinfo->state);
>>>   	unlock_comm(cinfo);
>>>   }
>>>
>>> @@ -945,6 +959,7 @@ static int add_new_disk(struct mddev *mddev, struct md_rdev *rdev)
>>>   	memcpy(cmsg.uuid, uuid, 16);
>>>   	cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
>>>   	lock_comm(cinfo);
>>> +	set_bit(MD_CLUSTER_COMM_LOCKED, &cinfo->state);
>>>   	ret = __sendmsg(cinfo, &cmsg);
>>>   	if (ret)
>>>   		return ret;
>>> @@ -964,6 +979,7 @@ static int add_new_disk(struct mddev *mddev, struct md_rdev *rdev)
>>>   static void add_new_disk_cancel(struct mddev *mddev)
>>>   {
>>>   	struct md_cluster_info *cinfo = mddev->cluster_info;
>>> +	clear_bit(MD_CLUSTER_COMM_LOCKED, &cinfo->state);
>>>   	unlock_comm(cinfo);
>>>   }
>>>
>>> --
>>> 1.8.5.6
>
> -- 
> Goldwyn

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 818 bytes --]

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [v2 PATCH] md-cluster: Defer MD reloading to mddev->thread
  2015-11-10  3:26     ` Goldwyn Rodrigues
@ 2015-11-20  8:25       ` Guoqing Jiang
  0 siblings, 0 replies; 17+ messages in thread
From: Guoqing Jiang @ 2015-11-20  8:25 UTC (permalink / raw)
  To: neilb, rgoldwyn; +Cc: linux-raid, Guoqing Jiang, Goldwyn Rodrigues

Reloading of superblock must be performed under reconfig_mutex. However,
this cannot be done with md_reload_sb because it would deadlock with
the message DLM lock. So, we defer it in md_check_recovery() which is
executed by mddev->thread.

This introduces a new flag, MD_RELOAD_SB, which if set, will reload the
superblock. And good_device_nr is also added to 'struct mddev' which is
used to get the num of the good device within cluster raid.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
---
v2 changes: remove atomic and add good_device_nr to struct mddev

 drivers/md/md-cluster.c | 4 +++-
 drivers/md/md.c         | 4 ++++
 drivers/md/md.h         | 4 ++++
 3 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index b58374d..89343c3 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -431,8 +431,10 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
 static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg)
 {
 	struct md_cluster_info *cinfo = mddev->cluster_info;
-	md_reload_sb(mddev, le32_to_cpu(msg->raid_slot));
+	mddev->good_device_nr = le32_to_cpu(msg->raid_slot);
+	set_bit(MD_RELOAD_SB, &mddev->flags);
 	dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
+	md_wakeup_thread(mddev->thread);
 }
 
 static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 7ab9ed9..06066e67 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -8260,6 +8260,7 @@ void md_check_recovery(struct mddev *mddev)
 		(mddev->flags & MD_UPDATE_SB_FLAGS & ~ (1<<MD_CHANGE_PENDING)) ||
 		test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
 		test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
+		test_bit(MD_RELOAD_SB, &mddev->flags) ||
 		(mddev->external == 0 && mddev->safemode == 1) ||
 		(mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
 		 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
@@ -8308,6 +8309,9 @@ void md_check_recovery(struct mddev *mddev)
 						rdev->raid_disk < 0)
 					md_kick_rdev_from_array(rdev);
 			}
+
+			if (test_and_clear_bit(MD_RELOAD_SB, &mddev->flags))
+				md_reload_sb(mddev, mddev->good_device_nr);
 		}
 
 		if (!mddev->external) {
diff --git a/drivers/md/md.h b/drivers/md/md.h
index f5b9aad..7a6da5c 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -235,6 +235,9 @@ struct mddev {
 				 */
 #define MD_JOURNAL_CLEAN 5	/* A raid with journal is already clean */
 #define MD_HAS_JOURNAL	6	/* The raid array has journal feature set */
+#define MD_RELOAD_SB	7	/* Reload the superblock because another node
+				 * updated it.
+				 */
 
 	int				suspended;
 	atomic_t			active_io;
@@ -465,6 +468,7 @@ struct mddev {
 	struct work_struct event_work;	/* used by dm to report failure event */
 	void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
 	struct md_cluster_info		*cluster_info;
+	unsigned int			good_device_nr;	/* good device num within cluster raid */
 };
 
 static inline int __must_check mddev_lock(struct mddev *mddev)
-- 
2.1.4


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [v2 PATCH 2/5] md-cluster: Protect communication with mutexes
  2015-11-12 21:59     ` NeilBrown
@ 2015-11-20  8:27       ` Guoqing Jiang
  0 siblings, 0 replies; 17+ messages in thread
From: Guoqing Jiang @ 2015-11-20  8:27 UTC (permalink / raw)
  To: neilb, rgoldwyn; +Cc: linux-raid, Guoqing Jiang, Goldwyn Rodrigues

Communication can happen through multiple threads. It is possible that
one thread steps over another threads sequence. So, we use mutexes to
protect both the send and receive sequences.

Send communication is locked through state bit, MD_CLUSTER_SEND_LOCK.
Communication is locked with bit manipulation in order to allow
"lock and hold" for the add operation. In case of an add operation,
if the lock is held, MD_CLUSTER_SEND_LOCKED_ALREADY is set.
When md_update_sb() calls metadata_update_start(), it checks
(in a single statement to avoid races), if the communication
is already locked. If yes, it merely returns zero, else it
locks the token lockresource.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
---
v2 changes:
1. replace send_mutex with wait_queue.
2. change MD_CLUSTER_COMM_LOCKED to MD_CLUSTER_SEND_LOCKED_ALREADY,
   and set it later with add_new_disk.
3. add a new lock_token and change lock_comm to make it better
   suit for the specific cases of locking communication.
   

 drivers/md/md-cluster.c | 72 ++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 62 insertions(+), 10 deletions(-)

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 89343c3..430b139 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -48,12 +48,26 @@ struct resync_info {
 #define		MD_CLUSTER_SUSPEND_READ_BALANCING	2
 #define		MD_CLUSTER_BEGIN_JOIN_CLUSTER		3
 
+/* Lock the send communication. This is done through
+ * bit manipulation as opposed to a mutex in order to
+ * accomodate lock and hold. See next comment.
+ */
+#define		MD_CLUSTER_SEND_LOCK			4
+/* If cluster operations must lock the communication channel,
+ * so as to perform extra operations (and no other operation
+ * is allowed on the MD, such as adding a disk. Token needs
+ * to be locked and held until the operation completes with
+ * a md_update_sb(), which would eventually release the lock.
+ */
+#define		MD_CLUSTER_SEND_LOCKED_ALREADY		5
+
 
 struct md_cluster_info {
 	/* dlm lock space and resources for clustered raid. */
 	dlm_lockspace_t *lockspace;
 	int slot_number;
 	struct completion completion;
+	struct mutex recv_mutex;
 	struct dlm_lock_resource *bitmap_lockres;
 	struct dlm_lock_resource *resync_lockres;
 	struct list_head suspend_list;
@@ -67,6 +81,7 @@ struct md_cluster_info {
 	struct dlm_lock_resource *no_new_dev_lockres;
 	struct md_thread *recv_thread;
 	struct completion newdisk_completion;
+	wait_queue_head_t wait;
 	unsigned long state;
 };
 
@@ -507,6 +522,7 @@ static void recv_daemon(struct md_thread *thread)
 	struct cluster_msg msg;
 	int ret;
 
+	mutex_lock(&cinfo->recv_mutex);
 	/*get CR on Message*/
 	if (dlm_lock_sync(message_lockres, DLM_LOCK_CR)) {
 		pr_err("md/raid1:failed to get CR on MESSAGE\n");
@@ -533,33 +549,45 @@ static void recv_daemon(struct md_thread *thread)
 	ret = dlm_unlock_sync(message_lockres);
 	if (unlikely(ret != 0))
 		pr_info("unlock msg failed return %d\n", ret);
+	mutex_unlock(&cinfo->recv_mutex);
 }
 
-/* lock_comm()
+/* lock_token()
  * Takes the lock on the TOKEN lock resource so no other
  * node can communicate while the operation is underway.
- * If called again, and the TOKEN lock is alread in EX mode
- * return success. However, care must be taken that unlock_comm()
- * is called only once.
  */
-static int lock_comm(struct md_cluster_info *cinfo)
+static int lock_token(struct md_cluster_info *cinfo)
 {
 	int error;
 
-	if (cinfo->token_lockres->mode == DLM_LOCK_EX)
-		return 0;
-
 	error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX);
 	if (error)
 		pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n",
 				__func__, __LINE__, error);
+
+	/* Lock the receive sequence */
+	mutex_lock(&cinfo->recv_mutex);
 	return error;
 }
 
+/* lock_comm()
+ * Sets the MD_CLUSTER_SEND_LOCK bit to lock the send channel.
+ */
+static int lock_comm(struct md_cluster_info *cinfo)
+{
+	wait_event(cinfo->wait,
+		   !test_and_set_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state));
+
+	return lock_token(cinfo);
+}
+
 static void unlock_comm(struct md_cluster_info *cinfo)
 {
 	WARN_ON(cinfo->token_lockres->mode != DLM_LOCK_EX);
+	mutex_unlock(&cinfo->recv_mutex);
 	dlm_unlock_sync(cinfo->token_lockres);
+	clear_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state);
+	wake_up(&cinfo->wait);
 }
 
 /* __sendmsg()
@@ -712,6 +740,8 @@ static int join(struct mddev *mddev, int nodes)
 	spin_lock_init(&cinfo->suspend_lock);
 	init_completion(&cinfo->completion);
 	set_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state);
+	init_waitqueue_head(&cinfo->wait);
+	mutex_init(&cinfo->recv_mutex);
 
 	mddev->cluster_info = cinfo;
 
@@ -840,9 +870,25 @@ static int slot_number(struct mddev *mddev)
 	return cinfo->slot_number - 1;
 }
 
+/*
+ * Check if the communication is already locked, else lock the communication
+ * channel.
+ * If it is already locked, token is in EX mode, and hence lock_token()
+ * should not be called.
+ */
 static int metadata_update_start(struct mddev *mddev)
 {
-	return lock_comm(mddev->cluster_info);
+	struct md_cluster_info *cinfo = mddev->cluster_info;
+
+	wait_event(cinfo->wait,
+		   !test_and_set_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state) ||
+		   test_and_clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state));
+
+	/* If token is already locked, return 0 */
+	if (cinfo->token_lockres->mode == DLM_LOCK_EX)
+		return 0;
+
+	return lock_token(cinfo);
 }
 
 static int metadata_update_finish(struct mddev *mddev)
@@ -867,6 +913,7 @@ static int metadata_update_finish(struct mddev *mddev)
 		ret = __sendmsg(cinfo, &cmsg);
 	} else
 		pr_warn("md-cluster: No good device id found to send\n");
+	clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state);
 	unlock_comm(cinfo);
 	return ret;
 }
@@ -874,6 +921,7 @@ static int metadata_update_finish(struct mddev *mddev)
 static void metadata_update_cancel(struct mddev *mddev)
 {
 	struct md_cluster_info *cinfo = mddev->cluster_info;
+	clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state);
 	unlock_comm(cinfo);
 }
 
@@ -967,14 +1015,18 @@ static int add_new_disk(struct mddev *mddev, struct md_rdev *rdev)
 		ret = -ENOENT;
 	if (ret)
 		unlock_comm(cinfo);
-	else
+	else {
 		dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
+		set_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state);
+		wake_up(&cinfo->wait);
+	}
 	return ret;
 }
 
 static void add_new_disk_cancel(struct mddev *mddev)
 {
 	struct md_cluster_info *cinfo = mddev->cluster_info;
+	clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state);
 	unlock_comm(cinfo);
 }
 
-- 
2.1.4


^ permalink raw reply related	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2015-11-20  8:27 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-11-06  3:50 [PATCH 1/6] md-cluster: Protect communication with mutexes rgoldwyn
2015-11-06  3:50 ` [PATCH 2/6] md-cluster: Avoid the resync ping-pong rgoldwyn
2015-11-09 23:39   ` NeilBrown
2015-11-06  3:50 ` [PATCH 3/6] md-cluster: remove a disk asynchronously from cluster environment rgoldwyn
2015-11-09 23:43   ` NeilBrown
2015-11-06  3:50 ` [PATCH 4/6] md-cluster: Defer MD reloading to mddev->thread rgoldwyn
2015-11-09 23:48   ` NeilBrown
2015-11-10  3:26     ` Goldwyn Rodrigues
2015-11-20  8:25       ` [v2 PATCH] " Guoqing Jiang
2015-11-06  3:50 ` [PATCH 5/6] md-cluster: Fix the remove sequence with the new MD reload code rgoldwyn
2015-11-09 23:49   ` NeilBrown
2015-11-06  3:50 ` [PATCH 6/6] md-cluster: Allow spare devices to be marked as faulty rgoldwyn
2015-11-09 23:51   ` NeilBrown
2015-11-09 23:31 ` [PATCH 1/6] md-cluster: Protect communication with mutexes NeilBrown
2015-11-10  3:23   ` Goldwyn Rodrigues
2015-11-12 21:59     ` NeilBrown
2015-11-20  8:27       ` [v2 PATCH 2/5] " Guoqing Jiang

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.