[PATCH 1/2] md-cluster: fix wrong condition check in raid1_write

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH 1/2] md-cluster: fix wrong condition check in raid1_write_request
@ 2017-09-29  1:16 Guoqing Jiang
  2017-09-29  1:16 ` [PATCH 2/2] md: clear THREAD_WAKEUP flag in mddev_trylock Guoqing Jiang
  2017-09-29 17:54 ` [PATCH 1/2] md-cluster: fix wrong condition check in raid1_write_request Shaohua Li
  0 siblings, 2 replies; 4+ messages in thread
From: Guoqing Jiang @ 2017-09-29  1:16 UTC (permalink / raw)
  To: linux-raid; +Cc: shli, Guoqing Jiang

The check used here is to avoid conflict between write and
resync, however we used the wrong logic, it should be the
inverse of the checking inside "if".

Fixes: 589a1c4 ("Suspend writes in RAID1 if within range")
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
---
 drivers/md/raid1.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index f3f3e40..35264ad 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1325,12 +1325,12 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 			sigset_t full, old;
 			prepare_to_wait(&conf->wait_barrier,
 					&w, TASK_INTERRUPTIBLE);
-			if (bio_end_sector(bio) <= mddev->suspend_lo ||
-			    bio->bi_iter.bi_sector >= mddev->suspend_hi ||
-			    (mddev_is_clustered(mddev) &&
+			if ((bio_end_sector(bio) <= mddev->suspend_lo ||
+			     bio->bi_iter.bi_sector >= mddev->suspend_hi) &&
+			    (!mddev_is_clustered(mddev) ||
 			     !md_cluster_ops->area_resyncing(mddev, WRITE,
-				     bio->bi_iter.bi_sector,
-				     bio_end_sector(bio))))
+							bio->bi_iter.bi_sector,
+							bio_end_sector(bio))))
 				break;
 			sigfillset(&full);
 			sigprocmask(SIG_BLOCK, &full, &old);
-- 
2.6.6


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* [PATCH 2/2] md: clear THREAD_WAKEUP flag in mddev_trylock
  2017-09-29  1:16 [PATCH 1/2] md-cluster: fix wrong condition check in raid1_write_request Guoqing Jiang
@ 2017-09-29  1:16 ` Guoqing Jiang
  2017-09-29 18:00   ` Shaohua Li
  2017-09-29 17:54 ` [PATCH 1/2] md-cluster: fix wrong condition check in raid1_write_request Shaohua Li
  1 sibling, 1 reply; 4+ messages in thread
From: Guoqing Jiang @ 2017-09-29  1:16 UTC (permalink / raw)
  To: linux-raid; +Cc: shli, Guoqing Jiang

Since commit 4ad23a976413aa57fe5ba7a25953dc35ccca5b71 ("MD: use
per-cpu counter for writes_pending"), the wait_queue is only got
invoked if THREAD_WAKEUP is not set previously.

With above change, I can see process_metadata_update could always
hang on the wait queue, because mddev->thread could stay on 'D'
status and the THREAD_WAKEUP flag is not cleared since there are
lots of place to wake up mddev->thread. Then deadlock happened
as follows:

linux175:~ # ps aux|grep md|grep D
root    20117   0.0 0.0         0   0 ? D   03:45   0:00 [md0_raid1]
root    20125   0.0 0.0         0   0 ? D   03:45   0:00 [md0_cluster_rec]
linux175:~ # cat /proc/20117/stack
[<ffffffffa0635604>] dlm_lock_sync+0x94/0xd0 [md_cluster]
[<ffffffffa0635674>] lock_token+0x34/0xd0 [md_cluster]
[<ffffffffa0635804>] metadata_update_start+0x64/0x110 [md_cluster]
[<ffffffffa04d985b>] md_update_sb.part.58+0x9b/0x860 [md_mod]
[<ffffffffa04da035>] md_update_sb+0x15/0x30 [md_mod]
[<ffffffffa04dc066>] md_check_recovery+0x266/0x490 [md_mod]
[<ffffffffa06450e2>] raid1d+0x42/0x810 [raid1]
[<ffffffffa04d2252>] md_thread+0x122/0x150 [md_mod]
[<ffffffff81091741>] kthread+0x101/0x140
linux175:~ # cat /proc/20125/stack
[<ffffffffa0636679>] recv_daemon+0x3f9/0x5c0 [md_cluster]
[<ffffffffa04d2252>] md_thread+0x122/0x150 [md_mod]
[<ffffffff81091741>] kthread+0x101/0x140

To resolve the problem, we need to clear THREAD_WAKEUP in case
mddev_trylock failed to get reconfig_mutex, which could ensure
the wait queue can be waked by call md_wakeup_thread. And this
issue maybe affects mddev_trylock in md_check_recovery as well.

With the new change, mddev_trylock is not suitable as static-inline
function, so move it to md.c.

Fixes: 4ad23a976413 ("MD: use per-cpu counter for writes_pending")
Suggested-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
---
 drivers/md/md.c | 14 ++++++++++++++
 drivers/md/md.h |  5 +----
 2 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 0ff1bbf..7c2517a 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -623,6 +623,20 @@ static struct mddev *mddev_find(dev_t unit)
 
 static struct attribute_group md_redundancy_group;
 
+int mddev_trylock(struct mddev *mddev)
+{
+	if (mutex_trylock(&mddev->reconfig_mutex))
+		return 1;
+
+	spin_lock(&pers_lock);
+	if (mddev->thread)
+		clear_bit(THREAD_WAKEUP, &mddev->thread->flags);
+	spin_unlock(&pers_lock);
+
+	return mutex_trylock(&mddev->reconfig_mutex);
+}
+EXPORT_SYMBOL_GPL(mddev_trylock);
+
 void mddev_unlock(struct mddev *mddev)
 {
 	if (mddev->to_remove) {
diff --git a/drivers/md/md.h b/drivers/md/md.h
index d8287d3..7fe7b48 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -499,10 +499,7 @@ static inline int mddev_is_locked(struct mddev *mddev)
 	return mutex_is_locked(&mddev->reconfig_mutex);
 }
 
-static inline int mddev_trylock(struct mddev *mddev)
-{
-	return mutex_trylock(&mddev->reconfig_mutex);
-}
+extern int mddev_trylock(struct mddev *mddev);
 extern void mddev_unlock(struct mddev *mddev);
 
 static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors)
-- 
2.6.6


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH 1/2] md-cluster: fix wrong condition check in raid1_write_request
  2017-09-29  1:16 [PATCH 1/2] md-cluster: fix wrong condition check in raid1_write_request Guoqing Jiang
  2017-09-29  1:16 ` [PATCH 2/2] md: clear THREAD_WAKEUP flag in mddev_trylock Guoqing Jiang
@ 2017-09-29 17:54 ` Shaohua Li
  1 sibling, 0 replies; 4+ messages in thread
From: Shaohua Li @ 2017-09-29 17:54 UTC (permalink / raw)
  To: Guoqing Jiang; +Cc: linux-raid, shli

On Fri, Sep 29, 2017 at 09:16:43AM +0800, Guoqing Jiang wrote:
> The check used here is to avoid conflict between write and
> resync, however we used the wrong logic, it should be the
> inverse of the checking inside "if".
> 
> Fixes: 589a1c4 ("Suspend writes in RAID1 if within range")
> Signed-off-by: Guoqing Jiang <gqjiang@suse.com>

applied, thanks!
> ---
>  drivers/md/raid1.c | 10 +++++-----
>  1 file changed, 5 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
> index f3f3e40..35264ad 100644
> --- a/drivers/md/raid1.c
> +++ b/drivers/md/raid1.c
> @@ -1325,12 +1325,12 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
>  			sigset_t full, old;
>  			prepare_to_wait(&conf->wait_barrier,
>  					&w, TASK_INTERRUPTIBLE);
> -			if (bio_end_sector(bio) <= mddev->suspend_lo ||
> -			    bio->bi_iter.bi_sector >= mddev->suspend_hi ||
> -			    (mddev_is_clustered(mddev) &&
> +			if ((bio_end_sector(bio) <= mddev->suspend_lo ||
> +			     bio->bi_iter.bi_sector >= mddev->suspend_hi) &&
> +			    (!mddev_is_clustered(mddev) ||
>  			     !md_cluster_ops->area_resyncing(mddev, WRITE,
> -				     bio->bi_iter.bi_sector,
> -				     bio_end_sector(bio))))
> +							bio->bi_iter.bi_sector,
> +							bio_end_sector(bio))))
>  				break;
>  			sigfillset(&full);
>  			sigprocmask(SIG_BLOCK, &full, &old);
> -- 
> 2.6.6
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-raid" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH 2/2] md: clear THREAD_WAKEUP flag in mddev_trylock
  2017-09-29  1:16 ` [PATCH 2/2] md: clear THREAD_WAKEUP flag in mddev_trylock Guoqing Jiang
@ 2017-09-29 18:00   ` Shaohua Li
  0 siblings, 0 replies; 4+ messages in thread
From: Shaohua Li @ 2017-09-29 18:00 UTC (permalink / raw)
  To: Guoqing Jiang; +Cc: linux-raid

On Fri, Sep 29, 2017 at 09:16:44AM +0800, Guoqing Jiang wrote:
> Since commit 4ad23a976413aa57fe5ba7a25953dc35ccca5b71 ("MD: use
> per-cpu counter for writes_pending"), the wait_queue is only got
> invoked if THREAD_WAKEUP is not set previously.
> 
> With above change, I can see process_metadata_update could always
> hang on the wait queue, because mddev->thread could stay on 'D'
> status and the THREAD_WAKEUP flag is not cleared since there are
> lots of place to wake up mddev->thread. Then deadlock happened
> as follows:
> 
> linux175:~ # ps aux|grep md|grep D
> root    20117   0.0 0.0         0   0 ? D   03:45   0:00 [md0_raid1]
> root    20125   0.0 0.0         0   0 ? D   03:45   0:00 [md0_cluster_rec]
> linux175:~ # cat /proc/20117/stack
> [<ffffffffa0635604>] dlm_lock_sync+0x94/0xd0 [md_cluster]
> [<ffffffffa0635674>] lock_token+0x34/0xd0 [md_cluster]
> [<ffffffffa0635804>] metadata_update_start+0x64/0x110 [md_cluster]
> [<ffffffffa04d985b>] md_update_sb.part.58+0x9b/0x860 [md_mod]
> [<ffffffffa04da035>] md_update_sb+0x15/0x30 [md_mod]
> [<ffffffffa04dc066>] md_check_recovery+0x266/0x490 [md_mod]
> [<ffffffffa06450e2>] raid1d+0x42/0x810 [raid1]
> [<ffffffffa04d2252>] md_thread+0x122/0x150 [md_mod]
> [<ffffffff81091741>] kthread+0x101/0x140
> linux175:~ # cat /proc/20125/stack
> [<ffffffffa0636679>] recv_daemon+0x3f9/0x5c0 [md_cluster]
> [<ffffffffa04d2252>] md_thread+0x122/0x150 [md_mod]
> [<ffffffff81091741>] kthread+0x101/0x140
> 
> To resolve the problem, we need to clear THREAD_WAKEUP in case
> mddev_trylock failed to get reconfig_mutex, which could ensure
> the wait queue can be waked by call md_wakeup_thread. And this
> issue maybe affects mddev_trylock in md_check_recovery as well.
> 
> With the new change, mddev_trylock is not suitable as static-inline
> function, so move it to md.c.

This really looks like a hack to me. Clearing the THREAD_WAKEUP and taking the
reconfig_mutex lock has no relationship. We clear the bit just because trylock
happens to be called with the bit set. This is very fragile. What if we have
another path/lock which can fail and have the THREAD_WAKEUP set? I'd revert
that part of code in 4ad23a976413, which probably can optimize things a little
bit, but not that much.

Thanks,
Shaohua
> Fixes: 4ad23a976413 ("MD: use per-cpu counter for writes_pending")
> Suggested-by: NeilBrown <neilb@suse.com>
> Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
> ---
>  drivers/md/md.c | 14 ++++++++++++++
>  drivers/md/md.h |  5 +----
>  2 files changed, 15 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 0ff1bbf..7c2517a 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -623,6 +623,20 @@ static struct mddev *mddev_find(dev_t unit)
>  
>  static struct attribute_group md_redundancy_group;
>  
> +int mddev_trylock(struct mddev *mddev)
> +{
> +	if (mutex_trylock(&mddev->reconfig_mutex))
> +		return 1;
> +
> +	spin_lock(&pers_lock);
> +	if (mddev->thread)
> +		clear_bit(THREAD_WAKEUP, &mddev->thread->flags);
> +	spin_unlock(&pers_lock);
> +
> +	return mutex_trylock(&mddev->reconfig_mutex);
> +}
> +EXPORT_SYMBOL_GPL(mddev_trylock);
> +
>  void mddev_unlock(struct mddev *mddev)
>  {
>  	if (mddev->to_remove) {
> diff --git a/drivers/md/md.h b/drivers/md/md.h
> index d8287d3..7fe7b48 100644
> --- a/drivers/md/md.h
> +++ b/drivers/md/md.h
> @@ -499,10 +499,7 @@ static inline int mddev_is_locked(struct mddev *mddev)
>  	return mutex_is_locked(&mddev->reconfig_mutex);
>  }
>  
> -static inline int mddev_trylock(struct mddev *mddev)
> -{
> -	return mutex_trylock(&mddev->reconfig_mutex);
> -}
> +extern int mddev_trylock(struct mddev *mddev);
>  extern void mddev_unlock(struct mddev *mddev);
>  
>  static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors)
> -- 
> 2.6.6
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-raid" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2017-09-29 18:00 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-09-29  1:16 [PATCH 1/2] md-cluster: fix wrong condition check in raid1_write_request Guoqing Jiang
2017-09-29  1:16 ` [PATCH 2/2] md: clear THREAD_WAKEUP flag in mddev_trylock Guoqing Jiang
2017-09-29 18:00   ` Shaohua Li
2017-09-29 17:54 ` [PATCH 1/2] md-cluster: fix wrong condition check in raid1_write_request Shaohua Li

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.