* [RFC] md/raid5: fix hung by MD_SB_CHANGE_PENDING
@ 2023-11-01 23:02 Junxiao Bi
  2023-11-02  1:24 ` Yu Kuai
                   ` (2 more replies)
  0 siblings, 3 replies; 8+ messages in thread
From: Junxiao Bi @ 2023-11-01 23:02 UTC (permalink / raw)
  To: linux-raid; +Cc: song, logang, junxiao.bi

Looks like there is a race between md_write_start() and raid5d() which
can cause a deadlock. We ran into this issue while raid_check was running.

 md_write_start:                                                                        raid5d:
 if (mddev->safemode == 1)
     mddev->safemode = 0;
 /* sync_checkers is always 0 when writes_pending is in per-cpu mode */
 if (mddev->in_sync || mddev->sync_checkers) {
     spin_lock(&mddev->lock);
     if (mddev->in_sync) {
         mddev->in_sync = 0;
         set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
         set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
                                                                                         >>> running before md_write_start wakes it up
                                                                                        if (mddev->sb_flags & ~(1 << MD_SB_CHANGE_PENDING)) {
                                                                                              spin_unlock_irq(&conf->device_lock);
                                                                                              md_check_recovery(mddev);
                                                                                              spin_lock_irq(&conf->device_lock);

                                                                                              /*
                                                                                               * Waiting on MD_SB_CHANGE_PENDING below may deadlock
                                                                                               * seeing md_check_recovery() is needed to clear
                                                                                               * the flag when using mdmon.
                                                                                               */
                                                                                              continue;
                                                                                         }

                                                                                         wait_event_lock_irq(mddev->sb_wait,     >>>>>>>>>>> hung
                                                                                             !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
                                                                                             conf->device_lock);
         md_wakeup_thread(mddev->thread);
         did_change = 1;
     }
     spin_unlock(&mddev->lock);
 }

 ...

 wait_event(mddev->sb_wait,    >>>>>>>>>> hung
    !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) ||
    mddev->suspended);

Commit 5e2cf333b7bd ("md/raid5: Wait for MD_SB_CHANGE_PENDING in raid5d")
introduced this issue; since it wanted a reschedule for flushing the blk_plug,
let's do that explicitly to address the issue.

Fixes: 5e2cf333b7bd ("md/raid5: Wait for MD_SB_CHANGE_PENDING in raid5d")
Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
---
 block/blk-core.c   | 1 +
 drivers/md/raid5.c | 9 +++++----
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 9d51e9894ece..bc8757a78477 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1149,6 +1149,7 @@ void __blk_flush_plug(struct blk_plug *plug, bool from_schedule)
 	if (unlikely(!rq_list_empty(plug->cached_rq)))
 		blk_mq_free_plug_rqs(plug);
 }
+EXPORT_SYMBOL(__blk_flush_plug);
 
 /**
  * blk_finish_plug - mark the end of a batch of submitted I/O
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 284cd71bcc68..25ec82f2813f 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -6850,11 +6850,12 @@ static void raid5d(struct md_thread *thread)
 			 * the flag when using mdmon.
 			 */
 			continue;
+		} else {
+			spin_unlock_irq(&conf->device_lock);
+			blk_flush_plug(current);
+			cond_resched();
+			spin_lock_irq(&conf->device_lock);
 		}
-
-		wait_event_lock_irq(mddev->sb_wait,
-			!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
-			conf->device_lock);
 	}
 	pr_debug("%d stripes handled\n", handled);
 
-- 
2.39.3 (Apple Git-145)



* Re: [RFC] md/raid5: fix hung by MD_SB_CHANGE_PENDING
  2023-11-01 23:02 [RFC] md/raid5: fix hung by MD_SB_CHANGE_PENDING Junxiao Bi
@ 2023-11-02  1:24 ` Yu Kuai
  2023-11-02  4:32   ` junxiao.bi
  2023-11-02  4:59 ` kernel test robot
  2023-11-02  4:59 ` kernel test robot
  2 siblings, 1 reply; 8+ messages in thread
From: Yu Kuai @ 2023-11-02  1:24 UTC (permalink / raw)
  To: Junxiao Bi, linux-raid; +Cc: song, logang, yukuai (C)

Hi,

On 2023/11/02 7:02, Junxiao Bi wrote:
> Looks like there is a race between md_write_start() and raid5d() which

Is this a real issue or just based on code review?
> can cause a deadlock. We ran into this issue while raid_check was running.
> 
>   md_write_start:                                                                        raid5d:
>   if (mddev->safemode == 1)
>       mddev->safemode = 0;
>   /* sync_checkers is always 0 when writes_pending is in per-cpu mode */
>   if (mddev->in_sync || mddev->sync_checkers) {
>       spin_lock(&mddev->lock);
>       if (mddev->in_sync) {
>           mddev->in_sync = 0;
>           set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
>           set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
>                                                                                          >>> running before md_write_start wake up it
>                                                                                          if (mddev->sb_flags & ~(1 << MD_SB_CHANGE_PENDING)) {
>                                                                                                spin_unlock_irq(&conf->device_lock);
>                                                                                                md_check_recovery(mddev);
>                                                                                                spin_lock_irq(&conf->device_lock);
> 
>                                                                                                /*
>                                                                                                 * Waiting on MD_SB_CHANGE_PENDING below may deadlock
>                                                                                                 * seeing md_check_recovery() is needed to clear
>                                                                                                 * the flag when using mdmon.
>                                                                                                 */
>                                                                                                continue;
>                                                                                           }
> 
>                                                                                           wait_event_lock_irq(mddev->sb_wait,     >>>>>>>>>>> hung
>                                                                                               !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
>                                                                                               conf->device_lock);
>           md_wakeup_thread(mddev->thread);
>           did_change = 1;
>       }
>       spin_unlock(&mddev->lock);
>   }
> 
>   ...
> 
>   wait_event(mddev->sb_wait,    >>>>>>>>>> hung
>      !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) ||
>      mddev->suspended);
> 

This is not correct: if the daemon thread is running, md_wakeup_thread()
will make sure that the daemon thread runs again; see how THREAD_WAKEUP
works in md_thread() for details.
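
For reference, the loop in question looks roughly like this (a simplified
sketch of md_thread() in drivers/md/md.c, not the verbatim kernel code):

static int md_thread(void *arg)
{
	struct md_thread *thread = arg;

	while (!kthread_should_stop()) {
		/*
		 * md_wakeup_thread() sets THREAD_WAKEUP and wakes wqueue, so
		 * a wakeup that arrives while thread->run() is executing is
		 * not lost: the bit stays set and run() is called again.
		 */
		wait_event_interruptible_timeout(thread->wqueue,
			test_bit(THREAD_WAKEUP, &thread->flags) ||
			kthread_should_stop(),
			thread->timeout);

		clear_bit(THREAD_WAKEUP, &thread->flags);
		if (!kthread_should_stop())
			thread->run(thread);
	}
	return 0;
}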

Thanks,
Kuai

> Commit 5e2cf333b7bd ("md/raid5: Wait for MD_SB_CHANGE_PENDING in raid5d")
> introduced this issue; since it wanted a reschedule for flushing the blk_plug,
> let's do that explicitly to address the issue.
> 
> Fixes: 5e2cf333b7bd ("md/raid5: Wait for MD_SB_CHANGE_PENDING in raid5d")
> Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
> ---
>   block/blk-core.c   | 1 +
>   drivers/md/raid5.c | 9 +++++----
>   2 files changed, 6 insertions(+), 4 deletions(-)
> 
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 9d51e9894ece..bc8757a78477 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -1149,6 +1149,7 @@ void __blk_flush_plug(struct blk_plug *plug, bool from_schedule)
>   	if (unlikely(!rq_list_empty(plug->cached_rq)))
>   		blk_mq_free_plug_rqs(plug);
>   }
> +EXPORT_SYMBOL(__blk_flush_plug);
>   
>   /**
>    * blk_finish_plug - mark the end of a batch of submitted I/O
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index 284cd71bcc68..25ec82f2813f 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -6850,11 +6850,12 @@ static void raid5d(struct md_thread *thread)
>   			 * the flag when using mdmon.
>   			 */
>   			continue;
> +		} else {
> +			spin_unlock_irq(&conf->device_lock);
> +			blk_flush_plug(current);
> +			cond_resched();
> +			spin_lock_irq(&conf->device_lock);
>   		}
> -
> -		wait_event_lock_irq(mddev->sb_wait,
> -			!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
> -			conf->device_lock);
>   	}
>   	pr_debug("%d stripes handled\n", handled);
>   
> 



* Re: [RFC] md/raid5: fix hung by MD_SB_CHANGE_PENDING
  2023-11-02  1:24 ` Yu Kuai
@ 2023-11-02  4:32   ` junxiao.bi
  2023-11-02  7:16     ` Yu Kuai
  0 siblings, 1 reply; 8+ messages in thread
From: junxiao.bi @ 2023-11-02  4:32 UTC (permalink / raw)
  To: Yu Kuai, linux-raid; +Cc: song, logang, yukuai (C)

On 11/1/23 6:24 PM, Yu Kuai wrote:

> Hi,
>
> On 2023/11/02 7:02, Junxiao Bi wrote:
>> Looks like there is a race between md_write_start() and raid5d() which
>
> Is this a real issue or just based on code review?

It's a real issue; we saw this hang in a production system. It was with
v5.4, but I didn't see much difference in these functions.

crash> bt 2683
PID: 2683     TASK: ffff9d3b3e651f00  CPU: 65   COMMAND: "md0_raid5"
  #0 [ffffbd7a0252bd08] __schedule at ffffffffa8e68931
  #1 [ffffbd7a0252bd88] schedule at ffffffffa8e68c6f
  #2 [ffffbd7a0252bda8] raid5d at ffffffffc0b4df16 [raid456]
  #3 [ffffbd7a0252bea0] md_thread at ffffffffa8bc20b8
  #4 [ffffbd7a0252bf08] kthread at ffffffffa84dac05
  #5 [ffffbd7a0252bf50] ret_from_fork at ffffffffa9000364
crash> ps -m 2683
[ 0 00:11:08.244] [UN]  PID: 2683     TASK: ffff9d3b3e651f00  CPU: 65   COMMAND: "md0_raid5"
crash>
crash> bt 96352
PID: 96352    TASK: ffff9cc587c95d00  CPU: 64   COMMAND: "kworker/64:0"
  #0 [ffffbd7a07533be0] __schedule at ffffffffa8e68931
  #1 [ffffbd7a07533c60] schedule at ffffffffa8e68c6f
  #2 [ffffbd7a07533c80] md_write_start at ffffffffa8bc47c5
  #3 [ffffbd7a07533ce0] raid5_make_request at ffffffffc0b4a4c9 [raid456]
  #4 [ffffbd7a07533dc8] md_handle_request at ffffffffa8bbfa54
  #5 [ffffbd7a07533e38] md_submit_flush_data at ffffffffa8bc04c1
  #6 [ffffbd7a07533e60] process_one_work at ffffffffa84d4289
  #7 [ffffbd7a07533ea8] worker_thread at ffffffffa84d50cf
  #8 [ffffbd7a07533f08] kthread at ffffffffa84dac05
  #9 [ffffbd7a07533f50] ret_from_fork at ffffffffa9000364
crash> ps -m 96352
[ 0 00:11:08.244] [UN]  PID: 96352    TASK: ffff9cc587c95d00  CPU: 64   COMMAND: "kworker/64:0"
crash>
crash> bt 25542
PID: 25542    TASK: ffff9cb4cb528000  CPU: 32   COMMAND: "md0_resync"
  #0 [ffffbd7a09387c80] __schedule at ffffffffa8e68931
  #1 [ffffbd7a09387d00] schedule at ffffffffa8e68c6f
  #2 [ffffbd7a09387d20] md_do_sync at ffffffffa8bc613e
  #3 [ffffbd7a09387ea0] md_thread at ffffffffa8bc20b8
  #4 [ffffbd7a09387f08] kthread at ffffffffa84dac05
  #5 [ffffbd7a09387f50] ret_from_fork at ffffffffa9000364
crash>
crash> ps -m 25542
[ 0 00:11:18.370] [UN]  PID: 25542    TASK: ffff9cb4cb528000  CPU: 32   COMMAND: "md0_resync"


>> can cause a deadlock. We ran into this issue while raid_check was running.
>>
>>  md_write_start:                                  raid5d:
>>  if (mddev->safemode == 1)
>>      mddev->safemode = 0;
>>  /* sync_checkers is always 0 when writes_pending is in per-cpu mode */
>>  if (mddev->in_sync || mddev->sync_checkers) {
>>      spin_lock(&mddev->lock);
>>      if (mddev->in_sync) {
>>          mddev->in_sync = 0;
>>          set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
>>          set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
>>                                                   >>> running before md_write_start wakes it up
>>                                                   if (mddev->sb_flags & ~(1 << MD_SB_CHANGE_PENDING)) {
>>                                                         spin_unlock_irq(&conf->device_lock);
>>                                                         md_check_recovery(mddev);
>>                                                         spin_lock_irq(&conf->device_lock);
>>
>>                                                         /*
>>                                                          * Waiting on MD_SB_CHANGE_PENDING below may deadlock
>>                                                          * seeing md_check_recovery() is needed to clear
>>                                                          * the flag when using mdmon.
>>                                                          */
>>                                                         continue;
>>                                                    }
>>
>>                                                    wait_event_lock_irq(mddev->sb_wait,     >>>>>>>>>>> hung
>>                                                        !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
>>                                                        conf->device_lock);
>>          md_wakeup_thread(mddev->thread);
>>          did_change = 1;
>>      }
>>      spin_unlock(&mddev->lock);
>>  }
>>
>>  ...
>>
>>  wait_event(mddev->sb_wait,    >>>>>>>>>> hung
>>     !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) ||
>>     mddev->suspended);
>>
>
> This is not correct: if the daemon thread is running, md_wakeup_thread()
> will make sure that the daemon thread runs again; see how THREAD_WAKEUP
> works in md_thread() for details.

The daemon thread was waiting for MD_SB_CHANGE_PENDING to be cleared; even
if we wake it up, it will hang again since that flag is still not cleared?

Thanks,

Junxiao.

>
> Thanks,
> Kuai
>
>> Commit 5e2cf333b7bd ("md/raid5: Wait for MD_SB_CHANGE_PENDING in raid5d")
>> introduced this issue; since it wanted a reschedule for flushing the blk_plug,
>> let's do that explicitly to address the issue.
>>
>> Fixes: 5e2cf333b7bd ("md/raid5: Wait for MD_SB_CHANGE_PENDING in raid5d")
>> Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
>> ---
>>   block/blk-core.c   | 1 +
>>   drivers/md/raid5.c | 9 +++++----
>>   2 files changed, 6 insertions(+), 4 deletions(-)
>>
>> diff --git a/block/blk-core.c b/block/blk-core.c
>> index 9d51e9894ece..bc8757a78477 100644
>> --- a/block/blk-core.c
>> +++ b/block/blk-core.c
>> @@ -1149,6 +1149,7 @@ void __blk_flush_plug(struct blk_plug *plug, bool from_schedule)
>>       if (unlikely(!rq_list_empty(plug->cached_rq)))
>>           blk_mq_free_plug_rqs(plug);
>>   }
>> +EXPORT_SYMBOL(__blk_flush_plug);
>>
>>   /**
>>    * blk_finish_plug - mark the end of a batch of submitted I/O
>> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
>> index 284cd71bcc68..25ec82f2813f 100644
>> --- a/drivers/md/raid5.c
>> +++ b/drivers/md/raid5.c
>> @@ -6850,11 +6850,12 @@ static void raid5d(struct md_thread *thread)
>>                * the flag when using mdmon.
>>                */
>>               continue;
>> +        } else {
>> +            spin_unlock_irq(&conf->device_lock);
>> +            blk_flush_plug(current);
>> +            cond_resched();
>> +            spin_lock_irq(&conf->device_lock);
>>           }
>> -
>> -        wait_event_lock_irq(mddev->sb_wait,
>> -            !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
>> -            conf->device_lock);
>>       }
>>       pr_debug("%d stripes handled\n", handled);
>>
>


* Re: [RFC] md/raid5: fix hung by MD_SB_CHANGE_PENDING
  2023-11-01 23:02 [RFC] md/raid5: fix hung by MD_SB_CHANGE_PENDING Junxiao Bi
  2023-11-02  1:24 ` Yu Kuai
@ 2023-11-02  4:59 ` kernel test robot
  2023-11-02  4:59 ` kernel test robot
  2 siblings, 0 replies; 8+ messages in thread
From: kernel test robot @ 2023-11-02  4:59 UTC (permalink / raw)
  To: Junxiao Bi; +Cc: oe-kbuild-all

Hi Junxiao,

[This is a private test report for your RFC patch.]
kernel test robot noticed the following build errors:

[auto build test ERROR on song-md/md-next]
[also build test ERROR on axboe-block/for-next linus/master v6.6 next-20231102]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Junxiao-Bi/md-raid5-fix-hung-by-MD_SB_CHANGE_PENDING/20231102-070925
base:   git://git.kernel.org/pub/scm/linux/kernel/git/song/md.git md-next
patch link:    https://lore.kernel.org/r/20231101230214.57190-1-junxiao.bi%40oracle.com
patch subject: [RFC] md/raid5: fix hung by MD_SB_CHANGE_PENDING
config: sparc64-defconfig (https://download.01.org/0day-ci/archive/20231102/202311021202.Dm9g43Zm-lkp@intel.com/config)
compiler: sparc64-linux-gcc (GCC) 13.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20231102/202311021202.Dm9g43Zm-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202311021202.Dm9g43Zm-lkp@intel.com/

All errors (new ones prefixed by >>):

   drivers/md/raid5.c: In function 'raid5d':
>> drivers/md/raid5.c:6825:40: error: passing argument 1 of 'blk_flush_plug' from incompatible pointer type [-Werror=incompatible-pointer-types]
    6825 |                         blk_flush_plug(current);
         |                                        ^~~~~~~
         |                                        |
         |                                        struct task_struct *
   In file included from drivers/md/raid5.c:38:
   include/linux/blkdev.h:992:52: note: expected 'struct blk_plug *' but argument is of type 'struct task_struct *'
     992 | static inline void blk_flush_plug(struct blk_plug *plug, bool async)
         |                                   ~~~~~~~~~~~~~~~~~^~~~
>> drivers/md/raid5.c:6825:25: error: too few arguments to function 'blk_flush_plug'
    6825 |                         blk_flush_plug(current);
         |                         ^~~~~~~~~~~~~~
   include/linux/blkdev.h:992:20: note: declared here
     992 | static inline void blk_flush_plug(struct blk_plug *plug, bool async)
         |                    ^~~~~~~~~~~~~~
   cc1: some warnings being treated as errors


vim +/blk_flush_plug +6825 drivers/md/raid5.c

  6753	
  6754	/*
  6755	 * This is our raid5 kernel thread.
  6756	 *
  6757	 * We scan the hash table for stripes which can be handled now.
  6758	 * During the scan, completed stripes are saved for us by the interrupt
  6759	 * handler, so that they will not have to wait for our next wakeup.
  6760	 */
  6761	static void raid5d(struct md_thread *thread)
  6762	{
  6763		struct mddev *mddev = thread->mddev;
  6764		struct r5conf *conf = mddev->private;
  6765		int handled;
  6766		struct blk_plug plug;
  6767	
  6768		pr_debug("+++ raid5d active\n");
  6769	
  6770		md_check_recovery(mddev);
  6771	
  6772		blk_start_plug(&plug);
  6773		handled = 0;
  6774		spin_lock_irq(&conf->device_lock);
  6775		while (1) {
  6776			struct bio *bio;
  6777			int batch_size, released;
  6778			unsigned int offset;
  6779	
  6780			released = release_stripe_list(conf, conf->temp_inactive_list);
  6781			if (released)
  6782				clear_bit(R5_DID_ALLOC, &conf->cache_state);
  6783	
  6784			if (
  6785			    !list_empty(&conf->bitmap_list)) {
  6786				/* Now is a good time to flush some bitmap updates */
  6787				conf->seq_flush++;
  6788				spin_unlock_irq(&conf->device_lock);
  6789				md_bitmap_unplug(mddev->bitmap);
  6790				spin_lock_irq(&conf->device_lock);
  6791				conf->seq_write = conf->seq_flush;
  6792				activate_bit_delay(conf, conf->temp_inactive_list);
  6793			}
  6794			raid5_activate_delayed(conf);
  6795	
  6796			while ((bio = remove_bio_from_retry(conf, &offset))) {
  6797				int ok;
  6798				spin_unlock_irq(&conf->device_lock);
  6799				ok = retry_aligned_read(conf, bio, offset);
  6800				spin_lock_irq(&conf->device_lock);
  6801				if (!ok)
  6802					break;
  6803				handled++;
  6804			}
  6805	
  6806			batch_size = handle_active_stripes(conf, ANY_GROUP, NULL,
  6807							   conf->temp_inactive_list);
  6808			if (!batch_size && !released)
  6809				break;
  6810			handled += batch_size;
  6811	
  6812			if (mddev->sb_flags & ~(1 << MD_SB_CHANGE_PENDING)) {
  6813				spin_unlock_irq(&conf->device_lock);
  6814				md_check_recovery(mddev);
  6815				spin_lock_irq(&conf->device_lock);
  6816	
  6817				/*
  6818				 * Waiting on MD_SB_CHANGE_PENDING below may deadlock
  6819				 * seeing md_check_recovery() is needed to clear
  6820				 * the flag when using mdmon.
  6821				 */
  6822				continue;
  6823			} else {
  6824				spin_unlock_irq(&conf->device_lock);
> 6825				blk_flush_plug(current);
  6826				cond_resched();
  6827				spin_lock_irq(&conf->device_lock);
  6828			}
  6829		}
  6830		pr_debug("%d stripes handled\n", handled);
  6831	
  6832		spin_unlock_irq(&conf->device_lock);
  6833		if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state) &&
  6834		    mutex_trylock(&conf->cache_size_mutex)) {
  6835			grow_one_stripe(conf, __GFP_NOWARN);
  6836			/* Set flag even if allocation failed.  This helps
  6837			 * slow down allocation requests when mem is short
  6838			 */
  6839			set_bit(R5_DID_ALLOC, &conf->cache_state);
  6840			mutex_unlock(&conf->cache_size_mutex);
  6841		}
  6842	
  6843		flush_deferred_bios(conf);
  6844	
  6845		r5l_flush_stripe_to_raid(conf->log);
  6846	
  6847		async_tx_issue_pending_all();
  6848		blk_finish_plug(&plug);
  6849	
  6850		pr_debug("--- raid5d inactive\n");
  6851	}
  6852	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki


* Re: [RFC] md/raid5: fix hung by MD_SB_CHANGE_PENDING
  2023-11-01 23:02 [RFC] md/raid5: fix hung by MD_SB_CHANGE_PENDING Junxiao Bi
  2023-11-02  1:24 ` Yu Kuai
  2023-11-02  4:59 ` kernel test robot
@ 2023-11-02  4:59 ` kernel test robot
  2 siblings, 0 replies; 8+ messages in thread
From: kernel test robot @ 2023-11-02  4:59 UTC (permalink / raw)
  To: Junxiao Bi; +Cc: oe-kbuild-all

Hi Junxiao,

[This is a private test report for your RFC patch.]
kernel test robot noticed the following build errors:

[auto build test ERROR on song-md/md-next]
[also build test ERROR on axboe-block/for-next linus/master v6.6 next-20231102]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Junxiao-Bi/md-raid5-fix-hung-by-MD_SB_CHANGE_PENDING/20231102-070925
base:   git://git.kernel.org/pub/scm/linux/kernel/git/song/md.git md-next
patch link:    https://lore.kernel.org/r/20231101230214.57190-1-junxiao.bi%40oracle.com
patch subject: [RFC] md/raid5: fix hung by MD_SB_CHANGE_PENDING
config: i386-randconfig-015-20231102 (https://download.01.org/0day-ci/archive/20231102/202311021259.Kv4kJKyX-lkp@intel.com/config)
compiler: gcc-12 (Debian 12.2.0-14) 12.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20231102/202311021259.Kv4kJKyX-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202311021259.Kv4kJKyX-lkp@intel.com/

All errors (new ones prefixed by >>):

   In file included from arch/x86/include/asm/preempt.h:7,
                    from include/linux/preempt.h:79,
                    from include/linux/spinlock.h:56,
                    from include/linux/wait.h:9,
                    from include/linux/wait_bit.h:8,
                    from include/linux/fs.h:6,
                    from include/linux/highmem.h:5,
                    from include/linux/bvec.h:10,
                    from include/linux/blk_types.h:10,
                    from include/linux/blkdev.h:9,
                    from drivers/md/raid5.c:38:
   drivers/md/raid5.c: In function 'raid5d':
>> arch/x86/include/asm/current.h:44:17: error: passing argument 1 of 'blk_flush_plug' from incompatible pointer type [-Werror=incompatible-pointer-types]
      44 | #define current get_current()
         |                 ^~~~~~~~~~~~~
         |                 |
         |                 struct task_struct *
   drivers/md/raid5.c:6825:40: note: in expansion of macro 'current'
    6825 |                         blk_flush_plug(current);
         |                                        ^~~~~~~
   include/linux/blkdev.h:992:52: note: expected 'struct blk_plug *' but argument is of type 'struct task_struct *'
     992 | static inline void blk_flush_plug(struct blk_plug *plug, bool async)
         |                                   ~~~~~~~~~~~~~~~~~^~~~
   drivers/md/raid5.c:6825:25: error: too few arguments to function 'blk_flush_plug'
    6825 |                         blk_flush_plug(current);
         |                         ^~~~~~~~~~~~~~
   include/linux/blkdev.h:992:20: note: declared here
     992 | static inline void blk_flush_plug(struct blk_plug *plug, bool async)
         |                    ^~~~~~~~~~~~~~
   cc1: some warnings being treated as errors


vim +/blk_flush_plug +44 arch/x86/include/asm/current.h

f0766440dda7ac include/asm-x86/current.h      Christoph Lameter 2008-05-09  43  
c6f5e0acd5d12e arch/x86/include/asm/current.h Brian Gerst       2009-01-19 @44  #define current get_current()
f0766440dda7ac include/asm-x86/current.h      Christoph Lameter 2008-05-09  45  

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki


* Re: [RFC] md/raid5: fix hung by MD_SB_CHANGE_PENDING
  2023-11-02  4:32   ` junxiao.bi
@ 2023-11-02  7:16     ` Yu Kuai
  2023-11-03 19:11       ` junxiao.bi
  0 siblings, 1 reply; 8+ messages in thread
From: Yu Kuai @ 2023-11-02  7:16 UTC (permalink / raw)
  To: junxiao.bi, Yu Kuai, linux-raid; +Cc: song, logang, yukuai (C)

Hi,

On 2023/11/02 12:32, junxiao.bi@oracle.com wrote:
> On 11/1/23 6:24 PM, Yu Kuai wrote:
> 
>> Hi,
>>
>> On 2023/11/02 7:02, Junxiao Bi wrote:
>>> Looks like there is a race between md_write_start() and raid5d() which
>>
>> Is this a real issue or just based on code review?
> 
> It's a real issue; we saw this hang in a production system. It was with
> v5.4, but I didn't see much difference in these functions.
> 
> crash> bt 2683
> PID: 2683     TASK: ffff9d3b3e651f00  CPU: 65   COMMAND: "md0_raid5"
>   #0 [ffffbd7a0252bd08] __schedule at ffffffffa8e68931
>   #1 [ffffbd7a0252bd88] schedule at ffffffffa8e68c6f
>   #2 [ffffbd7a0252bda8] raid5d at ffffffffc0b4df16 [raid456]
>   #3 [ffffbd7a0252bea0] md_thread at ffffffffa8bc20b8
>   #4 [ffffbd7a0252bf08] kthread at ffffffffa84dac05
>   #5 [ffffbd7a0252bf50] ret_from_fork at ffffffffa9000364
> crash> ps -m 2683
> [ 0 00:11:08.244] [UN]  PID: 2683     TASK: ffff9d3b3e651f00  CPU: 65   COMMAND: "md0_raid5"
> crash>
> crash> bt 96352
> PID: 96352    TASK: ffff9cc587c95d00  CPU: 64   COMMAND: "kworker/64:0"
>   #0 [ffffbd7a07533be0] __schedule at ffffffffa8e68931
>   #1 [ffffbd7a07533c60] schedule at ffffffffa8e68c6f
>   #2 [ffffbd7a07533c80] md_write_start at ffffffffa8bc47c5
>   #3 [ffffbd7a07533ce0] raid5_make_request at ffffffffc0b4a4c9 [raid456]
>   #4 [ffffbd7a07533dc8] md_handle_request at ffffffffa8bbfa54
>   #5 [ffffbd7a07533e38] md_submit_flush_data at ffffffffa8bc04c1
>   #6 [ffffbd7a07533e60] process_one_work at ffffffffa84d4289
>   #7 [ffffbd7a07533ea8] worker_thread at ffffffffa84d50cf
>   #8 [ffffbd7a07533f08] kthread at ffffffffa84dac05
>   #9 [ffffbd7a07533f50] ret_from_fork at ffffffffa9000364
> crash> ps -m 96352
> [ 0 00:11:08.244] [UN]  PID: 96352    TASK: ffff9cc587c95d00  CPU: 64   COMMAND: "kworker/64:0"
> crash>
> crash> bt 25542
> PID: 25542    TASK: ffff9cb4cb528000  CPU: 32   COMMAND: "md0_resync"
>   #0 [ffffbd7a09387c80] __schedule at ffffffffa8e68931
>   #1 [ffffbd7a09387d00] schedule at ffffffffa8e68c6f
>   #2 [ffffbd7a09387d20] md_do_sync at ffffffffa8bc613e
>   #3 [ffffbd7a09387ea0] md_thread at ffffffffa8bc20b8
>   #4 [ffffbd7a09387f08] kthread at ffffffffa84dac05
>   #5 [ffffbd7a09387f50] ret_from_fork at ffffffffa9000364
> crash>
> crash> ps -m 25542
> [ 0 00:11:18.370] [UN]  PID: 25542    TASK: ffff9cb4cb528000  CPU: 32   COMMAND: "md0_resync"
> 
> 
>>> can cause a deadlock. We ran into this issue while raid_check was running.
>>>
>>>  md_write_start:                                  raid5d:
>>>  if (mddev->safemode == 1)
>>>      mddev->safemode = 0;
>>>  /* sync_checkers is always 0 when writes_pending is in per-cpu mode */
>>>  if (mddev->in_sync || mddev->sync_checkers) {
>>>      spin_lock(&mddev->lock);
>>>      if (mddev->in_sync) {
>>>          mddev->in_sync = 0;
>>>          set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
>>>          set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
>>>                                                   >>> running before md_write_start wakes it up
>>>                                                   if (mddev->sb_flags & ~(1 << MD_SB_CHANGE_PENDING)) {
>>>                                                         spin_unlock_irq(&conf->device_lock);
>>>                                                         md_check_recovery(mddev);
>>>                                                         spin_lock_irq(&conf->device_lock);
>>>
>>>                                                         /*
>>>                                                          * Waiting on MD_SB_CHANGE_PENDING below may deadlock
>>>                                                          * seeing md_check_recovery() is needed to clear
>>>                                                          * the flag when using mdmon.
>>>                                                          */
>>>                                                         continue;
>>>                                                    }
>>>
>>>                                                    wait_event_lock_irq(mddev->sb_wait,     >>>>>>>>>>> hung
>>>                                                        !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
>>>                                                        conf->device_lock);
>>>          md_wakeup_thread(mddev->thread);
>>>          did_change = 1;
>>>      }
>>>      spin_unlock(&mddev->lock);
>>>  }
>>>
>>>  ...
>>>
>>>  wait_event(mddev->sb_wait,    >>>>>>>>>> hung
>>>     !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) ||
>>>     mddev->suspended);
>>>
>>
>> This is not correct: if the daemon thread is running, md_wakeup_thread()
>> will make sure that the daemon thread runs again; see how THREAD_WAKEUP
>> works in md_thread() for details.
> 
> The daemon thread was waiting for MD_SB_CHANGE_PENDING to be cleared; even
> if we wake it up, it will hang again since that flag is still not cleared?

I agree that the daemon thread should not use wait_event(); however, taking
a look at 5e2cf333b7bd, I think this is a common issue for all
personalities, and the better fix is to let the bio submitted from
md_write_super() bypass wbt. This is reasonable because wbt is used to
throttle background writeback IO, and writing the superblock should not be
throttled by wbt.
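
For context, wbt's throttling decision is keyed off the op flags; a write
marked REQ_SYNC | REQ_IDLE bypasses the throttle entirely. A simplified
sketch (based on wbt_should_throttle() in block/blk-wbt.c, not verbatim):

static bool wbt_should_throttle(struct bio *bio)
{
	switch (bio_op(bio)) {
	case REQ_OP_WRITE:
		/* sync + idle writes (e.g. O_DIRECT) are not throttled */
		if ((bio->bi_opf & (REQ_SYNC | REQ_IDLE)) ==
		    (REQ_SYNC | REQ_IDLE))
			return false;
		fallthrough;
	case REQ_OP_DISCARD:
		return true;
	default:
		return false;
	}
}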

Thanks,
Kuai

> 
> Thanks,
> 
> Junxiao.
> 
>>
>> Thanks,
>> Kuai
>>
>>> Commit 5e2cf333b7bd ("md/raid5: Wait for MD_SB_CHANGE_PENDING in raid5d")
>>> introduced this issue; since it wanted a reschedule for flushing the blk_plug,
>>> let's do that explicitly to address the issue.
>>>
>>> Fixes: 5e2cf333b7bd ("md/raid5: Wait for MD_SB_CHANGE_PENDING in raid5d")
>>> Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
>>> ---
>>>   block/blk-core.c   | 1 +
>>>   drivers/md/raid5.c | 9 +++++----
>>>   2 files changed, 6 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/block/blk-core.c b/block/blk-core.c
>>> index 9d51e9894ece..bc8757a78477 100644
>>> --- a/block/blk-core.c
>>> +++ b/block/blk-core.c
>>> @@ -1149,6 +1149,7 @@ void __blk_flush_plug(struct blk_plug *plug, bool from_schedule)
>>>       if (unlikely(!rq_list_empty(plug->cached_rq)))
>>>           blk_mq_free_plug_rqs(plug);
>>>   }
>>> +EXPORT_SYMBOL(__blk_flush_plug);
>>>
>>>   /**
>>>    * blk_finish_plug - mark the end of a batch of submitted I/O
>>> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
>>> index 284cd71bcc68..25ec82f2813f 100644
>>> --- a/drivers/md/raid5.c
>>> +++ b/drivers/md/raid5.c
>>> @@ -6850,11 +6850,12 @@ static void raid5d(struct md_thread *thread)
>>>                * the flag when using mdmon.
>>>                */
>>>               continue;
>>> +        } else {
>>> +            spin_unlock_irq(&conf->device_lock);
>>> +            blk_flush_plug(current);
>>> +            cond_resched();
>>> +            spin_lock_irq(&conf->device_lock);
>>>           }
>>> -
>>> -        wait_event_lock_irq(mddev->sb_wait,
>>> -            !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
>>> -            conf->device_lock);
>>>       }
>>>       pr_debug("%d stripes handled\n", handled);
>>>
>>
> .
> 



* Re: [RFC] md/raid5: fix hung by MD_SB_CHANGE_PENDING
  2023-11-02  7:16     ` Yu Kuai
@ 2023-11-03 19:11       ` junxiao.bi
  2023-11-04  1:56         ` Yu Kuai
  0 siblings, 1 reply; 8+ messages in thread
From: junxiao.bi @ 2023-11-03 19:11 UTC (permalink / raw)
  To: Yu Kuai, linux-raid; +Cc: song, logang, yukuai (C)

On 11/2/23 12:16 AM, Yu Kuai wrote:

> Hi,
>
> On 2023/11/02 12:32, junxiao.bi@oracle.com wrote:
>> On 11/1/23 6:24 PM, Yu Kuai wrote:
>>
>>> Hi,
>>>
>>> On 2023/11/02 7:02, Junxiao Bi wrote:
>>>> Looks like there is a race between md_write_start() and raid5d() which
>>>
>>> Is this a real issue or just based on code review?
>>
>> It's a real issue; we saw this hang in a production system. It was with
>> v5.4, but I didn't see much difference in these functions.
>>
>> crash> bt 2683
>> PID: 2683     TASK: ffff9d3b3e651f00  CPU: 65   COMMAND: "md0_raid5"
>>   #0 [ffffbd7a0252bd08] __schedule at ffffffffa8e68931
>>   #1 [ffffbd7a0252bd88] schedule at ffffffffa8e68c6f
>>   #2 [ffffbd7a0252bda8] raid5d at ffffffffc0b4df16 [raid456]
>>   #3 [ffffbd7a0252bea0] md_thread at ffffffffa8bc20b8
>>   #4 [ffffbd7a0252bf08] kthread at ffffffffa84dac05
>>   #5 [ffffbd7a0252bf50] ret_from_fork at ffffffffa9000364
>> crash> ps -m 2683
>> [ 0 00:11:08.244] [UN]  PID: 2683     TASK: ffff9d3b3e651f00  CPU: 65   COMMAND: "md0_raid5"
>> crash>
>> crash> bt 96352
>> PID: 96352    TASK: ffff9cc587c95d00  CPU: 64   COMMAND: "kworker/64:0"
>>   #0 [ffffbd7a07533be0] __schedule at ffffffffa8e68931
>>   #1 [ffffbd7a07533c60] schedule at ffffffffa8e68c6f
>>   #2 [ffffbd7a07533c80] md_write_start at ffffffffa8bc47c5
>>   #3 [ffffbd7a07533ce0] raid5_make_request at ffffffffc0b4a4c9 [raid456]
>>   #4 [ffffbd7a07533dc8] md_handle_request at ffffffffa8bbfa54
>>   #5 [ffffbd7a07533e38] md_submit_flush_data at ffffffffa8bc04c1
>>   #6 [ffffbd7a07533e60] process_one_work at ffffffffa84d4289
>>   #7 [ffffbd7a07533ea8] worker_thread at ffffffffa84d50cf
>>   #8 [ffffbd7a07533f08] kthread at ffffffffa84dac05
>>   #9 [ffffbd7a07533f50] ret_from_fork at ffffffffa9000364
>> crash> ps -m 96352
>> [ 0 00:11:08.244] [UN]  PID: 96352    TASK: ffff9cc587c95d00  CPU: 64   COMMAND: "kworker/64:0"
>> crash>
>> crash> bt 25542
>> PID: 25542    TASK: ffff9cb4cb528000  CPU: 32   COMMAND: "md0_resync"
>>   #0 [ffffbd7a09387c80] __schedule at ffffffffa8e68931
>>   #1 [ffffbd7a09387d00] schedule at ffffffffa8e68c6f
>>   #2 [ffffbd7a09387d20] md_do_sync at ffffffffa8bc613e
>>   #3 [ffffbd7a09387ea0] md_thread at ffffffffa8bc20b8
>>   #4 [ffffbd7a09387f08] kthread at ffffffffa84dac05
>>   #5 [ffffbd7a09387f50] ret_from_fork at ffffffffa9000364
>> crash>
>> crash> ps -m 25542
>> [ 0 00:11:18.370] [UN]  PID: 25542    TASK: ffff9cb4cb528000  CPU: 32   COMMAND: "md0_resync"
>>
>>
>>>> can cause a deadlock. We ran into this issue while raid_check was running.
>>>>
>>>>  md_write_start:                                  raid5d:
>>>>  if (mddev->safemode == 1)
>>>>      mddev->safemode = 0;
>>>>  /* sync_checkers is always 0 when writes_pending is in per-cpu mode */
>>>>  if (mddev->in_sync || mddev->sync_checkers) {
>>>>      spin_lock(&mddev->lock);
>>>>      if (mddev->in_sync) {
>>>>          mddev->in_sync = 0;
>>>>          set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
>>>>          set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
>>>>                                                   >>> running before md_write_start wakes it up
>>>>                                                   if (mddev->sb_flags & ~(1 << MD_SB_CHANGE_PENDING)) {
>>>>                                                         spin_unlock_irq(&conf->device_lock);
>>>>                                                         md_check_recovery(mddev);
>>>>                                                         spin_lock_irq(&conf->device_lock);
>>>>
>>>>                                                         /*
>>>>                                                          * Waiting on MD_SB_CHANGE_PENDING below may deadlock
>>>>                                                          * seeing md_check_recovery() is needed to clear
>>>>                                                          * the flag when using mdmon.
>>>>                                                          */
>>>>                                                         continue;
>>>>                                                    }
>>>>
>>>>                                                    wait_event_lock_irq(mddev->sb_wait,     >>>>>>>>>>> hung
>>>>                                                        !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
>>>>                                                        conf->device_lock);
>>>>          md_wakeup_thread(mddev->thread);
>>>>          did_change = 1;
>>>>      }
>>>>      spin_unlock(&mddev->lock);
>>>>  }
>>>>
>>>>  ...
>>>>
>>>>  wait_event(mddev->sb_wait,    >>>>>>>>>> hung
>>>>     !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) ||
>>>>     mddev->suspended);
>>>>
>>>
>>> This is not correct: if the daemon thread is running, md_wakeup_thread()
>>> will make sure that the daemon thread runs again; see how THREAD_WAKEUP
>>> works in md_thread() for details.
>>
>> The daemon thread was waiting for MD_SB_CHANGE_PENDING to be cleared; even
>> if we wake it up, it will hang again since that flag is still not cleared?
>
> I agree that the daemon thread should not use wait_event(); however, taking
> a look at 5e2cf333b7bd, I think this is a common issue for all
> personalities, and the better fix is to let the bio submitted from
> md_write_super() bypass wbt. This is reasonable because wbt is used to
> throttle background writeback IO, and writing the superblock should not be
> throttled by wbt.

So the fix is the following plus reverting commit 5e2cf333b7bd?


diff --git a/drivers/md/md.c b/drivers/md/md.c
index 839e79e567ee..841bd4459817 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -931,7 +931,7 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
 
 	bio = bio_alloc_bioset(rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev,
 			       1,
-			       REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH | REQ_FUA,
+			       REQ_OP_WRITE | REQ_SYNC | REQ_IDLE | REQ_PREFLUSH | REQ_FUA,
 			       GFP_NOIO, &mddev->sync_set);
 
 	atomic_inc(&rdev->nr_pending);


Thanks,

Junxiao.

>
> Thanks,
> Kuai
>
>>
>> Thanks,
>>
>> Junxiao.
>>
>>>
>>> Thanks,
>>> Kuai
>>>
>>>> Commit 5e2cf333b7bd ("md/raid5: Wait for MD_SB_CHANGE_PENDING in raid5d")
>>>> introduced this issue; since it wanted a reschedule for flushing the blk_plug,
>>>> let's do that explicitly to address the issue.
>>>>
>>>> Fixes: 5e2cf333b7bd ("md/raid5: Wait for MD_SB_CHANGE_PENDING in raid5d")
>>>> Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
>>>> ---
>>>>   block/blk-core.c   | 1 +
>>>>   drivers/md/raid5.c | 9 +++++----
>>>>   2 files changed, 6 insertions(+), 4 deletions(-)
>>>>
>>>> diff --git a/block/blk-core.c b/block/blk-core.c
>>>> index 9d51e9894ece..bc8757a78477 100644
>>>> --- a/block/blk-core.c
>>>> +++ b/block/blk-core.c
>>>> @@ -1149,6 +1149,7 @@ void __blk_flush_plug(struct blk_plug *plug, bool from_schedule)
>>>>       if (unlikely(!rq_list_empty(plug->cached_rq)))
>>>>           blk_mq_free_plug_rqs(plug);
>>>>   }
>>>> +EXPORT_SYMBOL(__blk_flush_plug);
>>>>
>>>>   /**
>>>>    * blk_finish_plug - mark the end of a batch of submitted I/O
>>>> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
>>>> index 284cd71bcc68..25ec82f2813f 100644
>>>> --- a/drivers/md/raid5.c
>>>> +++ b/drivers/md/raid5.c
>>>> @@ -6850,11 +6850,12 @@ static void raid5d(struct md_thread *thread)
>>>>                * the flag when using mdmon.
>>>>                */
>>>>               continue;
>>>> +        } else {
>>>> +            spin_unlock_irq(&conf->device_lock);
>>>> +            blk_flush_plug(current);
>>>> +            cond_resched();
>>>> +            spin_lock_irq(&conf->device_lock);
>>>>           }
>>>> -
>>>> -        wait_event_lock_irq(mddev->sb_wait,
>>>> -            !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
>>>> -            conf->device_lock);
>>>>       }
>>>>       pr_debug("%d stripes handled\n", handled);
>>>>
>>>
>> .
>>
>


* Re: [RFC] md/raid5: fix hung by MD_SB_CHANGE_PENDING
  2023-11-03 19:11       ` junxiao.bi
@ 2023-11-04  1:56         ` Yu Kuai
  0 siblings, 0 replies; 8+ messages in thread
From: Yu Kuai @ 2023-11-04  1:56 UTC (permalink / raw)
  To: junxiao.bi, Yu Kuai, linux-raid; +Cc: song, logang, yukuai (C)

Hi,

On 2023/11/04 3:11, junxiao.bi@oracle.com wrote:
> On 11/2/23 12:16 AM, Yu Kuai wrote:
> 
>> Hi,
>>
>> On 2023/11/02 12:32, junxiao.bi@oracle.com wrote:
>>> On 11/1/23 6:24 PM, Yu Kuai wrote:
>>>
>>>> Hi,
>>>>
>>>> On 2023/11/02 7:02, Junxiao Bi wrote:
>>>>> Looks like there is a race between md_write_start() and raid5d() which
>>>>
>>>> Is this a real issue or just based on code review?
>>>
>>> It's a real issue; we saw this hang in a production system. It was with
>>> v5.4, but I didn't see much difference in these functions.
>>>
>>> crash> bt 2683
>>> PID: 2683     TASK: ffff9d3b3e651f00  CPU: 65   COMMAND: "md0_raid5"
>>>   #0 [ffffbd7a0252bd08] __schedule at ffffffffa8e68931
>>>   #1 [ffffbd7a0252bd88] schedule at ffffffffa8e68c6f
>>>   #2 [ffffbd7a0252bda8] raid5d at ffffffffc0b4df16 [raid456]
>>>   #3 [ffffbd7a0252bea0] md_thread at ffffffffa8bc20b8
>>>   #4 [ffffbd7a0252bf08] kthread at ffffffffa84dac05
>>>   #5 [ffffbd7a0252bf50] ret_from_fork at ffffffffa9000364
>>> crash> ps -m 2683
>>> [ 0 00:11:08.244] [UN]  PID: 2683     TASK: ffff9d3b3e651f00  CPU: 65   COMMAND: "md0_raid5"
>>> crash>
>>> crash> bt 96352
>>> PID: 96352    TASK: ffff9cc587c95d00  CPU: 64   COMMAND: "kworker/64:0"
>>>   #0 [ffffbd7a07533be0] __schedule at ffffffffa8e68931
>>>   #1 [ffffbd7a07533c60] schedule at ffffffffa8e68c6f
>>>   #2 [ffffbd7a07533c80] md_write_start at ffffffffa8bc47c5
>>>   #3 [ffffbd7a07533ce0] raid5_make_request at ffffffffc0b4a4c9 [raid456]
>>>   #4 [ffffbd7a07533dc8] md_handle_request at ffffffffa8bbfa54
>>>   #5 [ffffbd7a07533e38] md_submit_flush_data at ffffffffa8bc04c1
>>>   #6 [ffffbd7a07533e60] process_one_work at ffffffffa84d4289
>>>   #7 [ffffbd7a07533ea8] worker_thread at ffffffffa84d50cf
>>>   #8 [ffffbd7a07533f08] kthread at ffffffffa84dac05
>>>   #9 [ffffbd7a07533f50] ret_from_fork at ffffffffa9000364
>>> crash> ps -m 96352
>>> [ 0 00:11:08.244] [UN]  PID: 96352    TASK: ffff9cc587c95d00  CPU: 64   COMMAND: "kworker/64:0"
>>> crash>
>>> crash> bt 25542
>>> PID: 25542    TASK: ffff9cb4cb528000  CPU: 32   COMMAND: "md0_resync"
>>>   #0 [ffffbd7a09387c80] __schedule at ffffffffa8e68931
>>>   #1 [ffffbd7a09387d00] schedule at ffffffffa8e68c6f
>>>   #2 [ffffbd7a09387d20] md_do_sync at ffffffffa8bc613e
>>>   #3 [ffffbd7a09387ea0] md_thread at ffffffffa8bc20b8
>>>   #4 [ffffbd7a09387f08] kthread at ffffffffa84dac05
>>>   #5 [ffffbd7a09387f50] ret_from_fork at ffffffffa9000364
>>> crash>
>>> crash> ps -m 25542
>>> [ 0 00:11:18.370] [UN]  PID: 25542    TASK: ffff9cb4cb528000  CPU: 32   COMMAND: "md0_resync"
>>>
>>>
>>>>> can cause a deadlock. We ran into this issue while raid_check was running.
>>>>>
>>>>>  md_write_start:                                  raid5d:
>>>>>  if (mddev->safemode == 1)
>>>>>      mddev->safemode = 0;
>>>>>  /* sync_checkers is always 0 when writes_pending is in per-cpu mode */
>>>>>  if (mddev->in_sync || mddev->sync_checkers) {
>>>>>      spin_lock(&mddev->lock);
>>>>>      if (mddev->in_sync) {
>>>>>          mddev->in_sync = 0;
>>>>>          set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
>>>>>          set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
>>>>>                                                   >>> running before md_write_start wakes it up
>>>>>                                                   if (mddev->sb_flags & ~(1 << MD_SB_CHANGE_PENDING)) {
>>>>>                                                         spin_unlock_irq(&conf->device_lock);
>>>>>                                                         md_check_recovery(mddev);
>>>>>                                                         spin_lock_irq(&conf->device_lock);
>>>>>
>>>>>                                                         /*
>>>>>                                                          * Waiting on MD_SB_CHANGE_PENDING below may deadlock
>>>>>                                                          * seeing md_check_recovery() is needed to clear
>>>>>                                                          * the flag when using mdmon.
>>>>>                                                          */
>>>>>                                                         continue;
>>>>>                                                    }
>>>>>
>>>>>                                                    wait_event_lock_irq(mddev->sb_wait,     >>>>>>>>>>> hung
>>>>>                                                        !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
>>>>>                                                        conf->device_lock);
>>>>>          md_wakeup_thread(mddev->thread);
>>>>>          did_change = 1;
>>>>>      }
>>>>>      spin_unlock(&mddev->lock);
>>>>>  }
>>>>>
>>>>>  ...
>>>>>
>>>>>  wait_event(mddev->sb_wait,    >>>>>>>>>> hung
>>>>>     !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) ||
>>>>>     mddev->suspended);
>>>>>
>>>>
>>>> This is not correct: if the daemon thread is running, md_wakeup_thread()
>>>> will make sure that the daemon thread runs again; see how THREAD_WAKEUP
>>>> works in md_thread() for details.
>>>
>>> The daemon thread was waiting for MD_SB_CHANGE_PENDING to be cleared; even
>>> if we wake it up, it will hang again since that flag is still not cleared?
>>
>> I agree that the daemon thread should not use wait_event(); however, taking
>> a look at 5e2cf333b7bd, I think this is a common issue for all
>> personalities, and the better fix is to let the bio submitted from
>> md_write_super() bypass wbt. This is reasonable because wbt is used to
>> throttle background writeback IO, and writing the superblock should not be
>> throttled by wbt.
> 
> So the fix is the following plus reverting commit 5e2cf333b7bd?

Yes, I think this can work, and REQ_META should be added for the same
reason; see bio_issue_as_root_blkg().
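
Roughly, REQ_META (or REQ_SWAP) makes the bio count as issued by the root
cgroup, so it also escapes cgroup-based writeback throttling. A sketch of
the helper (as in include/linux/blk-cgroup.h, not necessarily verbatim):

static inline bool bio_issue_as_root_blkg(struct bio *bio)
{
	return (bio->bi_opf & (REQ_META | REQ_SWAP)) != 0;
}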

Thanks,
Kuai

> 
> 
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 839e79e567ee..841bd4459817 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -931,7 +931,7 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
> 
> 	bio = bio_alloc_bioset(rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev,
> 			       1,
> -			       REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH | REQ_FUA,
> +			       REQ_OP_WRITE | REQ_SYNC | REQ_IDLE | REQ_PREFLUSH | REQ_FUA,
> 			       GFP_NOIO, &mddev->sync_set);
> 
> 	atomic_inc(&rdev->nr_pending);
> 
> 
> Thanks,
> 
> Junxiao.
> 
>>
>> Thanks,
>> Kuai
>>
>>>
>>> Thanks,
>>>
>>> Junxiao.
>>>
>>>>
>>>> Thanks,
>>>> Kuai
>>>>
>>>>> Commit 5e2cf333b7bd ("md/raid5: Wait for MD_SB_CHANGE_PENDING in raid5d")
>>>>> introduced this issue; since it wanted a reschedule for flushing the blk_plug,
>>>>> let's do that explicitly to address the issue.
>>>>>
>>>>> Fixes: 5e2cf333b7bd ("md/raid5: Wait for MD_SB_CHANGE_PENDING in raid5d")
>>>>> Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
>>>>> ---
>>>>>   block/blk-core.c   | 1 +
>>>>>   drivers/md/raid5.c | 9 +++++----
>>>>>   2 files changed, 6 insertions(+), 4 deletions(-)
>>>>>
>>>>> diff --git a/block/blk-core.c b/block/blk-core.c
>>>>> index 9d51e9894ece..bc8757a78477 100644
>>>>> --- a/block/blk-core.c
>>>>> +++ b/block/blk-core.c
>>>>> @@ -1149,6 +1149,7 @@ void __blk_flush_plug(struct blk_plug *plug, bool from_schedule)
>>>>>       if (unlikely(!rq_list_empty(plug->cached_rq)))
>>>>>           blk_mq_free_plug_rqs(plug);
>>>>>   }
>>>>> +EXPORT_SYMBOL(__blk_flush_plug);
>>>>>
>>>>>   /**
>>>>>    * blk_finish_plug - mark the end of a batch of submitted I/O
>>>>> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
>>>>> index 284cd71bcc68..25ec82f2813f 100644
>>>>> --- a/drivers/md/raid5.c
>>>>> +++ b/drivers/md/raid5.c
>>>>> @@ -6850,11 +6850,12 @@ static void raid5d(struct md_thread *thread)
>>>>>                * the flag when using mdmon.
>>>>>                */
>>>>>               continue;
>>>>> +        } else {
>>>>> +            spin_unlock_irq(&conf->device_lock);
>>>>> +            blk_flush_plug(current);
>>>>> +            cond_resched();
>>>>> +            spin_lock_irq(&conf->device_lock);
>>>>>           }
>>>>> -
>>>>> -        wait_event_lock_irq(mddev->sb_wait,
>>>>> -            !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
>>>>> -            conf->device_lock);
>>>>>       }
>>>>>       pr_debug("%d stripes handled\n", handled);
>>>>>
>>>>
>>> .
>>>
>>
> .
> 


