All of lore.kernel.org
 help / color / mirror / Atom feed
* Stuck in md_write_start because MD_SB_CHANGE_PENDING can't be cleared
       [not found] <221835411.4473056.1504338574607.JavaMail.zimbra@redhat.com>
@ 2017-09-02  8:01 ` Xiao Ni
  2017-09-04  2:16   ` NeilBrown
  0 siblings, 1 reply; 14+ messages in thread
From: Xiao Ni @ 2017-09-02  8:01 UTC (permalink / raw)
  To: linux-raid; +Cc: neilb, shli

Hi Neil and Shaohua

I'm trying to reproduce the problem "raid5_finish_reshape is stuck at revalidate_disk".
But there is a new hang. I tried with 4.13.0-rc5 and the latest mdadm. The steps are:

#!/bin/bash
num=0
while [ 1 ]
do
	echo "*************************$num"
	mdadm -Ss
	mdadm --create --run /dev/md0 --level 5 --metadata 1.2 --raid-devices 5 /dev/loop0 /dev/loop1 /dev/loop2 /dev/loop3 /dev/loop4 --spare-devices 1 /dev/loop5 --chunk 512
	mdadm --wait /dev/md0
	mkfs -t ext4 /dev/md0
	mount -t ext4 /dev/md0 /mnt/md_test
	dd if=/dev/urandom of=/mnt/md_test/testfile bs=1M count=100
	mdadm --add /dev/md0 /dev/loop6
	mdadm --grow --raid-devices 6 /dev/md0
	dd if=/dev/urandom of=/mnt/md_test/testfile bs=1M count=1000
	mdadm --wait /dev/md0
	((num++))
	umount /dev/md0
done

The calltrace messages are:
Sep  1 13:57:25 localhost kernel: INFO: task kworker/u8:4:21401 blocked for more than 120 seconds.
Sep  1 13:57:25 localhost kernel:      Tainted: G           OE   4.13.0-rc5 #2
Sep  1 13:57:25 localhost kernel: "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
Sep  1 13:57:25 localhost kernel: kworker/u8:4    D    0 21401      2 0x00000080
Sep  1 13:57:25 localhost kernel: Workqueue: writeback wb_workfn (flush-9:0)
Sep  1 13:57:25 localhost kernel: Call Trace:
Sep  1 13:57:25 localhost kernel: __schedule+0x28d/0x890
Sep  1 13:57:25 localhost kernel: schedule+0x36/0x80
Sep  1 13:57:25 localhost kernel: md_write_start+0xf0/0x220 [md_mod]
Sep  1 13:57:25 localhost kernel: ? remove_wait_queue+0x60/0x60
Sep  1 13:57:25 localhost kernel: raid5_make_request+0x89/0x8b0 [raid456]
Sep  1 13:57:25 localhost kernel: ? bio_split+0x5d/0x90
Sep  1 13:57:25 localhost kernel: ? blk_queue_split+0xd2/0x630
Sep  1 13:57:25 localhost kernel: ? remove_wait_queue+0x60/0x60
Sep  1 13:57:25 localhost kernel: md_make_request+0xf5/0x260 [md_mod]
....

All the processes that send bios to md are stuck at md_write_start.
        wait_event(mddev->sb_wait,
                   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) && !mddev->suspended);


Sep  1 13:57:26 localhost kernel: INFO: task md0_reshape:23605 blocked for more than 120 seconds.
Sep  1 13:57:26 localhost kernel:      Tainted: G           OE   4.13.0-rc5 #2
Sep  1 13:57:26 localhost kernel: "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
Sep  1 13:57:26 localhost kernel: md0_reshape     D    0 23605      2 0x00000080
Sep  1 13:57:26 localhost kernel: Call Trace:
Sep  1 13:57:26 localhost kernel: __schedule+0x28d/0x890
Sep  1 13:57:26 localhost kernel: schedule+0x36/0x80
Sep  1 13:57:26 localhost kernel: raid5_sync_request+0x2cf/0x370 [raid456]
Sep  1 13:57:26 localhost kernel: ? remove_wait_queue+0x60/0x60
Sep  1 13:57:26 localhost kernel: md_do_sync+0xafe/0xee0 [md_mod]
Sep  1 13:57:26 localhost kernel: ? remove_wait_queue+0x60/0x60
Sep  1 13:57:26 localhost kernel: md_thread+0x132/0x180 [md_mod]
Sep  1 13:57:26 localhost kernel: kthread+0x109/0x140
Sep  1 13:57:26 localhost kernel: ? find_pers+0x70/0x70 [md_mod]
Sep  1 13:57:26 localhost kernel: ? kthread_park+0x60/0x60
Sep  1 13:57:26 localhost kernel: ? do_syscall_64+0x67/0x150
Sep  1 13:57:26 localhost kernel: ret_from_fork+0x25/0x30

It is stuck at:
        /* Allow raid5_quiesce to complete */
        wait_event(conf->wait_for_overlap, conf->quiesce != 2);

Sep  1 13:57:26 localhost kernel: INFO: task mdadm:23613 blocked for more than 120 seconds.
Sep  1 13:57:26 localhost kernel:      Tainted: G           OE   4.13.0-rc5 #2
Sep  1 13:57:26 localhost kernel: "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
Sep  1 13:57:26 localhost kernel: mdadm           D    0 23613      1 0x00000080
Sep  1 13:57:26 localhost kernel: Call Trace:
Sep  1 13:57:26 localhost kernel: __schedule+0x28d/0x890
Sep  1 13:57:26 localhost kernel: schedule+0x36/0x80
Sep  1 13:57:26 localhost kernel: raid5_quiesce+0x274/0x2b0 [raid456]
Sep  1 13:57:26 localhost kernel: ? remove_wait_queue+0x60/0x60
Sep  1 13:57:26 localhost kernel: suspend_lo_store+0x82/0xe0 [md_mod]
Sep  1 13:57:26 localhost kernel: md_attr_store+0x80/0xc0 [md_mod]
Sep  1 13:57:26 localhost kernel: sysfs_kf_write+0x3a/0x50
Sep  1 13:57:26 localhost kernel: kernfs_fop_write+0xff/0x180
Sep  1 13:57:26 localhost kernel: __vfs_write+0x37/0x170
Sep  1 13:57:26 localhost kernel: ? selinux_file_permission+0xe5/0x120
Sep  1 13:57:26 localhost kernel: ? security_file_permission+0x3b/0xc0
Sep  1 13:57:26 localhost kernel: vfs_write+0xb2/0x1b0
Sep  1 13:57:26 localhost kernel: ? syscall_trace_enter+0x1d0/0x2b0
Sep  1 13:57:26 localhost kernel: SyS_write+0x55/0xc0
Sep  1 13:57:26 localhost kernel: do_syscall_64+0x67/0x150

It's stuck at:
conf->quiesce = 2;
wait_event_cmd(conf->wait_for_quiescent,
   atomic_read(&conf->active_stripes) == 0 &&
   atomic_read(&conf->active_aligned_reads) == 0,
   unlock_all_device_hash_locks_irq(conf),
   lock_all_device_hash_locks_irq(conf));
conf->quiesce = 1;

[root@dell-pr1700-02 ~]# cat /proc/mdstat 
Personalities : [raid6] [raid5] [raid4] 
md0 : active raid5 loop6[7] loop4[6] loop5[5](S) loop3[3] loop2[2] loop1[1] loop0[0]
      2039808 blocks super 1.2 level 5, 512k chunk, algorithm 2 [6/6] [UUUUUU]
      [>....................]  reshape =  0.5% (2560/509952) finish=162985.0min speed=0K/sec
      
unused devices: <none>


I can reproduce this. So I added some printk logging to check MD_SB_CHANGE_PENDING in mddev->sb_flags
and mddev->suspended.
mddev->suspend : 0
mddev->flags   : SP CHANGE PENDING is set
conf->quiesce  : 2

Then I echo active > /sys/block/md0/md/array_state, and the reshape can start and finish successfully.

I notice there have been some fixes for raid5 stuck problems. I'm not sure whether this is introduced
by those patches.

Best Regards
Xiao




^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: Stuck in md_write_start because MD_SB_CHANGE_PENDING can't be cleared
  2017-09-02  8:01 ` Stuck in md_write_start because MD_SB_CHANGE_PENDING can't be cleared Xiao Ni
@ 2017-09-04  2:16   ` NeilBrown
  2017-09-04  2:45     ` Xiao Ni
  0 siblings, 1 reply; 14+ messages in thread
From: NeilBrown @ 2017-09-04  2:16 UTC (permalink / raw)
  To: Xiao Ni, linux-raid; +Cc: shli

[-- Attachment #1: Type: text/plain, Size: 7293 bytes --]

On Sat, Sep 02 2017, Xiao Ni wrote:

> Hi Neil and Shaohua
>
> I'm trying to reproduce the problem the problem "raid5_finish_reshape is stuck at revalidate_disk".
> But there is a new stuck. I tried with 4.13.0-rc5 and latest mdadm. The steps are:
>
> #!/bin/bash
> num=0
> while [ 1 ]
> do
> 	echo "*************************$num"
> 	mdadm -Ss
> 	mdadm --create --run /dev/md0 --level 5 --metadata 1.2 --raid-devices 5 /dev/loop0 /dev/loop1 /dev/loop2 /dev/loop3 /dev/loop4 --spare-devices 1 /dev/loop5 --chunk 512
> 	mdadm --wait /dev/md0
> 	mkfs -t ext4 /dev/md0
> 	mount -t ext4 /dev/md0 /mnt/md_test
> 	dd if=/dev/urandom of=/mnt/md_test/testfile bs=1M count=100
> 	mdadm --add /dev/md0 /dev/loop6
> 	mdadm --grow --raid-devices 6 /dev/md0
> 	dd if=/dev/urandom of=/mnt/md_test/testfile bs=1M count=1000
> 	mdadm --wait /dev/md0
> 	((num++))
> 	umount /dev/md0
> done
>
> The calltrace messages are:
> Sep  1 13:57:25 localhost kernel: INFO: task kworker/u8:4:21401 blocked for more than 120 seconds.
> Sep  1 13:57:25 localhost kernel:      Tainted: G           OE   4.13.0-rc5 #2
> Sep  1 13:57:25 localhost kernel: "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> Sep  1 13:57:25 localhost kernel: kworker/u8:4    D    0 21401      2 0x00000080
> Sep  1 13:57:25 localhost kernel: Workqueue: writeback wb_workfn (flush-9:0)
> Sep  1 13:57:25 localhost kernel: Call Trace:
> Sep  1 13:57:25 localhost kernel: __schedule+0x28d/0x890
> Sep  1 13:57:25 localhost kernel: schedule+0x36/0x80
> Sep  1 13:57:25 localhost kernel: md_write_start+0xf0/0x220 [md_mod]
> Sep  1 13:57:25 localhost kernel: ? remove_wait_queue+0x60/0x60
> Sep  1 13:57:25 localhost kernel: raid5_make_request+0x89/0x8b0 [raid456]
> Sep  1 13:57:25 localhost kernel: ? bio_split+0x5d/0x90
> Sep  1 13:57:25 localhost kernel: ? blk_queue_split+0xd2/0x630
> Sep  1 13:57:25 localhost kernel: ? remove_wait_queue+0x60/0x60
> Sep  1 13:57:25 localhost kernel: md_make_request+0xf5/0x260 [md_mod]
> ....
>
> All the processes send bio to md are stuck at md_write_start.
>         wait_event(mddev->sb_wait,
>                    !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) && !mddev->suspended);
>
>
> Sep  1 13:57:26 localhost kernel: INFO: task md0_reshape:23605 blocked for more than 120 seconds.
> Sep  1 13:57:26 localhost kernel:      Tainted: G           OE   4.13.0-rc5 #2
> Sep  1 13:57:26 localhost kernel: "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> Sep  1 13:57:26 localhost kernel: md0_reshape     D    0 23605      2 0x00000080
> Sep  1 13:57:26 localhost kernel: Call Trace:
> Sep  1 13:57:26 localhost kernel: __schedule+0x28d/0x890
> Sep  1 13:57:26 localhost kernel: schedule+0x36/0x80
> Sep  1 13:57:26 localhost kernel: raid5_sync_request+0x2cf/0x370 [raid456]
> Sep  1 13:57:26 localhost kernel: ? remove_wait_queue+0x60/0x60
> Sep  1 13:57:26 localhost kernel: md_do_sync+0xafe/0xee0 [md_mod]
> Sep  1 13:57:26 localhost kernel: ? remove_wait_queue+0x60/0x60
> Sep  1 13:57:26 localhost kernel: md_thread+0x132/0x180 [md_mod]
> Sep  1 13:57:26 localhost kernel: kthread+0x109/0x140
> Sep  1 13:57:26 localhost kernel: ? find_pers+0x70/0x70 [md_mod]
> Sep  1 13:57:26 localhost kernel: ? kthread_park+0x60/0x60
> Sep  1 13:57:26 localhost kernel: ? do_syscall_64+0x67/0x150
> Sep  1 13:57:26 localhost kernel: ret_from_fork+0x25/0x30
>
> It stucks at:
>         /* Allow raid5_quiesce to complete */
>         wait_event(conf->wait_for_overlap, conf->quiesce != 2);
>
> Sep  1 13:57:26 localhost kernel: INFO: task mdadm:23613 blocked for more than 120 seconds.
> Sep  1 13:57:26 localhost kernel:      Tainted: G           OE   4.13.0-rc5 #2
> Sep  1 13:57:26 localhost kernel: "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> Sep  1 13:57:26 localhost kernel: mdadm           D    0 23613      1 0x00000080
> Sep  1 13:57:26 localhost kernel: Call Trace:
> Sep  1 13:57:26 localhost kernel: __schedule+0x28d/0x890
> Sep  1 13:57:26 localhost kernel: schedule+0x36/0x80
> Sep  1 13:57:26 localhost kernel: raid5_quiesce+0x274/0x2b0 [raid456]
> Sep  1 13:57:26 localhost kernel: ? remove_wait_queue+0x60/0x60
> Sep  1 13:57:26 localhost kernel: suspend_lo_store+0x82/0xe0 [md_mod]
> Sep  1 13:57:26 localhost kernel: md_attr_store+0x80/0xc0 [md_mod]
> Sep  1 13:57:26 localhost kernel: sysfs_kf_write+0x3a/0x50
> Sep  1 13:57:26 localhost kernel: kernfs_fop_write+0xff/0x180
> Sep  1 13:57:26 localhost kernel: __vfs_write+0x37/0x170
> Sep  1 13:57:26 localhost kernel: ? selinux_file_permission+0xe5/0x120
> Sep  1 13:57:26 localhost kernel: ? security_file_permission+0x3b/0xc0
> Sep  1 13:57:26 localhost kernel: vfs_write+0xb2/0x1b0
> Sep  1 13:57:26 localhost kernel: ? syscall_trace_enter+0x1d0/0x2b0
> Sep  1 13:57:26 localhost kernel: SyS_write+0x55/0xc0
> Sep  1 13:57:26 localhost kernel: do_syscall_64+0x67/0x150
>
> It's stuck at:
> conf->quiesce = 2;
> wait_event_cmd(conf->wait_for_quiescent,
>    atomic_read(&conf->active_stripes) == 0 &&
>    atomic_read(&conf->active_aligned_reads) == 0,
>    unlock_all_device_hash_locks_irq(conf),
>    lock_all_device_hash_locks_irq(conf));
> conf->quiesce = 1;
>
> [root@dell-pr1700-02 ~]# cat /proc/mdstat 
> Personalities : [raid6] [raid5] [raid4] 
> md0 : active raid5 loop6[7] loop4[6] loop5[5](S) loop3[3] loop2[2] loop1[1] loop0[0]
>       2039808 blocks super 1.2 level 5, 512k chunk, algorithm 2 [6/6] [UUUUUU]
>       [>....................]  reshape =  0.5% (2560/509952) finish=162985.0min speed=0K/sec
>       
> unused devices: <none>
>
>
> I can reproduce this. So I add some logs by printk to check MD_SB_CHANGE_PENDING of mddev->flags
> and mddev->suspended.
> mddev->suspend : 0
> mddev->flags   : SP CHANGE PENDING is set
> conf->quiesce  : 2
>
> The I echo active > /sys/block/md0/md/array_state, reshape can start and finish successfully.
>
> I notice there are some fixes for raid5 stuck problem. Not sure whether it's introduced by those
> patches. 
>
> Best Regards
> Xiao

Thanks for the detailed report.
All the processes that you have mentioned are waiting for
raid5_quiesce() to complete.
raid5_quiesce() must be waiting for either active_stripes or
active_aligned_reads to reach zero.  Most likely active_stripes.

That suggests that either there is a stripe that is active and isn't
being handled for some reason, or the accounting of active stripes has
problems.

The fact that setting the array to 'active' removes the hang is
interesting.  That would cause the threads blocked in md_write_start to
continue, but I cannot see how that would affect active_stripes as
threads waiting for md_write_start() never hold a stripe.  Doing that
could cause a stripe to be used, then released, which might trigger a
wakeup that had been lost.

It would be useful to see if ->active_stripes is 0 when the deadlock
happens.
One way to do that is to add a printk into raid5_status().  Then
whenever you "cat /proc/mdstat" the printk will trigger and tell you the
value of ->active_stripes.

I haven't managed to reproduce this yet, or I would track it down
myself.

Thanks,
NeilBrown

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: Stuck in md_write_start because MD_SB_CHANGE_PENDING can't be cleared
  2017-09-04  2:16   ` NeilBrown
@ 2017-09-04  2:45     ` Xiao Ni
  2017-09-04  3:52       ` Xiao Ni
  0 siblings, 1 reply; 14+ messages in thread
From: Xiao Ni @ 2017-09-04  2:45 UTC (permalink / raw)
  To: NeilBrown, linux-raid; +Cc: shli



On 09/04/2017 10:16 AM, NeilBrown wrote:
> On Sat, Sep 02 2017, Xiao Ni wrote:
>
>> Hi Neil and Shaohua
>>
>> I'm trying to reproduce the problem the problem "raid5_finish_reshape is stuck at revalidate_disk".
>> But there is a new stuck. I tried with 4.13.0-rc5 and latest mdadm. The steps are:
>>
>> #!/bin/bash
>> num=0
>> while [ 1 ]
>> do
>> 	echo "*************************$num"
>> 	mdadm -Ss
>> 	mdadm --create --run /dev/md0 --level 5 --metadata 1.2 --raid-devices 5 /dev/loop0 /dev/loop1 /dev/loop2 /dev/loop3 /dev/loop4 --spare-devices 1 /dev/loop5 --chunk 512
>> 	mdadm --wait /dev/md0
>> 	mkfs -t ext4 /dev/md0
>> 	mount -t ext4 /dev/md0 /mnt/md_test
>> 	dd if=/dev/urandom of=/mnt/md_test/testfile bs=1M count=100
>> 	mdadm --add /dev/md0 /dev/loop6
>> 	mdadm --grow --raid-devices 6 /dev/md0
>> 	dd if=/dev/urandom of=/mnt/md_test/testfile bs=1M count=1000
>> 	mdadm --wait /dev/md0
>> 	((num++))
>> 	umount /dev/md0
>> done
>>
>> The calltrace messages are:
>> Sep  1 13:57:25 localhost kernel: INFO: task kworker/u8:4:21401 blocked for more than 120 seconds.
>> Sep  1 13:57:25 localhost kernel:      Tainted: G           OE   4.13.0-rc5 #2
>> Sep  1 13:57:25 localhost kernel: "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
>> Sep  1 13:57:25 localhost kernel: kworker/u8:4    D    0 21401      2 0x00000080
>> Sep  1 13:57:25 localhost kernel: Workqueue: writeback wb_workfn (flush-9:0)
>> Sep  1 13:57:25 localhost kernel: Call Trace:
>> Sep  1 13:57:25 localhost kernel: __schedule+0x28d/0x890
>> Sep  1 13:57:25 localhost kernel: schedule+0x36/0x80
>> Sep  1 13:57:25 localhost kernel: md_write_start+0xf0/0x220 [md_mod]
>> Sep  1 13:57:25 localhost kernel: ? remove_wait_queue+0x60/0x60
>> Sep  1 13:57:25 localhost kernel: raid5_make_request+0x89/0x8b0 [raid456]
>> Sep  1 13:57:25 localhost kernel: ? bio_split+0x5d/0x90
>> Sep  1 13:57:25 localhost kernel: ? blk_queue_split+0xd2/0x630
>> Sep  1 13:57:25 localhost kernel: ? remove_wait_queue+0x60/0x60
>> Sep  1 13:57:25 localhost kernel: md_make_request+0xf5/0x260 [md_mod]
>> ....
>>
>> All the processes send bio to md are stuck at md_write_start.
>>          wait_event(mddev->sb_wait,
>>                     !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) && !mddev->suspended);
>>
>>
>> Sep  1 13:57:26 localhost kernel: INFO: task md0_reshape:23605 blocked for more than 120 seconds.
>> Sep  1 13:57:26 localhost kernel:      Tainted: G           OE   4.13.0-rc5 #2
>> Sep  1 13:57:26 localhost kernel: "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
>> Sep  1 13:57:26 localhost kernel: md0_reshape     D    0 23605      2 0x00000080
>> Sep  1 13:57:26 localhost kernel: Call Trace:
>> Sep  1 13:57:26 localhost kernel: __schedule+0x28d/0x890
>> Sep  1 13:57:26 localhost kernel: schedule+0x36/0x80
>> Sep  1 13:57:26 localhost kernel: raid5_sync_request+0x2cf/0x370 [raid456]
>> Sep  1 13:57:26 localhost kernel: ? remove_wait_queue+0x60/0x60
>> Sep  1 13:57:26 localhost kernel: md_do_sync+0xafe/0xee0 [md_mod]
>> Sep  1 13:57:26 localhost kernel: ? remove_wait_queue+0x60/0x60
>> Sep  1 13:57:26 localhost kernel: md_thread+0x132/0x180 [md_mod]
>> Sep  1 13:57:26 localhost kernel: kthread+0x109/0x140
>> Sep  1 13:57:26 localhost kernel: ? find_pers+0x70/0x70 [md_mod]
>> Sep  1 13:57:26 localhost kernel: ? kthread_park+0x60/0x60
>> Sep  1 13:57:26 localhost kernel: ? do_syscall_64+0x67/0x150
>> Sep  1 13:57:26 localhost kernel: ret_from_fork+0x25/0x30
>>
>> It stucks at:
>>          /* Allow raid5_quiesce to complete */
>>          wait_event(conf->wait_for_overlap, conf->quiesce != 2);
>>
>> Sep  1 13:57:26 localhost kernel: INFO: task mdadm:23613 blocked for more than 120 seconds.
>> Sep  1 13:57:26 localhost kernel:      Tainted: G           OE   4.13.0-rc5 #2
>> Sep  1 13:57:26 localhost kernel: "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
>> Sep  1 13:57:26 localhost kernel: mdadm           D    0 23613      1 0x00000080
>> Sep  1 13:57:26 localhost kernel: Call Trace:
>> Sep  1 13:57:26 localhost kernel: __schedule+0x28d/0x890
>> Sep  1 13:57:26 localhost kernel: schedule+0x36/0x80
>> Sep  1 13:57:26 localhost kernel: raid5_quiesce+0x274/0x2b0 [raid456]
>> Sep  1 13:57:26 localhost kernel: ? remove_wait_queue+0x60/0x60
>> Sep  1 13:57:26 localhost kernel: suspend_lo_store+0x82/0xe0 [md_mod]
>> Sep  1 13:57:26 localhost kernel: md_attr_store+0x80/0xc0 [md_mod]
>> Sep  1 13:57:26 localhost kernel: sysfs_kf_write+0x3a/0x50
>> Sep  1 13:57:26 localhost kernel: kernfs_fop_write+0xff/0x180
>> Sep  1 13:57:26 localhost kernel: __vfs_write+0x37/0x170
>> Sep  1 13:57:26 localhost kernel: ? selinux_file_permission+0xe5/0x120
>> Sep  1 13:57:26 localhost kernel: ? security_file_permission+0x3b/0xc0
>> Sep  1 13:57:26 localhost kernel: vfs_write+0xb2/0x1b0
>> Sep  1 13:57:26 localhost kernel: ? syscall_trace_enter+0x1d0/0x2b0
>> Sep  1 13:57:26 localhost kernel: SyS_write+0x55/0xc0
>> Sep  1 13:57:26 localhost kernel: do_syscall_64+0x67/0x150
>>
>> It's stuck at:
>> conf->quiesce = 2;
>> wait_event_cmd(conf->wait_for_quiescent,
>>     atomic_read(&conf->active_stripes) == 0 &&
>>     atomic_read(&conf->active_aligned_reads) == 0,
>>     unlock_all_device_hash_locks_irq(conf),
>>     lock_all_device_hash_locks_irq(conf));
>> conf->quiesce = 1;
>>
>> [root@dell-pr1700-02 ~]# cat /proc/mdstat
>> Personalities : [raid6] [raid5] [raid4]
>> md0 : active raid5 loop6[7] loop4[6] loop5[5](S) loop3[3] loop2[2] loop1[1] loop0[0]
>>        2039808 blocks super 1.2 level 5, 512k chunk, algorithm 2 [6/6] [UUUUUU]
>>        [>....................]  reshape =  0.5% (2560/509952) finish=162985.0min speed=0K/sec
>>        
>> unused devices: <none>
>>
>>
>> I can reproduce this. So I add some logs by printk to check MD_SB_CHANGE_PENDING of mddev->flags
>> and mddev->suspended.
>> mddev->suspend : 0
>> mddev->flags   : SP CHANGE PENDING is set
>> conf->quiesce  : 2
>>
>> The I echo active > /sys/block/md0/md/array_state, reshape can start and finish successfully.
>>
>> I notice there are some fixes for raid5 stuck problem. Not sure whether it's introduced by those
>> patches.
>>
>> Best Regards
>> Xiao
> Thanks for the detailed report.
> All the processes that you have mentioned are waiting for
> raid5_quiesce() to complete.
> raid5_quiesce() must be waiting for either active_stripes or
> active_aligned_reads to reach zero.  Most likely active_stripes.
>
> That suggests that either there is a stripe that is active and isn't
> being handled for some reason, or the accounting of active stripes has
> problems.
>
> The fact that setting the array to 'active' removes the hang is
> interesting.  That would cause the threads blocked in md_write_start to
> continue, but I cannot see how that would affect active_stripes as
> threads waiting for md_write_start() never hold a stripe.  Doing that
> could cause a stripe to be used, then released, which might trigger a
> wakeup that had been lost.
At first I suspected there were some active stripes that hadn't been handled yet.
But the raid5d process doesn't appear in the calltrace. It wasn't stuck.
>
> It would be useful to see if ->active_stripes is 0 when the deadlock
> happens.
> One way to do that is to add a printk into raid5_status().  Then
> whenever you "cat /proc/mdstat" the printk will trigger and tell you the
> value of ->active_stripes.
>
> I haven't managed to reproduce this yet, or I would track it down my
> self.

Sure, the test is running again. I'll give the information later.

Regards
Xiao
>
> Thanks,
> NeilBrown


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: Stuck in md_write_start because MD_SB_CHANGE_PENDING can't be cleared
  2017-09-04  2:45     ` Xiao Ni
@ 2017-09-04  3:52       ` Xiao Ni
  2017-09-04  5:34         ` NeilBrown
  0 siblings, 1 reply; 14+ messages in thread
From: Xiao Ni @ 2017-09-04  3:52 UTC (permalink / raw)
  To: NeilBrown, linux-raid; +Cc: shli



On 09/04/2017 10:45 AM, Xiao Ni wrote:
>
>
> On 09/04/2017 10:16 AM, NeilBrown wrote:
>> On Sat, Sep 02 2017, Xiao Ni wrote:
>>
>>> Hi Neil and Shaohua
>>>
>>> I'm trying to reproduce the problem the problem 
>>> "raid5_finish_reshape is stuck at revalidate_disk".
>>> But there is a new stuck. I tried with 4.13.0-rc5 and latest mdadm. 
>>> The steps are:
>>>
>>> #!/bin/bash
>>> num=0
>>> while [ 1 ]
>>> do
>>>     echo "*************************$num"
>>>     mdadm -Ss
>>>     mdadm --create --run /dev/md0 --level 5 --metadata 1.2 
>>> --raid-devices 5 /dev/loop0 /dev/loop1 /dev/loop2 /dev/loop3 
>>> /dev/loop4 --spare-devices 1 /dev/loop5 --chunk 512
>>>     mdadm --wait /dev/md0
>>>     mkfs -t ext4 /dev/md0
>>>     mount -t ext4 /dev/md0 /mnt/md_test
>>>     dd if=/dev/urandom of=/mnt/md_test/testfile bs=1M count=100
>>>     mdadm --add /dev/md0 /dev/loop6
>>>     mdadm --grow --raid-devices 6 /dev/md0
>>>     dd if=/dev/urandom of=/mnt/md_test/testfile bs=1M count=1000
>>>     mdadm --wait /dev/md0
>>>     ((num++))
>>>     umount /dev/md0
>>> done
>>>
>>> The calltrace messages are:
>>> Sep  1 13:57:25 localhost kernel: INFO: task kworker/u8:4:21401 
>>> blocked for more than 120 seconds.
>>> Sep  1 13:57:25 localhost kernel:      Tainted: G OE   4.13.0-rc5 #2
>>> Sep  1 13:57:25 localhost kernel: "echo 0 > 
>>> /proc/sys/kernel/hung_task_timeout_secs" disables this message.
>>> Sep  1 13:57:25 localhost kernel: kworker/u8:4    D    0 21401      
>>> 2 0x00000080
>>> Sep  1 13:57:25 localhost kernel: Workqueue: writeback wb_workfn 
>>> (flush-9:0)
>>> Sep  1 13:57:25 localhost kernel: Call Trace:
>>> Sep  1 13:57:25 localhost kernel: __schedule+0x28d/0x890
>>> Sep  1 13:57:25 localhost kernel: schedule+0x36/0x80
>>> Sep  1 13:57:25 localhost kernel: md_write_start+0xf0/0x220 [md_mod]
>>> Sep  1 13:57:25 localhost kernel: ? remove_wait_queue+0x60/0x60
>>> Sep  1 13:57:25 localhost kernel: raid5_make_request+0x89/0x8b0 
>>> [raid456]
>>> Sep  1 13:57:25 localhost kernel: ? bio_split+0x5d/0x90
>>> Sep  1 13:57:25 localhost kernel: ? blk_queue_split+0xd2/0x630
>>> Sep  1 13:57:25 localhost kernel: ? remove_wait_queue+0x60/0x60
>>> Sep  1 13:57:25 localhost kernel: md_make_request+0xf5/0x260 [md_mod]
>>> ....
>>>
>>> All the processes send bio to md are stuck at md_write_start.
>>>          wait_event(mddev->sb_wait,
>>>                     !test_bit(MD_SB_CHANGE_PENDING, 
>>> &mddev->sb_flags) && !mddev->suspended);
>>>
>>>
>>> Sep  1 13:57:26 localhost kernel: INFO: task md0_reshape:23605 
>>> blocked for more than 120 seconds.
>>> Sep  1 13:57:26 localhost kernel:      Tainted: G OE   4.13.0-rc5 #2
>>> Sep  1 13:57:26 localhost kernel: "echo 0 > 
>>> /proc/sys/kernel/hung_task_timeout_secs" disables this message.
>>> Sep  1 13:57:26 localhost kernel: md0_reshape     D    0 23605      
>>> 2 0x00000080
>>> Sep  1 13:57:26 localhost kernel: Call Trace:
>>> Sep  1 13:57:26 localhost kernel: __schedule+0x28d/0x890
>>> Sep  1 13:57:26 localhost kernel: schedule+0x36/0x80
>>> Sep  1 13:57:26 localhost kernel: raid5_sync_request+0x2cf/0x370 
>>> [raid456]
>>> Sep  1 13:57:26 localhost kernel: ? remove_wait_queue+0x60/0x60
>>> Sep  1 13:57:26 localhost kernel: md_do_sync+0xafe/0xee0 [md_mod]
>>> Sep  1 13:57:26 localhost kernel: ? remove_wait_queue+0x60/0x60
>>> Sep  1 13:57:26 localhost kernel: md_thread+0x132/0x180 [md_mod]
>>> Sep  1 13:57:26 localhost kernel: kthread+0x109/0x140
>>> Sep  1 13:57:26 localhost kernel: ? find_pers+0x70/0x70 [md_mod]
>>> Sep  1 13:57:26 localhost kernel: ? kthread_park+0x60/0x60
>>> Sep  1 13:57:26 localhost kernel: ? do_syscall_64+0x67/0x150
>>> Sep  1 13:57:26 localhost kernel: ret_from_fork+0x25/0x30
>>>
>>> It stucks at:
>>>          /* Allow raid5_quiesce to complete */
>>>          wait_event(conf->wait_for_overlap, conf->quiesce != 2);
>>>
>>> Sep  1 13:57:26 localhost kernel: INFO: task mdadm:23613 blocked for 
>>> more than 120 seconds.
>>> Sep  1 13:57:26 localhost kernel:      Tainted: G OE   4.13.0-rc5 #2
>>> Sep  1 13:57:26 localhost kernel: "echo 0 > 
>>> /proc/sys/kernel/hung_task_timeout_secs" disables this message.
>>> Sep  1 13:57:26 localhost kernel: mdadm           D    0 23613      
>>> 1 0x00000080
>>> Sep  1 13:57:26 localhost kernel: Call Trace:
>>> Sep  1 13:57:26 localhost kernel: __schedule+0x28d/0x890
>>> Sep  1 13:57:26 localhost kernel: schedule+0x36/0x80
>>> Sep  1 13:57:26 localhost kernel: raid5_quiesce+0x274/0x2b0 [raid456]
>>> Sep  1 13:57:26 localhost kernel: ? remove_wait_queue+0x60/0x60
>>> Sep  1 13:57:26 localhost kernel: suspend_lo_store+0x82/0xe0 [md_mod]
>>> Sep  1 13:57:26 localhost kernel: md_attr_store+0x80/0xc0 [md_mod]
>>> Sep  1 13:57:26 localhost kernel: sysfs_kf_write+0x3a/0x50
>>> Sep  1 13:57:26 localhost kernel: kernfs_fop_write+0xff/0x180
>>> Sep  1 13:57:26 localhost kernel: __vfs_write+0x37/0x170
>>> Sep  1 13:57:26 localhost kernel: ? selinux_file_permission+0xe5/0x120
>>> Sep  1 13:57:26 localhost kernel: ? security_file_permission+0x3b/0xc0
>>> Sep  1 13:57:26 localhost kernel: vfs_write+0xb2/0x1b0
>>> Sep  1 13:57:26 localhost kernel: ? syscall_trace_enter+0x1d0/0x2b0
>>> Sep  1 13:57:26 localhost kernel: SyS_write+0x55/0xc0
>>> Sep  1 13:57:26 localhost kernel: do_syscall_64+0x67/0x150
>>>
>>> It's stuck at:
>>> conf->quiesce = 2;
>>> wait_event_cmd(conf->wait_for_quiescent,
>>>     atomic_read(&conf->active_stripes) == 0 &&
>>>     atomic_read(&conf->active_aligned_reads) == 0,
>>>     unlock_all_device_hash_locks_irq(conf),
>>>     lock_all_device_hash_locks_irq(conf));
>>> conf->quiesce = 1;
>>>
>>> [root@dell-pr1700-02 ~]# cat /proc/mdstat
>>> Personalities : [raid6] [raid5] [raid4]
>>> md0 : active raid5 loop6[7] loop4[6] loop5[5](S) loop3[3] loop2[2] 
>>> loop1[1] loop0[0]
>>>        2039808 blocks super 1.2 level 5, 512k chunk, algorithm 2 
>>> [6/6] [UUUUUU]
>>>        [>....................]  reshape =  0.5% (2560/509952) 
>>> finish=162985.0min speed=0K/sec
>>>        unused devices: <none>
>>>
>>>
>>> I can reproduce this. So I add some logs by printk to check 
>>> MD_SB_CHANGE_PENDING of mddev->flags
>>> and mddev->suspended.
>>> mddev->suspend : 0
>>> mddev->flags   : SP CHANGE PENDING is set
>>> conf->quiesce  : 2
>>>
>>> The I echo active > /sys/block/md0/md/array_state, reshape can start 
>>> and finish successfully.
>>>
>>> I notice there are some fixes for raid5 stuck problem. Not sure 
>>> whether it's introduced by those
>>> patches.
>>>
>>> Best Regards
>>> Xiao
>> Thanks for the detailed report.
>> All the processes that you have mentioned are waiting for
>> raid5_quiesce() to complete.
>> raid5_quiesce() must be waiting for either active_stripes or
>> active_aligned_reads to reach zero.  Most likely active_stripes.
>>
>> That suggests that either there is a stripe that is active and isn't
>> being handled for some reason, or the accounting of active stripes has
>> problems.
>>
>> The fact that setting the array to 'active' removes the hang is
>> interesting.  That would cause the threads blocked in md_write_start to
>> continue, but I cannot see how that would affect active_stripes as
>> threads waiting for md_write_start() never hold a stripe.  Doing that
>> could cause a stripe to be used, then released, which might trigger a
>> wakeup that had been lost.
> At first I doubt there are some active strips that haven't been 
> handled yet.
> But the process raid5d doesn't appear in calltrace. It wasn't stuck.
>>
>> It would be useful to see if ->active_stripes is 0 when the deadlock
>> happens.
>> One way to do that is to add a printk into raid5_status().  Then
>> whenever you "cat /proc/mdstat" the printk will trigger and tell you the
>> value of ->active_stripes.
>>
>> I haven't managed to reproduce this yet, or I would track it down my
>> self.
>
> Sure the testing is running again. I'll give the information later.

Hi Neil

I have reproduced this.

[root@dell-pr1700-02 ~]# cat /proc/mdstat
Personalities : [raid6] [raid5] [raid4]
md0 : active raid5 loop6[7] loop4[6] loop5[5](S) loop3[3] loop2[2] 
loop1[1] loop0[0]
       2039808 blocks super 1.2 level 5, 512k chunk, algorithm 2 [6/6] 
[UUUUUU]
       [>....................]  reshape =  0.5% (2560/509952) 
finish=2498.2min speed=3K/sec

unused devices: <none>
[root@dell-pr1700-02 ~]# dmesg
[ 3072.826251] suspend : 0
[ 3072.828683] SP CHANGE PENDING is set
[ 3072.832243] conf quiesce 2
[ 3072.834932] active stripes : 251
[ 3072.838139] active aligned reads : 0

[root@dell-pr1700-02 ~]# ps auxf | grep md
...
root      1682  0.0  0.0      0     0 ?        S<   23:02   0:00 \_ [md]
root      3103 97.2  0.0      0     0 ?        R    23:34  14:06  \_ 
[md0_raid5]
root      3123  0.0  0.0      0     0 ?        D    23:34   0:00  \_ 
[jbd2/md0-8]
root      3134  0.0  0.0      0     0 ?        D    23:34   0:00  \_ 
[md0_reshape]
....
root      3142  0.5  0.0 108996  2728 pts/1    D+   23:34   0:04 
|           \_ dd if=/dev/urandom of=/mnt/md_test/testfile bs=1M count=1000
root      3223  0.0  0.0 112672  2300 pts/0    S+   23:48 0:00          
\_ grep --color=auto md
root      3141  0.0  0.0  10132  5288 ?        DLs  23:34   0:00 
/usr/sbin/mdadm --grow --continue /dev/md0

It's strange: raid5d is running but it doesn't handle the active stripes.
Do you need more information?

Regards
Xiao


>
> Regards
> Xiao
>>
>> Thanks,
>> NeilBrown
>
> -- 
> To unsubscribe from this list: send the line "unsubscribe linux-raid" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: Stuck in md_write_start because MD_SB_CHANGE_PENDING can't be cleared
  2017-09-04  3:52       ` Xiao Ni
@ 2017-09-04  5:34         ` NeilBrown
  2017-09-04  7:36           ` Xiao Ni
  0 siblings, 1 reply; 14+ messages in thread
From: NeilBrown @ 2017-09-04  5:34 UTC (permalink / raw)
  To: Xiao Ni, linux-raid; +Cc: shli

[-- Attachment #1: Type: text/plain, Size: 10172 bytes --]

On Mon, Sep 04 2017, Xiao Ni wrote:

> On 09/04/2017 10:45 AM, Xiao Ni wrote:
>>
>>
>> On 09/04/2017 10:16 AM, NeilBrown wrote:
>>> On Sat, Sep 02 2017, Xiao Ni wrote:
>>>
>>>> Hi Neil and Shaohua
>>>>
>>>> I'm trying to reproduce the problem 
>>>> "raid5_finish_reshape is stuck at revalidate_disk".
>>>> But there is a new stuck. I tried with 4.13.0-rc5 and latest mdadm. 
>>>> The steps are:
>>>>
>>>> #!/bin/bash
>>>> num=0
>>>> while [ 1 ]
>>>> do
>>>>     echo "*************************$num"
>>>>     mdadm -Ss
>>>>     mdadm --create --run /dev/md0 --level 5 --metadata 1.2 
>>>> --raid-devices 5 /dev/loop0 /dev/loop1 /dev/loop2 /dev/loop3 
>>>> /dev/loop4 --spare-devices 1 /dev/loop5 --chunk 512
>>>>     mdadm --wait /dev/md0
>>>>     mkfs -t ext4 /dev/md0
>>>>     mount -t ext4 /dev/md0 /mnt/md_test
>>>>     dd if=/dev/urandom of=/mnt/md_test/testfile bs=1M count=100
>>>>     mdadm --add /dev/md0 /dev/loop6
>>>>     mdadm --grow --raid-devices 6 /dev/md0
>>>>     dd if=/dev/urandom of=/mnt/md_test/testfile bs=1M count=1000
>>>>     mdadm --wait /dev/md0
>>>>     ((num++))
>>>>     umount /dev/md0
>>>> done
>>>>
>>>> The calltrace messages are:
>>>> Sep  1 13:57:25 localhost kernel: INFO: task kworker/u8:4:21401 
>>>> blocked for more than 120 seconds.
>>>> Sep  1 13:57:25 localhost kernel:      Tainted: G OE   4.13.0-rc5 #2
>>>> Sep  1 13:57:25 localhost kernel: "echo 0 > 
>>>> /proc/sys/kernel/hung_task_timeout_secs" disables this message.
>>>> Sep  1 13:57:25 localhost kernel: kworker/u8:4    D    0 21401      
>>>> 2 0x00000080
>>>> Sep  1 13:57:25 localhost kernel: Workqueue: writeback wb_workfn 
>>>> (flush-9:0)
>>>> Sep  1 13:57:25 localhost kernel: Call Trace:
>>>> Sep  1 13:57:25 localhost kernel: __schedule+0x28d/0x890
>>>> Sep  1 13:57:25 localhost kernel: schedule+0x36/0x80
>>>> Sep  1 13:57:25 localhost kernel: md_write_start+0xf0/0x220 [md_mod]
>>>> Sep  1 13:57:25 localhost kernel: ? remove_wait_queue+0x60/0x60
>>>> Sep  1 13:57:25 localhost kernel: raid5_make_request+0x89/0x8b0 
>>>> [raid456]
>>>> Sep  1 13:57:25 localhost kernel: ? bio_split+0x5d/0x90
>>>> Sep  1 13:57:25 localhost kernel: ? blk_queue_split+0xd2/0x630
>>>> Sep  1 13:57:25 localhost kernel: ? remove_wait_queue+0x60/0x60
>>>> Sep  1 13:57:25 localhost kernel: md_make_request+0xf5/0x260 [md_mod]
>>>> ....
>>>>
>>>> All the processes sending bios to md are stuck at md_write_start.
>>>>          wait_event(mddev->sb_wait,
>>>>                     !test_bit(MD_SB_CHANGE_PENDING, 
>>>> &mddev->sb_flags) && !mddev->suspended);
>>>>
>>>>
>>>> Sep  1 13:57:26 localhost kernel: INFO: task md0_reshape:23605 
>>>> blocked for more than 120 seconds.
>>>> Sep  1 13:57:26 localhost kernel:      Tainted: G OE   4.13.0-rc5 #2
>>>> Sep  1 13:57:26 localhost kernel: "echo 0 > 
>>>> /proc/sys/kernel/hung_task_timeout_secs" disables this message.
>>>> Sep  1 13:57:26 localhost kernel: md0_reshape     D    0 23605      
>>>> 2 0x00000080
>>>> Sep  1 13:57:26 localhost kernel: Call Trace:
>>>> Sep  1 13:57:26 localhost kernel: __schedule+0x28d/0x890
>>>> Sep  1 13:57:26 localhost kernel: schedule+0x36/0x80
>>>> Sep  1 13:57:26 localhost kernel: raid5_sync_request+0x2cf/0x370 
>>>> [raid456]
>>>> Sep  1 13:57:26 localhost kernel: ? remove_wait_queue+0x60/0x60
>>>> Sep  1 13:57:26 localhost kernel: md_do_sync+0xafe/0xee0 [md_mod]
>>>> Sep  1 13:57:26 localhost kernel: ? remove_wait_queue+0x60/0x60
>>>> Sep  1 13:57:26 localhost kernel: md_thread+0x132/0x180 [md_mod]
>>>> Sep  1 13:57:26 localhost kernel: kthread+0x109/0x140
>>>> Sep  1 13:57:26 localhost kernel: ? find_pers+0x70/0x70 [md_mod]
>>>> Sep  1 13:57:26 localhost kernel: ? kthread_park+0x60/0x60
>>>> Sep  1 13:57:26 localhost kernel: ? do_syscall_64+0x67/0x150
>>>> Sep  1 13:57:26 localhost kernel: ret_from_fork+0x25/0x30
>>>>
>>>> It is stuck at:
>>>>          /* Allow raid5_quiesce to complete */
>>>>          wait_event(conf->wait_for_overlap, conf->quiesce != 2);
>>>>
>>>> Sep  1 13:57:26 localhost kernel: INFO: task mdadm:23613 blocked for 
>>>> more than 120 seconds.
>>>> Sep  1 13:57:26 localhost kernel:      Tainted: G OE   4.13.0-rc5 #2
>>>> Sep  1 13:57:26 localhost kernel: "echo 0 > 
>>>> /proc/sys/kernel/hung_task_timeout_secs" disables this message.
>>>> Sep  1 13:57:26 localhost kernel: mdadm           D    0 23613      
>>>> 1 0x00000080
>>>> Sep  1 13:57:26 localhost kernel: Call Trace:
>>>> Sep  1 13:57:26 localhost kernel: __schedule+0x28d/0x890
>>>> Sep  1 13:57:26 localhost kernel: schedule+0x36/0x80
>>>> Sep  1 13:57:26 localhost kernel: raid5_quiesce+0x274/0x2b0 [raid456]
>>>> Sep  1 13:57:26 localhost kernel: ? remove_wait_queue+0x60/0x60
>>>> Sep  1 13:57:26 localhost kernel: suspend_lo_store+0x82/0xe0 [md_mod]
>>>> Sep  1 13:57:26 localhost kernel: md_attr_store+0x80/0xc0 [md_mod]
>>>> Sep  1 13:57:26 localhost kernel: sysfs_kf_write+0x3a/0x50
>>>> Sep  1 13:57:26 localhost kernel: kernfs_fop_write+0xff/0x180
>>>> Sep  1 13:57:26 localhost kernel: __vfs_write+0x37/0x170
>>>> Sep  1 13:57:26 localhost kernel: ? selinux_file_permission+0xe5/0x120
>>>> Sep  1 13:57:26 localhost kernel: ? security_file_permission+0x3b/0xc0
>>>> Sep  1 13:57:26 localhost kernel: vfs_write+0xb2/0x1b0
>>>> Sep  1 13:57:26 localhost kernel: ? syscall_trace_enter+0x1d0/0x2b0
>>>> Sep  1 13:57:26 localhost kernel: SyS_write+0x55/0xc0
>>>> Sep  1 13:57:26 localhost kernel: do_syscall_64+0x67/0x150
>>>>
>>>> It's stuck at:
>>>> conf->quiesce = 2;
>>>> wait_event_cmd(conf->wait_for_quiescent,
>>>>     atomic_read(&conf->active_stripes) == 0 &&
>>>>     atomic_read(&conf->active_aligned_reads) == 0,
>>>>     unlock_all_device_hash_locks_irq(conf),
>>>>     lock_all_device_hash_locks_irq(conf));
>>>> conf->quiesce = 1;
>>>>
>>>> [root@dell-pr1700-02 ~]# cat /proc/mdstat
>>>> Personalities : [raid6] [raid5] [raid4]
>>>> md0 : active raid5 loop6[7] loop4[6] loop5[5](S) loop3[3] loop2[2] 
>>>> loop1[1] loop0[0]
>>>>        2039808 blocks super 1.2 level 5, 512k chunk, algorithm 2 
>>>> [6/6] [UUUUUU]
>>>>        [>....................]  reshape =  0.5% (2560/509952) 
>>>> finish=162985.0min speed=0K/sec
>>>>        unused devices: <none>
>>>>
>>>>
>>>> I can reproduce this. So I add some logs by printk to check 
>>>> MD_SB_CHANGE_PENDING of mddev->flags
>>>> and mddev->suspended.
>>>> mddev->suspend : 0
>>>> mddev->flags   : SP CHANGE PENDING is set
>>>> conf->quiesce  : 2
>>>>
>>>> Then I echo active > /sys/block/md0/md/array_state, and reshape can start 
>>>> and finish successfully.
>>>>
>>>> I notice there are some fixes for raid5 stuck problem. Not sure 
>>>> whether it's introduced by those
>>>> patches.
>>>>
>>>> Best Regards
>>>> Xiao
>>> Thanks for the detailed report.
>>> All the processes that you have mentioned are waiting for
>>> raid5_quiesce() to complete.
>>> raid5_quiesce() must be waiting for either active_stripes or
>>> active_aligned_reads to reach zero.  Most likely active_stripes.
>>>
>>> That suggests that either there is a stripe that is active and isn't
>>> being handled for some reason, or the accounting of active stripes has
>>> problems.
>>>
>>> The fact that setting the array to 'active' removes the hang is
>>> interesting.  That would cause the threads blocked in md_write_start to
>>> continue, but I cannot see how that would affect active_stripes as
>>> threads waiting for md_write_start() never hold a stripe.  Doing that
>>> could cause a stripe to be used, then released, which might trigger a
>>> wakeup that had been lost.
>> At first I doubt there are some active strips that haven't been 
>> handled yet.
>> But the process raid5d doesn't appear in calltrace. It wasn't stuck.
>>>
>>> It would be useful to see if ->active_stripes is 0 when the deadlock
>>> happens.
>>> One way to do that is to add a printk into raid5_status().  Then
>>> whenever you "cat /proc/mdstat" the printk will trigger and tell you the
>>> value of ->active_stripes.
>>>
>>> I haven't managed to reproduce this yet, or I would track it down my
>>> self.
>>
>> Sure the testing is running again. I'll give the information later.
>
> Hi Neil
>
> I have reproduced this.
>
> [root@dell-pr1700-02 ~]# cat /proc/mdstat
> Personalities : [raid6] [raid5] [raid4]
> md0 : active raid5 loop6[7] loop4[6] loop5[5](S) loop3[3] loop2[2] 
> loop1[1] loop0[0]
>        2039808 blocks super 1.2 level 5, 512k chunk, algorithm 2 [6/6] 
> [UUUUUU]
>        [>....................]  reshape =  0.5% (2560/509952) 
> finish=2498.2min speed=3K/sec
>
> unused devices: <none>
> [root@dell-pr1700-02 ~]# dmesg
> [ 3072.826251] suspend : 0
> [ 3072.828683] SP CHANGE PENDING is set
> [ 3072.832243] conf quiesce 2
> [ 3072.834932] active stripes : 251
> [ 3072.838139] active aligned reads : 0
>
> [root@dell-pr1700-02 ~]# ps auxf | grep md
> ...
> root      1682  0.0  0.0      0     0 ?        S<   23:02   0:00 \_ [md]
> root      3103 97.2  0.0      0     0 ?        R    23:34  14:06  \_ 
> [md0_raid5]
> root      3123  0.0  0.0      0     0 ?        D    23:34   0:00  \_ 
> [jbd2/md0-8]
> root      3134  0.0  0.0      0     0 ?        D    23:34   0:00  \_ 
> [md0_reshape]
> ....
> root      3142  0.5  0.0 108996  2728 pts/1    D+   23:34   0:04 
> |           \_ dd if=/dev/urandom of=/mnt/md_test/testfile bs=1M count=1000
> root      3223  0.0  0.0 112672  2300 pts/0    S+   23:48 0:00          
> \_ grep --color=auto md
> root      3141  0.0  0.0  10132  5288 ?        DLs  23:34   0:00 
> /usr/sbin/mdadm --grow --continue /dev/md0
>
> It's strange: raid5d is running but it doesn't handle active stripes.
> Do you need more information?

It would help a lot to know what md0_raid5 is doing.  It appears to be
spinning and that is presumably the cause of the problem.

You could try

  echo file raid5.c +p > /sys/kernel/debug/dynamic_debug/control

or see if /proc/3103/stack contains anything useful.

Maybe perf could be used to see where it is spending its time.

NeilBrown

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: Stuck in md_write_start because MD_SB_CHANGE_PENDING can't be cleared
  2017-09-04  5:34         ` NeilBrown
@ 2017-09-04  7:36           ` Xiao Ni
  2017-09-05  1:36             ` NeilBrown
  0 siblings, 1 reply; 14+ messages in thread
From: Xiao Ni @ 2017-09-04  7:36 UTC (permalink / raw)
  To: NeilBrown, linux-raid; +Cc: shli



On 09/04/2017 01:34 PM, NeilBrown wrote:
> On Mon, Sep 04 2017, Xiao Ni wrote:
>
>> On 09/04/2017 10:45 AM, Xiao Ni wrote:
>>>
>>> On 09/04/2017 10:16 AM, NeilBrown wrote:
>>>> On Sat, Sep 02 2017, Xiao Ni wrote:
>>>>
>>>>> Hi Neil and Shaohua
>>>>>
>>>>> I'm trying to reproduce the problem
>>>>> "raid5_finish_reshape is stuck at revalidate_disk".
>>>>> But there is a new stuck. I tried with 4.13.0-rc5 and latest mdadm.
>>>>> The steps are:
>>>>>
>>>>> #!/bin/bash
>>>>> num=0
>>>>> while [ 1 ]
>>>>> do
>>>>>      echo "*************************$num"
>>>>>      mdadm -Ss
>>>>>      mdadm --create --run /dev/md0 --level 5 --metadata 1.2
>>>>> --raid-devices 5 /dev/loop0 /dev/loop1 /dev/loop2 /dev/loop3
>>>>> /dev/loop4 --spare-devices 1 /dev/loop5 --chunk 512
>>>>>      mdadm --wait /dev/md0
>>>>>      mkfs -t ext4 /dev/md0
>>>>>      mount -t ext4 /dev/md0 /mnt/md_test
>>>>>      dd if=/dev/urandom of=/mnt/md_test/testfile bs=1M count=100
>>>>>      mdadm --add /dev/md0 /dev/loop6
>>>>>      mdadm --grow --raid-devices 6 /dev/md0
>>>>>      dd if=/dev/urandom of=/mnt/md_test/testfile bs=1M count=1000
>>>>>      mdadm --wait /dev/md0
>>>>>      ((num++))
>>>>>      umount /dev/md0
>>>>> done
>>>>>
>>>>> The calltrace messages are:
>>>>> Sep  1 13:57:25 localhost kernel: INFO: task kworker/u8:4:21401
>>>>> blocked for more than 120 seconds.
>>>>> Sep  1 13:57:25 localhost kernel:      Tainted: G OE   4.13.0-rc5 #2
>>>>> Sep  1 13:57:25 localhost kernel: "echo 0 >
>>>>> /proc/sys/kernel/hung_task_timeout_secs" disables this message.
>>>>> Sep  1 13:57:25 localhost kernel: kworker/u8:4    D    0 21401
>>>>> 2 0x00000080
>>>>> Sep  1 13:57:25 localhost kernel: Workqueue: writeback wb_workfn
>>>>> (flush-9:0)
>>>>> Sep  1 13:57:25 localhost kernel: Call Trace:
>>>>> Sep  1 13:57:25 localhost kernel: __schedule+0x28d/0x890
>>>>> Sep  1 13:57:25 localhost kernel: schedule+0x36/0x80
>>>>> Sep  1 13:57:25 localhost kernel: md_write_start+0xf0/0x220 [md_mod]
>>>>> Sep  1 13:57:25 localhost kernel: ? remove_wait_queue+0x60/0x60
>>>>> Sep  1 13:57:25 localhost kernel: raid5_make_request+0x89/0x8b0
>>>>> [raid456]
>>>>> Sep  1 13:57:25 localhost kernel: ? bio_split+0x5d/0x90
>>>>> Sep  1 13:57:25 localhost kernel: ? blk_queue_split+0xd2/0x630
>>>>> Sep  1 13:57:25 localhost kernel: ? remove_wait_queue+0x60/0x60
>>>>> Sep  1 13:57:25 localhost kernel: md_make_request+0xf5/0x260 [md_mod]
>>>>> ....
>>>>>
>>>>> All the processes sending bios to md are stuck at md_write_start.
>>>>>           wait_event(mddev->sb_wait,
>>>>>                      !test_bit(MD_SB_CHANGE_PENDING,
>>>>> &mddev->sb_flags) && !mddev->suspended);
>>>>>
>>>>>
>>>>> Sep  1 13:57:26 localhost kernel: INFO: task md0_reshape:23605
>>>>> blocked for more than 120 seconds.
>>>>> Sep  1 13:57:26 localhost kernel:      Tainted: G OE   4.13.0-rc5 #2
>>>>> Sep  1 13:57:26 localhost kernel: "echo 0 >
>>>>> /proc/sys/kernel/hung_task_timeout_secs" disables this message.
>>>>> Sep  1 13:57:26 localhost kernel: md0_reshape     D    0 23605
>>>>> 2 0x00000080
>>>>> Sep  1 13:57:26 localhost kernel: Call Trace:
>>>>> Sep  1 13:57:26 localhost kernel: __schedule+0x28d/0x890
>>>>> Sep  1 13:57:26 localhost kernel: schedule+0x36/0x80
>>>>> Sep  1 13:57:26 localhost kernel: raid5_sync_request+0x2cf/0x370
>>>>> [raid456]
>>>>> Sep  1 13:57:26 localhost kernel: ? remove_wait_queue+0x60/0x60
>>>>> Sep  1 13:57:26 localhost kernel: md_do_sync+0xafe/0xee0 [md_mod]
>>>>> Sep  1 13:57:26 localhost kernel: ? remove_wait_queue+0x60/0x60
>>>>> Sep  1 13:57:26 localhost kernel: md_thread+0x132/0x180 [md_mod]
>>>>> Sep  1 13:57:26 localhost kernel: kthread+0x109/0x140
>>>>> Sep  1 13:57:26 localhost kernel: ? find_pers+0x70/0x70 [md_mod]
>>>>> Sep  1 13:57:26 localhost kernel: ? kthread_park+0x60/0x60
>>>>> Sep  1 13:57:26 localhost kernel: ? do_syscall_64+0x67/0x150
>>>>> Sep  1 13:57:26 localhost kernel: ret_from_fork+0x25/0x30
>>>>>
>>>>> It is stuck at:
>>>>>           /* Allow raid5_quiesce to complete */
>>>>>           wait_event(conf->wait_for_overlap, conf->quiesce != 2);
>>>>>
>>>>> Sep  1 13:57:26 localhost kernel: INFO: task mdadm:23613 blocked for
>>>>> more than 120 seconds.
>>>>> Sep  1 13:57:26 localhost kernel:      Tainted: G OE   4.13.0-rc5 #2
>>>>> Sep  1 13:57:26 localhost kernel: "echo 0 >
>>>>> /proc/sys/kernel/hung_task_timeout_secs" disables this message.
>>>>> Sep  1 13:57:26 localhost kernel: mdadm           D    0 23613
>>>>> 1 0x00000080
>>>>> Sep  1 13:57:26 localhost kernel: Call Trace:
>>>>> Sep  1 13:57:26 localhost kernel: __schedule+0x28d/0x890
>>>>> Sep  1 13:57:26 localhost kernel: schedule+0x36/0x80
>>>>> Sep  1 13:57:26 localhost kernel: raid5_quiesce+0x274/0x2b0 [raid456]
>>>>> Sep  1 13:57:26 localhost kernel: ? remove_wait_queue+0x60/0x60
>>>>> Sep  1 13:57:26 localhost kernel: suspend_lo_store+0x82/0xe0 [md_mod]
>>>>> Sep  1 13:57:26 localhost kernel: md_attr_store+0x80/0xc0 [md_mod]
>>>>> Sep  1 13:57:26 localhost kernel: sysfs_kf_write+0x3a/0x50
>>>>> Sep  1 13:57:26 localhost kernel: kernfs_fop_write+0xff/0x180
>>>>> Sep  1 13:57:26 localhost kernel: __vfs_write+0x37/0x170
>>>>> Sep  1 13:57:26 localhost kernel: ? selinux_file_permission+0xe5/0x120
>>>>> Sep  1 13:57:26 localhost kernel: ? security_file_permission+0x3b/0xc0
>>>>> Sep  1 13:57:26 localhost kernel: vfs_write+0xb2/0x1b0
>>>>> Sep  1 13:57:26 localhost kernel: ? syscall_trace_enter+0x1d0/0x2b0
>>>>> Sep  1 13:57:26 localhost kernel: SyS_write+0x55/0xc0
>>>>> Sep  1 13:57:26 localhost kernel: do_syscall_64+0x67/0x150
>>>>>
>>>>> It's stuck at:
>>>>> conf->quiesce = 2;
>>>>> wait_event_cmd(conf->wait_for_quiescent,
>>>>>      atomic_read(&conf->active_stripes) == 0 &&
>>>>>      atomic_read(&conf->active_aligned_reads) == 0,
>>>>>      unlock_all_device_hash_locks_irq(conf),
>>>>>      lock_all_device_hash_locks_irq(conf));
>>>>> conf->quiesce = 1;
>>>>>
>>>>> [root@dell-pr1700-02 ~]# cat /proc/mdstat
>>>>> Personalities : [raid6] [raid5] [raid4]
>>>>> md0 : active raid5 loop6[7] loop4[6] loop5[5](S) loop3[3] loop2[2]
>>>>> loop1[1] loop0[0]
>>>>>         2039808 blocks super 1.2 level 5, 512k chunk, algorithm 2
>>>>> [6/6] [UUUUUU]
>>>>>         [>....................]  reshape =  0.5% (2560/509952)
>>>>> finish=162985.0min speed=0K/sec
>>>>>         unused devices: <none>
>>>>>
>>>>>
>>>>> I can reproduce this. So I add some logs by printk to check
>>>>> MD_SB_CHANGE_PENDING of mddev->flags
>>>>> and mddev->suspended.
>>>>> mddev->suspend : 0
>>>>> mddev->flags   : SP CHANGE PENDING is set
>>>>> conf->quiesce  : 2
>>>>>
>>>>> Then I echo active > /sys/block/md0/md/array_state, and reshape can start
>>>>> and finish successfully.
>>>>>
>>>>> I notice there are some fixes for raid5 stuck problem. Not sure
>>>>> whether it's introduced by those
>>>>> patches.
>>>>>
>>>>> Best Regards
>>>>> Xiao
>>>> Thanks for the detailed report.
>>>> All the processes that you have mentioned are waiting for
>>>> raid5_quiesce() to complete.
>>>> raid5_quiesce() must be waiting for either active_stripes or
>>>> active_aligned_reads to reach zero.  Most likely active_stripes.
>>>>
>>>> That suggests that either there is a stripe that is active and isn't
>>>> being handled for some reason, or the accounting of active stripes has
>>>> problems.
>>>>
>>>> The fact that setting the array to 'active' removes the hang is
>>>> interesting.  That would cause the threads blocked in md_write_start to
>>>> continue, but I cannot see how that would affect active_stripes as
>>>> threads waiting for md_write_start() never hold a stripe.  Doing that
>>>> could cause a stripe to be used, then released, which might trigger a
>>>> wakeup that had been lost.
>>> At first I doubt there are some active strips that haven't been
>>> handled yet.
>>> But the process raid5d doesn't appear in calltrace. It wasn't stuck.
>>>> It would be useful to see if ->active_stripes is 0 when the deadlock
>>>> happens.
>>>> One way to do that is to add a printk into raid5_status().  Then
>>>> whenever you "cat /proc/mdstat" the printk will trigger and tell you the
>>>> value of ->active_stripes.
>>>>
>>>> I haven't managed to reproduce this yet, or I would track it down my
>>>> self.
>>> Sure the testing is running again. I'll give the information later.
>> Hi Neil
>>
>> I have reproduced this.
>>
>> [root@dell-pr1700-02 ~]# cat /proc/mdstat
>> Personalities : [raid6] [raid5] [raid4]
>> md0 : active raid5 loop6[7] loop4[6] loop5[5](S) loop3[3] loop2[2]
>> loop1[1] loop0[0]
>>         2039808 blocks super 1.2 level 5, 512k chunk, algorithm 2 [6/6]
>> [UUUUUU]
>>         [>....................]  reshape =  0.5% (2560/509952)
>> finish=2498.2min speed=3K/sec
>>
>> unused devices: <none>
>> [root@dell-pr1700-02 ~]# dmesg
>> [ 3072.826251] suspend : 0
>> [ 3072.828683] SP CHANGE PENDING is set
>> [ 3072.832243] conf quiesce 2
>> [ 3072.834932] active stripes : 251
>> [ 3072.838139] active aligned reads : 0
>>
>> [root@dell-pr1700-02 ~]# ps auxf | grep md
>> ...
>> root      1682  0.0  0.0      0     0 ?        S<   23:02   0:00 \_ [md]
>> root      3103 97.2  0.0      0     0 ?        R    23:34  14:06  \_
>> [md0_raid5]
>> root      3123  0.0  0.0      0     0 ?        D    23:34   0:00  \_
>> [jbd2/md0-8]
>> root      3134  0.0  0.0      0     0 ?        D    23:34   0:00  \_
>> [md0_reshape]
>> ....
>> root      3142  0.5  0.0 108996  2728 pts/1    D+   23:34   0:04
>> |           \_ dd if=/dev/urandom of=/mnt/md_test/testfile bs=1M count=1000
>> root      3223  0.0  0.0 112672  2300 pts/0    S+   23:48 0:00
>> \_ grep --color=auto md
>> root      3141  0.0  0.0  10132  5288 ?        DLs  23:34   0:00
>> /usr/sbin/mdadm --grow --continue /dev/md0
>>
>> It's strange: raid5d is running but it doesn't handle active stripes.
>> Do you need more information?
> It would help a lot to know what md0_raid5 is doing.  It appears to be
> spinning and that is presumably the cause of the problem.
>
> You could try
>
>    echo file raid5.c +p > /sys/kernel/debug/dynamic_debug/control

[10331.356436] __get_priority_stripe: handle: busy hold: empty 
full_writes: 0 bypass_count: 0
[10331.356437] __get_priority_stripe: handle: busy hold: empty 
full_writes: 0 bypass_count: 0
[10331.356438] __get_priority_stripe: handle: busy hold: empty 
full_writes: 0 bypass_count: 0
[10331.356439] __get_priority_stripe: handle: busy hold: empty 
full_writes: 0 bypass_count: 0
[10331.356440] __get_priority_stripe: handle: busy hold: empty 
full_writes: 0 bypass_count: 0
[10331.356441] __get_priority_stripe: handle: busy hold: empty 
full_writes: 0 bypass_count: 0
[10331.356442] __get_priority_stripe: handle: busy hold: empty 
full_writes: 0 bypass_count: 0
[10331.356443] __get_priority_stripe: handle: busy hold: empty 
full_writes: 0 bypass_count: 0
[10331.356444] handling stripe 7656, state=0x801 cnt=1, pd_idx=2, qd_idx=-1
, check:0, reconstruct:0
[10331.356445] check 4: state 0x10 read           (null) write           
(null) written           (null)
[10331.356446] check 3: state 0x10 read           (null) write           
(null) written           (null)
[10331.356447] check 2: state 0x10 read           (null) write           
(null) written           (null)
[10331.356448] check 1: state 0x10 read           (null) write           
(null) written           (null)
[10331.356449] check 0: state 0x10 read           (null) write           
(null) written           (null)
[10331.356450] handling stripe 6272, state=0x2801 cnt=1, pd_idx=3, qd_idx=-1
, check:0, reconstruct:0
[10331.356451] check 4: state 0x11 read           (null) write           
(null) written           (null)
[10331.356452] check 3: state 0x11 read           (null) write           
(null) written           (null)
[10331.356453] check 2: state 0x11 read           (null) write           
(null) written           (null)
[10331.356454] check 1: state 0x11 read           (null) write           
(null) written           (null)
[10331.356456] check 0: state 0x11 read           (null) write           
(null) written           (null)
[10331.356457] handling stripe 6280, state=0x2801 cnt=1, pd_idx=3, qd_idx=-1
, check:0, reconstruct:0
[10331.356458] check 4: state 0x11 read           (null) write           
(null) written           (null)
[10331.356459] check 3: state 0x11 read           (null) write           
(null) written           (null)
[10331.356460] check 2: state 0x11 read           (null) write           
(null) written           (null)
[10331.356461] check 1: state 0x11 read           (null) write           
(null) written           (null)
[10331.356462] check 0: state 0x11 read           (null) write           
(null) written           (null)
[10331.356463] handling stripe 6288, state=0x2801 cnt=1, pd_idx=3, qd_idx=-1
, check:0, reconstruct:0
[10331.356464] check 4: state 0x11 read           (null) write           
(null) written           (null)
[10331.356465] check 3: state 0x11 read           (null) write           
(null) written           (null)
[10331.356466] check 2: state 0x11 read           (null) write           
(null) written           (null)
[10331.356467] check 1: state 0x11 read           (null) write           
(null) written           (null)
[10331.356468] check 0: state 0x11 read           (null) write           
(null) written           (null)
[10331.356469] handling stripe 6296, state=0x2801 cnt=1, pd_idx=3, qd_idx=-1
, check:0, reconstruct:0
[10331.356470] check 4: state 0x11 read           (null) write           
(null) written           (null)
[10331.356473] check 3: state 0x11 read           (null) write           
(null) written           (null)
[10331.356475] check 2: state 0x11 read           (null) write           
(null) written           (null)
[10331.356476] check 1: state 0x11 read           (null) write           
(null) written           (null)
[10331.356477] check 0: state 0x11 read           (null) write           
(null) written           (null)
[10331.356479] handling stripe 6304, state=0x2801 cnt=1, pd_idx=3, qd_idx=-1
, check:0, reconstruct:0
[10331.356481] check 4: state 0x11 read           (null) write           
(null) written           (null)
[10331.356482] check 3: state 0x11 read           (null) write           
(null) written           (null)
[10331.356484] check 2: state 0x11 read           (null) write           
(null) written           (null)
[10331.356485] check 1: state 0x11 read           (null) write           
(null) written           (null)
[10331.356486] check 0: state 0x11 read           (null) write           
(null) written           (null)
[10331.356488] handling stripe 6312, state=0x2801 cnt=1, pd_idx=3, qd_idx=-1
, check:0, reconstruct:0
[10331.356489] check 4: state 0x11 read           (null) write           
(null) written           (null)
[10331.356491] check 3: state 0x11 read           (null) write           
(null) written           (null)
[10331.356492] check 2: state 0x11 read           (null) write           
(null) written           (null)
[10331.356493] check 1: state 0x11 read           (null) write           
(null) written           (null)
[10331.356495] check 0: state 0x11 read           (null) write           
(null) written           (null)
[10331.356497] handling stripe 6320, state=0x2801 cnt=1, pd_idx=3, qd_idx=-1
, check:0, reconstruct:0
[10331.356498] check 4: state 0x11 read           (null) write           
(null) written           (null)
[10331.356499] check 3: state 0x11 read           (null) write           
(null) written           (null)
[10331.356501] check 2: state 0x11 read           (null) write           
(null) written           (null)
[10331.356502] check 1: state 0x11 read           (null) write           
(null) written           (null)
[10331.356503] check 0: state 0x11 read           (null) write           
(null) written           (null)


In function handle_stripe:
4697         if (s.handle_bad_blocks ||
4698             test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
4699                 set_bit(STRIPE_HANDLE, &sh->state);
4700                 goto finish;
4701         }

Because MD_SB_CHANGE_PENDING is set, so the stripes can't be handled.


>
> or see if /proc/3103/stack contains anything useful.
[root@dell-pr1700-02 ~]# cat /proc/3103/stack
[<ffffffffffffffff>] 0xffffffffffffffff

There are no useful messages.

Regards
Xiao

>
> Maybe perf could be used to see where it is spending its time.
>
> NeilBrown


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: Stuck in md_write_start because MD_SB_CHANGE_PENDING can't be cleared
  2017-09-04  7:36           ` Xiao Ni
@ 2017-09-05  1:36             ` NeilBrown
  2017-09-05  2:15               ` Xiao Ni
  2017-09-30  9:44               ` Xiao Ni
  0 siblings, 2 replies; 14+ messages in thread
From: NeilBrown @ 2017-09-05  1:36 UTC (permalink / raw)
  To: Xiao Ni, linux-raid; +Cc: shli

[-- Attachment #1: Type: text/plain, Size: 2999 bytes --]

On Mon, Sep 04 2017, Xiao Ni wrote:

>
>
> In function handle_stripe:
> 4697         if (s.handle_bad_blocks ||
> 4698             test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
> 4699                 set_bit(STRIPE_HANDLE, &sh->state);
> 4700                 goto finish;
> 4701         }
>
> Because MD_SB_CHANGE_PENDING is set, so the stripes can't be handled.
>

Right, of course.  I see what is happening now.

- raid5d cannot complete stripes until the metadata is written
- the metadata cannot be written until raid5d gets the mddev_lock
- mddev_lock is held by the write to suspend_hi
- the write to suspend_hi is waiting for raid5_quiesce
- raid5_quiesce is waiting for some stripes to complete.

We could declare that ->quiesce(, 1) cannot be called while holding the
lock.
We could possible allow it but only if md_update_sb() is called first,
though that might still be racy.

->quiesce(, 1) is currently called from:
 mddev_suspend
 suspend_lo_store
 suspend_hi_store
 __md_stop_writes
 mddev_detach
 set_bitmap_file
 update_array_info (when setting/removing internal bitmap)
 md_do_sync

and most of those are call with the lock held, or take the lock.

Maybe we should *require* that mddev_lock is held when calling
->quiesce() and have ->quiesce() do the metadata update.

Something like the following maybe.  Can you test it?
Thanks,
NeilBrown

diff --git a/drivers/md/md.c b/drivers/md/md.c
index b01e458d31e9..999ccf08c5db 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5805,9 +5805,11 @@ void md_stop(struct mddev *mddev)
 	/* stop the array and free an attached data structures.
 	 * This is called from dm-raid
 	 */
+	mddev_lock_nointr(mddev);
 	__md_stop(mddev);
 	if (mddev->bio_set)
 		bioset_free(mddev->bio_set);
+	mddev_unlock(mddev);
 }
 
 EXPORT_SYMBOL_GPL(md_stop);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 0fc2748aaf95..cde5a82eb404 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4316,6 +4316,8 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
 
 			/* place all the copies on one channel */
 			init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
+			WARN_ON(sh2->dev[dd_idx].page != sh2->dev[dd_idx].orig_page);
+			WARN_ON(sh->dev[i].page != sh->dev[i].orig_page);
 			tx = async_memcpy(sh2->dev[dd_idx].page,
 					  sh->dev[i].page, 0, 0, STRIPE_SIZE,
 					  &submit);
@@ -8031,7 +8033,10 @@ static void raid5_quiesce(struct mddev *mddev, int state)
 		wait_event_cmd(conf->wait_for_quiescent,
 				    atomic_read(&conf->active_stripes) == 0 &&
 				    atomic_read(&conf->active_aligned_reads) == 0,
-				    unlock_all_device_hash_locks_irq(conf),
+				    ({unlock_all_device_hash_locks_irq(conf);
+					if (mddev->sb_flags)
+						md_update_sb(mddev, 0);
+				    }),
 				    lock_all_device_hash_locks_irq(conf));
 		conf->quiesce = 1;
 		unlock_all_device_hash_locks_irq(conf);

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: Stuck in md_write_start because MD_SB_CHANGE_PENDING can't be cleared
  2017-09-05  1:36             ` NeilBrown
@ 2017-09-05  2:15               ` Xiao Ni
  2017-09-07  1:37                 ` Xiao Ni
  2017-09-30  9:44               ` Xiao Ni
  1 sibling, 1 reply; 14+ messages in thread
From: Xiao Ni @ 2017-09-05  2:15 UTC (permalink / raw)
  To: NeilBrown, linux-raid; +Cc: shli



On 09/05/2017 09:36 AM, NeilBrown wrote:
> On Mon, Sep 04 2017, Xiao Ni wrote:
>
>>
>> In function handle_stripe:
>> 4697         if (s.handle_bad_blocks ||
>> 4698             test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
>> 4699                 set_bit(STRIPE_HANDLE, &sh->state);
>> 4700                 goto finish;
>> 4701         }
>>
>> Because MD_SB_CHANGE_PENDING is set, so the stripes can't be handled.
>>
> Right, of course.  I see what is happening now.
>
> - raid5d cannot complete stripes until the metadata is written
> - the metadata cannot be written until raid5d gets the mddev_lock
> - mddev_lock is held by the write to suspend_hi
> - the write to suspend_hi is waiting for raid5_quiesce
> - raid5_quiesce is waiting for some stripes to complete.
>
> We could declare that ->quiesce(, 1) cannot be called while holding the
> lock.
> We could possible allow it but only if md_update_sb() is called first,
> though that might still be racy.
>
> ->quiesce(, 1) is currently called from:
>   mddev_suspend
>   suspend_lo_store
>   suspend_hi_store
>   __md_stop_writes
>   mddev_detach
>   set_bitmap_file
>   update_array_info (when setting/removing internal bitmap)
>   md_do_sync
>
> and most of those are call with the lock held, or take the lock.
>
> Maybe we should *require* that mddev_lock is held when calling
> ->quiesce() and have ->quiesce() do the metadata update.
>
> Something like the following maybe.  Can you test it?

Hi Neil

Thanks for the analysis. I need to think for a while :)
I already added the patch and the test is running now. It usually needs 
more than 5
hours to reproduce this problem. I'll let it run more than 24 hours. 
I'll update the test
result later.

Regards
Xiao

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: Stuck in md_write_start because MD_SB_CHANGE_PENDING can't be cleared
  2017-09-05  2:15               ` Xiao Ni
@ 2017-09-07  1:37                 ` Xiao Ni
  2017-09-07  5:37                   ` NeilBrown
  0 siblings, 1 reply; 14+ messages in thread
From: Xiao Ni @ 2017-09-07  1:37 UTC (permalink / raw)
  To: NeilBrown, linux-raid; +Cc: shli



----- Original Message -----
> From: "Xiao Ni" <xni@redhat.com>
> To: "NeilBrown" <neilb@suse.com>, "linux-raid" <linux-raid@vger.kernel.org>
> Cc: shli@kernel.org
> Sent: Tuesday, September 5, 2017 10:15:00 AM
> Subject: Re: Stuck in md_write_start because MD_SB_CHANGE_PENDING can't be cleared
> 
> 
> 
> On 09/05/2017 09:36 AM, NeilBrown wrote:
> > On Mon, Sep 04 2017, Xiao Ni wrote:
> >
> >>
> >> In function handle_stripe:
> >> 4697         if (s.handle_bad_blocks ||
> >> 4698             test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
> >> 4699                 set_bit(STRIPE_HANDLE, &sh->state);
> >> 4700                 goto finish;
> >> 4701         }
> >>
> >> Because MD_SB_CHANGE_PENDING is set, so the stripes can't be handled.
> >>
> > Right, of course.  I see what is happening now.
> >
> > - raid5d cannot complete stripes until the metadata is written
> > - the metadata cannot be written until raid5d gets the mddev_lock
> > - mddev_lock is held by the write to suspend_hi
> > - the write to suspend_hi is waiting for raid5_quiesce
> > - raid5_quiesce is waiting for some stripes to complete.
> >
> > We could declare that ->quiesce(, 1) cannot be called while holding the
> > lock.
> > We could possible allow it but only if md_update_sb() is called first,
> > though that might still be racy.
> >
> > ->quiesce(, 1) is currently called from:
> >   mddev_suspend
> >   suspend_lo_store
> >   suspend_hi_store
> >   __md_stop_writes
> >   mddev_detach
> >   set_bitmap_file
> >   update_array_info (when setting/removing internal bitmap)
> >   md_do_sync
> >
> > and most of those are call with the lock held, or take the lock.
> >
> > Maybe we should *require* that mddev_lock is held when calling
> > ->quiesce() and have ->quiesce() do the metadata update.
> >
> > Something like the following maybe.  Can you test it?
> 
> Hi Neil
> 
> Thanks for the analysis. I need to thing for a while :)
> I already added the patch and the test is running now. It usually needs
> more than 5
> hours to reproduce this problem. I'll let it run more than 24 hours.
> I'll update the test
> result later.

Hi Neil

The problem still exists. But it doesn't show a calltrace this time. It
was stuck yesterday. I didn't notice that because there was no calltrace.

echo file raid5.c +p > /sys/kernel/debug/dynamic_debug/control

It shows that raid5d is still spinning.

Regards
Xiao

> 
> Regards
> Xiao
> --
> To unsubscribe from this list: send the line "unsubscribe linux-raid" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: Stuck in md_write_start because MD_SB_CHANGE_PENDING can't be cleared
  2017-09-07  1:37                 ` Xiao Ni
@ 2017-09-07  5:37                   ` NeilBrown
  2017-09-11  0:14                     ` Xiao Ni
  0 siblings, 1 reply; 14+ messages in thread
From: NeilBrown @ 2017-09-07  5:37 UTC (permalink / raw)
  To: Xiao Ni, linux-raid; +Cc: shli

[-- Attachment #1: Type: text/plain, Size: 8063 bytes --]

On Wed, Sep 06 2017, Xiao Ni wrote:

> ----- Original Message -----
>> From: "Xiao Ni" <xni@redhat.com>
>> To: "NeilBrown" <neilb@suse.com>, "linux-raid" <linux-raid@vger.kernel.org>
>> Cc: shli@kernel.org
>> Sent: Tuesday, September 5, 2017 10:15:00 AM
>> Subject: Re: Stuck in md_write_start because MD_SB_CHANGE_PENDING can't be cleared
>> 
>> 
>> 
>> On 09/05/2017 09:36 AM, NeilBrown wrote:
>> > On Mon, Sep 04 2017, Xiao Ni wrote:
>> >
>> >>
>> >> In function handle_stripe:
>> >> 4697         if (s.handle_bad_blocks ||
>> >> 4698             test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
>> >> 4699                 set_bit(STRIPE_HANDLE, &sh->state);
>> >> 4700                 goto finish;
>> >> 4701         }
>> >>
>> >> Because MD_SB_CHANGE_PENDING is set, so the stripes can't be handled.
>> >>
>> > Right, of course.  I see what is happening now.
>> >
>> > - raid5d cannot complete stripes until the metadata is written
>> > - the metadata cannot be written until raid5d gets the mddev_lock
>> > - mddev_lock is held by the write to suspend_hi
>> > - the write to suspend_hi is waiting for raid5_quiesce
>> > - raid5_quiesce is waiting for some stripes to complete.
>> >
>> > We could declare that ->quiesce(, 1) cannot be called while holding the
>> > lock.
>> > We could possible allow it but only if md_update_sb() is called first,
>> > though that might still be racy.
>> >
>> > ->quiesce(, 1) is currently called from:
>> >   mddev_suspend
>> >   suspend_lo_store
>> >   suspend_hi_store
>> >   __md_stop_writes
>> >   mddev_detach
>> >   set_bitmap_file
>> >   update_array_info (when setting/removing internal bitmap)
>> >   md_do_sync
>> >
>> > and most of those are call with the lock held, or take the lock.
>> >
>> > Maybe we should *require* that mddev_lock is held when calling
>> > ->quiesce() and have ->quiesce() do the metadata update.
>> >
>> > Something like the following maybe.  Can you test it?
>> 
>> Hi Neil
>> 
>> Thanks for the analysis. I need to thing for a while :)
>> I already added the patch and the test is running now. It usually needs
>> more than 5
>> hours to reproduce this problem. I'll let it run more than 24 hours.
>> I'll update the test
>> result later.
>
> Hi Neil
>
> The problem still exists. But it doesn't show calltrace this time. It
> was stuck yesterday. I didn't notice that because there has no calltrace.
>
> echo file raid5.c +p > /sys/kernel/debug/dynamic_debug/control
>
> It shows that raid5d is still spinning.

Thanks for testing. I've thought some more and I think there is a better
approach.
The fact that we need to take the mutex to write the super block has
caused problems several times before and is a key part of the problem
now.
Maybe we should relax that.  Obviously we don't want two threads
updating the metadata at the same time, but it should be safe to
update it in parallel with other uses of reconfig_mutex.

Holding mddev->lock while copying data from the struct mddev to the
superblock (which we do) should ensure that the superblock is internally
consistent, and that should be enough.

So I propose the following patch.  It certainly needs review and
testing, but I think it should make a big improvement.

Thanks,
NeilBrown


diff --git a/drivers/md/md.c b/drivers/md/md.c
index b01e458d31e9..414a4c1a052d 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2388,6 +2388,15 @@ void md_update_sb(struct mddev *mddev, int force_change)
 		return;
 	}
 
+	if (!force_change && !(mddev->sb_flags & ~BIT(MD_SB_UPDATE_ACTIVE)))
+		return;
+
+	wait_event(mddev->sb_wait,
+		   !test_and_set_bit(MD_SB_UPDATE_ACTIVE, &mddev->sb_flags));
+
+	if (!force_change && !(mddev->sb_flags & ~BIT(MD_SB_UPDATE_ACTIVE)))
+		goto out;
+
 repeat:
 	if (mddev_is_clustered(mddev)) {
 		if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
@@ -2402,7 +2411,7 @@ void md_update_sb(struct mddev *mddev, int force_change)
 			bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
 							 BIT(MD_SB_CHANGE_DEVS) |
 							 BIT(MD_SB_CHANGE_CLEAN));
-			return;
+			goto out;
 		}
 	}
 
@@ -2432,8 +2441,7 @@ void md_update_sb(struct mddev *mddev, int force_change)
 				wake_up(&rdev->blocked_wait);
 			}
 		}
-		wake_up(&mddev->sb_wait);
-		return;
+		goto out;
 	}
 
 	spin_lock(&mddev->lock);
@@ -2544,6 +2552,9 @@ void md_update_sb(struct mddev *mddev, int force_change)
 			       BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN)))
 		/* have to write it out again */
 		goto repeat;
+
+out:
+	clear_bit_unlock(MD_SB_UPDATE_ACTIVE, &mddev->sb_flags);
 	wake_up(&mddev->sb_wait);
 	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
 		sysfs_notify(&mddev->kobj, NULL, "sync_completed");
@@ -5606,8 +5617,7 @@ int md_run(struct mddev *mddev)
 		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 
-	if (mddev->sb_flags)
-		md_update_sb(mddev, 0);
+	md_update_sb(mddev, 0);
 
 	md_new_event(mddev);
 	sysfs_notify_dirent_safe(mddev->sysfs_state);
@@ -8643,17 +8653,14 @@ void md_check_recovery(struct mddev *mddev)
 
 	if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
 		return;
-	if ( ! (
-		(mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) ||
+	if ((
 		test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
 		test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
 		(mddev->external == 0 && mddev->safemode == 1) ||
 		(mddev->safemode == 2
 		 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
-		))
-		return;
-
-	if (mddev_trylock(mddev)) {
+		     ) &&
+	    mddev_trylock(mddev)) {
 		int spares = 0;
 
 		if (!mddev->external && mddev->safemode == 1)
@@ -8706,9 +8713,6 @@ void md_check_recovery(struct mddev *mddev)
 			spin_unlock(&mddev->lock);
 		}
 
-		if (mddev->sb_flags)
-			md_update_sb(mddev, 0);
-
 		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
 		    !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
 			/* resync/recovery still happening */
@@ -8786,6 +8790,7 @@ void md_check_recovery(struct mddev *mddev)
 		wake_up(&mddev->sb_wait);
 		mddev_unlock(mddev);
 	}
+	md_update_sb(mddev, 0);
 }
 EXPORT_SYMBOL(md_check_recovery);
 
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 09db03455801..bc8633cf7c40 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -243,6 +243,7 @@ enum mddev_sb_flags {
 	MD_SB_CHANGE_CLEAN,	/* transition to or from 'clean' */
 	MD_SB_CHANGE_PENDING,	/* switch from 'clean' to 'active' in progress */
 	MD_SB_NEED_REWRITE,	/* metadata write needs to be repeated */
+	MD_SB_UPDATE_ACTIVE,	/* A thread is running in md_update_sb */
 };
 
 struct mddev {
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 2dcbafa8e66c..76169dd8ff7c 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -1334,21 +1334,10 @@ static void r5l_write_super_and_discard_space(struct r5l_log *log,
 	mddev = log->rdev->mddev;
 	/*
 	 * Discard could zero data, so before discard we must make sure
-	 * superblock is updated to new log tail. Updating superblock (either
-	 * directly call md_update_sb() or depend on md thread) must hold
-	 * reconfig mutex. On the other hand, raid5_quiesce is called with
-	 * reconfig_mutex hold. The first step of raid5_quiesce() is waitting
-	 * for all IO finish, hence waitting for reclaim thread, while reclaim
-	 * thread is calling this function and waitting for reconfig mutex. So
-	 * there is a deadlock. We workaround this issue with a trylock.
-	 * FIXME: we could miss discard if we can't take reconfig mutex
+	 * superblock is updated to new log tail.
 	 */
-	set_mask_bits(&mddev->sb_flags, 0,
-		BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
-	if (!mddev_trylock(mddev))
-		return;
+
 	md_update_sb(mddev, 1);
-	mddev_unlock(mddev);
 
 	/* discard IO error really doesn't matter, ignore it */
 	if (log->last_checkpoint < end) {

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: Stuck in md_write_start because MD_SB_CHANGE_PENDING can't be cleared
  2017-09-07  5:37                   ` NeilBrown
@ 2017-09-11  0:14                     ` Xiao Ni
  2017-09-11  3:36                       ` NeilBrown
  0 siblings, 1 reply; 14+ messages in thread
From: Xiao Ni @ 2017-09-11  0:14 UTC (permalink / raw)
  To: NeilBrown; +Cc: linux-raid, shli



----- Original Message -----
> From: "NeilBrown" <neilb@suse.com>
> To: "Xiao Ni" <xni@redhat.com>, "linux-raid" <linux-raid@vger.kernel.org>
> Cc: shli@kernel.org
> Sent: Thursday, September 7, 2017 1:37:45 PM
> Subject: Re: Stuck in md_write_start because MD_SB_CHANGE_PENDING can't be cleared
> 
> On Wed, Sep 06 2017, Xiao Ni wrote:
> 
> > ----- Original Message -----
> >> From: "Xiao Ni" <xni@redhat.com>
> >> To: "NeilBrown" <neilb@suse.com>, "linux-raid"
> >> <linux-raid@vger.kernel.org>
> >> Cc: shli@kernel.org
> >> Sent: Tuesday, September 5, 2017 10:15:00 AM
> >> Subject: Re: Stuck in md_write_start because MD_SB_CHANGE_PENDING can't be
> >> cleared
> >> 
> >> 
> >> 
> >> On 09/05/2017 09:36 AM, NeilBrown wrote:
> >> > On Mon, Sep 04 2017, Xiao Ni wrote:
> >> >
> >> >>
> >> >> In function handle_stripe:
> >> >> 4697         if (s.handle_bad_blocks ||
> >> >> 4698             test_bit(MD_SB_CHANGE_PENDING,
> >> >> &conf->mddev->sb_flags)) {
> >> >> 4699                 set_bit(STRIPE_HANDLE, &sh->state);
> >> >> 4700                 goto finish;
> >> >> 4701         }
> >> >>
> >> >> Because MD_SB_CHANGE_PENDING is set, so the stripes can't be handled.
> >> >>
> >> > Right, of course.  I see what is happening now.
> >> >
> >> > - raid5d cannot complete stripes until the metadata is written
> >> > - the metadata cannot be written until raid5d gets the mddev_lock
> >> > - mddev_lock is held by the write to suspend_hi
> >> > - the write to suspend_hi is waiting for raid5_quiesce
> >> > - raid5_quiesce is waiting for some stripes to complete.
> >> >
> >> > We could declare that ->quiesce(, 1) cannot be called while holding the
> >> > lock.
> >> > We could possible allow it but only if md_update_sb() is called first,
> >> > though that might still be racy.
> >> >
> >> > ->quiesce(, 1) is currently called from:
> >> >   mddev_suspend
> >> >   suspend_lo_store
> >> >   suspend_hi_store
> >> >   __md_stop_writes
> >> >   mddev_detach
> >> >   set_bitmap_file
> >> >   update_array_info (when setting/removing internal bitmap)
> >> >   md_do_sync
> >> >
> >> > and most of those are call with the lock held, or take the lock.
> >> >
> >> > Maybe we should *require* that mddev_lock is held when calling
> >> > ->quiesce() and have ->quiesce() do the metadata update.
> >> >
> >> > Something like the following maybe.  Can you test it?
> >> 
> >> Hi Neil
> >> 
> >> Thanks for the analysis. I need to thing for a while :)
> >> I already added the patch and the test is running now. It usually needs
> >> more than 5
> >> hours to reproduce this problem. I'll let it run more than 24 hours.
> >> I'll update the test
> >> result later.
> >
> > Hi Neil
> >
> > The problem still exists. But it doesn't show calltrace this time. It
> > was stuck yesterday. I didn't notice that because there has no calltrace.
> >
> > echo file raid5.c +p > /sys/kernel/debug/dynamic_debug/control
> >
> > It shows that raid5d is still spinning.
> 
> Thanks for testing. I've thought some more and I think there is a better
> approach.
> The fact that we need to take the mutex to write the super block has
> caused problems several times before and is a key part of the problem
> now.
> Maybe we should relax that.  Obviously we don't want two threads
> updating the metadata at the same time, but it should be safe to
> update it in parallel with other uses of reconfix_mutex.
> 
> Holding mddev->lock while copying data from the struct mddev to the
> superblock (which we do) should ensure that the superblock is internally
> consistent, and that should be enough.
> 
> So I propose the following patch.  It certainly needs review and
> testing, but I think it should make a big improvement.
> 
> Thanks,
> NeilBrown
> 
> 
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index b01e458d31e9..414a4c1a052d 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -2388,6 +2388,15 @@ void md_update_sb(struct mddev *mddev, int
> force_change)
>  		return;
>  	}
>  
> +	if (!force_change && !(mddev->sb_flags & ~BIT(MD_SB_UPDATE_ACTIVE)))
> +		return;
> +
> +	wait_event(mddev->sb_wait,
> +		   !test_and_set_bit(MD_SB_UPDATE_ACTIVE, &mddev->sb_flags));
> +
> +	if (!force_change && !(mddev->sb_flags & ~BIT(MD_SB_UPDATE_ACTIVE)))
> +		goto out;
> +
>  repeat:
>  	if (mddev_is_clustered(mddev)) {
>  		if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
> @@ -2402,7 +2411,7 @@ void md_update_sb(struct mddev *mddev, int
> force_change)
>  			bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
>  							 BIT(MD_SB_CHANGE_DEVS) |
>  							 BIT(MD_SB_CHANGE_CLEAN));
> -			return;
> +			goto out;
>  		}
>  	}
>  
> @@ -2432,8 +2441,7 @@ void md_update_sb(struct mddev *mddev, int
> force_change)
>  				wake_up(&rdev->blocked_wait);
>  			}
>  		}
> -		wake_up(&mddev->sb_wait);
> -		return;
> +		goto out;
>  	}
>  
>  	spin_lock(&mddev->lock);
> @@ -2544,6 +2552,9 @@ void md_update_sb(struct mddev *mddev, int
> force_change)
>  			       BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN)))
>  		/* have to write it out again */
>  		goto repeat;
> +
> +out:
> +	clear_bit_unlock(MD_SB_UPDATE_ACTIVE, &mddev->sb_flags);
>  	wake_up(&mddev->sb_wait);
>  	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
>  		sysfs_notify(&mddev->kobj, NULL, "sync_completed");
> @@ -5606,8 +5617,7 @@ int md_run(struct mddev *mddev)
>  		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
>  	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
>  
> -	if (mddev->sb_flags)
> -		md_update_sb(mddev, 0);
> +	md_update_sb(mddev, 0);
>  
>  	md_new_event(mddev);
>  	sysfs_notify_dirent_safe(mddev->sysfs_state);
> @@ -8643,17 +8653,14 @@ void md_check_recovery(struct mddev *mddev)
>  
>  	if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
>  		return;
> -	if ( ! (
> -		(mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) ||
> +	if ((
>  		test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
>  		test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
>  		(mddev->external == 0 && mddev->safemode == 1) ||
>  		(mddev->safemode == 2
>  		 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
> -		))
> -		return;
> -
> -	if (mddev_trylock(mddev)) {
> +		     ) &&
> +	    mddev_trylock(mddev)) {
>  		int spares = 0;
>  
>  		if (!mddev->external && mddev->safemode == 1)
> @@ -8706,9 +8713,6 @@ void md_check_recovery(struct mddev *mddev)
>  			spin_unlock(&mddev->lock);
>  		}
>  
> -		if (mddev->sb_flags)
> -			md_update_sb(mddev, 0);
> -
>  		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
>  		    !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
>  			/* resync/recovery still happening */
> @@ -8786,6 +8790,7 @@ void md_check_recovery(struct mddev *mddev)
>  		wake_up(&mddev->sb_wait);
>  		mddev_unlock(mddev);
>  	}
> +	md_update_sb(mddev, 0);
>  }
>  EXPORT_SYMBOL(md_check_recovery);
>  
> diff --git a/drivers/md/md.h b/drivers/md/md.h
> index 09db03455801..bc8633cf7c40 100644
> --- a/drivers/md/md.h
> +++ b/drivers/md/md.h
> @@ -243,6 +243,7 @@ enum mddev_sb_flags {
>  	MD_SB_CHANGE_CLEAN,	/* transition to or from 'clean' */
>  	MD_SB_CHANGE_PENDING,	/* switch from 'clean' to 'active' in progress */
>  	MD_SB_NEED_REWRITE,	/* metadata write needs to be repeated */
> +	MD_SB_UPDATE_ACTIVE,	/* A thread is running in md_update_sb */
>  };
>  
>  struct mddev {
> diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
> index 2dcbafa8e66c..76169dd8ff7c 100644
> --- a/drivers/md/raid5-cache.c
> +++ b/drivers/md/raid5-cache.c
> @@ -1334,21 +1334,10 @@ static void r5l_write_super_and_discard_space(struct
> r5l_log *log,
>  	mddev = log->rdev->mddev;
>  	/*
>  	 * Discard could zero data, so before discard we must make sure
> -	 * superblock is updated to new log tail. Updating superblock (either
> -	 * directly call md_update_sb() or depend on md thread) must hold
> -	 * reconfig mutex. On the other hand, raid5_quiesce is called with
> -	 * reconfig_mutex hold. The first step of raid5_quiesce() is waitting
> -	 * for all IO finish, hence waitting for reclaim thread, while reclaim
> -	 * thread is calling this function and waitting for reconfig mutex. So
> -	 * there is a deadlock. We workaround this issue with a trylock.
> -	 * FIXME: we could miss discard if we can't take reconfig mutex
> +	 * superblock is updated to new log tail.
>  	 */
> -	set_mask_bits(&mddev->sb_flags, 0,
> -		BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
> -	if (!mddev_trylock(mddev))
> -		return;
> +
>  	md_update_sb(mddev, 1);
> -	mddev_unlock(mddev);
>  
>  	/* discard IO error really doesn't matter, ignore it */
>  	if (log->last_checkpoint < end) {
> 

Hi Neil

The test has run for three days and the problem is fixed by this patch. 
Thanks for the help.

Could you help to look at https://www.spinics.net/lists/raid/msg58918.html.
The bug which is fixed by your patch was found when I try to reproduce that
bug. I did a simple analysis, but I'm not sure whether I'm right or not.

Regards
Xiao

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: Stuck in md_write_start because MD_SB_CHANGE_PENDING can't be cleared
  2017-09-11  0:14                     ` Xiao Ni
@ 2017-09-11  3:36                       ` NeilBrown
  2017-09-11  5:03                         ` Xiao Ni
  0 siblings, 1 reply; 14+ messages in thread
From: NeilBrown @ 2017-09-11  3:36 UTC (permalink / raw)
  To: Xiao Ni; +Cc: linux-raid, shli

[-- Attachment #1: Type: text/plain, Size: 9886 bytes --]

On Sun, Sep 10 2017, Xiao Ni wrote:

> ----- Original Message -----
>> From: "NeilBrown" <neilb@suse.com>
>> To: "Xiao Ni" <xni@redhat.com>, "linux-raid" <linux-raid@vger.kernel.org>
>> Cc: shli@kernel.org
>> Sent: Thursday, September 7, 2017 1:37:45 PM
>> Subject: Re: Stuck in md_write_start because MD_SB_CHANGE_PENDING can't be cleared
>> 
>> On Wed, Sep 06 2017, Xiao Ni wrote:
>> 
>> > ----- Original Message -----
>> >> From: "Xiao Ni" <xni@redhat.com>
>> >> To: "NeilBrown" <neilb@suse.com>, "linux-raid"
>> >> <linux-raid@vger.kernel.org>
>> >> Cc: shli@kernel.org
>> >> Sent: Tuesday, September 5, 2017 10:15:00 AM
>> >> Subject: Re: Stuck in md_write_start because MD_SB_CHANGE_PENDING can't be
>> >> cleared
>> >> 
>> >> 
>> >> 
>> >> On 09/05/2017 09:36 AM, NeilBrown wrote:
>> >> > On Mon, Sep 04 2017, Xiao Ni wrote:
>> >> >
>> >> >>
>> >> >> In function handle_stripe:
>> >> >> 4697         if (s.handle_bad_blocks ||
>> >> >> 4698             test_bit(MD_SB_CHANGE_PENDING,
>> >> >> &conf->mddev->sb_flags)) {
>> >> >> 4699                 set_bit(STRIPE_HANDLE, &sh->state);
>> >> >> 4700                 goto finish;
>> >> >> 4701         }
>> >> >>
>> >> >> Because MD_SB_CHANGE_PENDING is set, so the stripes can't be handled.
>> >> >>
>> >> > Right, of course.  I see what is happening now.
>> >> >
>> >> > - raid5d cannot complete stripes until the metadata is written
>> >> > - the metadata cannot be written until raid5d gets the mddev_lock
>> >> > - mddev_lock is held by the write to suspend_hi
>> >> > - the write to suspend_hi is waiting for raid5_quiesce
>> >> > - raid5_quiesce is waiting for some stripes to complete.
>> >> >
>> >> > We could declare that ->quiesce(, 1) cannot be called while holding the
>> >> > lock.
>> >> > We could possible allow it but only if md_update_sb() is called first,
>> >> > though that might still be racy.
>> >> >
>> >> > ->quiesce(, 1) is currently called from:
>> >> >   mddev_suspend
>> >> >   suspend_lo_store
>> >> >   suspend_hi_store
>> >> >   __md_stop_writes
>> >> >   mddev_detach
>> >> >   set_bitmap_file
>> >> >   update_array_info (when setting/removing internal bitmap)
>> >> >   md_do_sync
>> >> >
>> >> > and most of those are call with the lock held, or take the lock.
>> >> >
>> >> > Maybe we should *require* that mddev_lock is held when calling
>> >> > ->quiesce() and have ->quiesce() do the metadata update.
>> >> >
>> >> > Something like the following maybe.  Can you test it?
>> >> 
>> >> Hi Neil
>> >> 
>> >> Thanks for the analysis. I need to thing for a while :)
>> >> I already added the patch and the test is running now. It usually needs
>> >> more than 5
>> >> hours to reproduce this problem. I'll let it run more than 24 hours.
>> >> I'll update the test
>> >> result later.
>> >
>> > Hi Neil
>> >
>> > The problem still exists. But it doesn't show calltrace this time. It
>> > was stuck yesterday. I didn't notice that because there has no calltrace.
>> >
>> > echo file raid5.c +p > /sys/kernel/debug/dynamic_debug/control
>> >
>> > It shows that raid5d is still spinning.
>> 
>> Thanks for testing. I've thought some more and I think there is a better
>> approach.
>> The fact that we need to take the mutex to write the super block has
>> caused problems several times before and is a key part of the problem
>> now.
>> Maybe we should relax that.  Obviously we don't want two threads
>> updating the metadata at the same time, but it should be safe to
>> update it in parallel with other uses of reconfix_mutex.
>> 
>> Holding mddev->lock while copying data from the struct mddev to the
>> superblock (which we do) should ensure that the superblock is internally
>> consistent, and that should be enough.
>> 
>> So I propose the following patch.  It certainly needs review and
>> testing, but I think it should make a big improvement.
>> 
>> Thanks,
>> NeilBrown
>> 
>> 
>> diff --git a/drivers/md/md.c b/drivers/md/md.c
>> index b01e458d31e9..414a4c1a052d 100644
>> --- a/drivers/md/md.c
>> +++ b/drivers/md/md.c
>> @@ -2388,6 +2388,15 @@ void md_update_sb(struct mddev *mddev, int
>> force_change)
>>  		return;
>>  	}
>>  
>> +	if (!force_change && !(mddev->sb_flags & ~BIT(MD_SB_UPDATE_ACTIVE)))
>> +		return;
>> +
>> +	wait_event(mddev->sb_wait,
>> +		   !test_and_set_bit(MD_SB_UPDATE_ACTIVE, &mddev->sb_flags));
>> +
>> +	if (!force_change && !(mddev->sb_flags & ~BIT(MD_SB_UPDATE_ACTIVE)))
>> +		goto out;
>> +
>>  repeat:
>>  	if (mddev_is_clustered(mddev)) {
>>  		if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
>> @@ -2402,7 +2411,7 @@ void md_update_sb(struct mddev *mddev, int
>> force_change)
>>  			bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
>>  							 BIT(MD_SB_CHANGE_DEVS) |
>>  							 BIT(MD_SB_CHANGE_CLEAN));
>> -			return;
>> +			goto out;
>>  		}
>>  	}
>>  
>> @@ -2432,8 +2441,7 @@ void md_update_sb(struct mddev *mddev, int
>> force_change)
>>  				wake_up(&rdev->blocked_wait);
>>  			}
>>  		}
>> -		wake_up(&mddev->sb_wait);
>> -		return;
>> +		goto out;
>>  	}
>>  
>>  	spin_lock(&mddev->lock);
>> @@ -2544,6 +2552,9 @@ void md_update_sb(struct mddev *mddev, int
>> force_change)
>>  			       BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN)))
>>  		/* have to write it out again */
>>  		goto repeat;
>> +
>> +out:
>> +	clear_bit_unlock(MD_SB_UPDATE_ACTIVE, &mddev->sb_flags);
>>  	wake_up(&mddev->sb_wait);
>>  	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
>>  		sysfs_notify(&mddev->kobj, NULL, "sync_completed");
>> @@ -5606,8 +5617,7 @@ int md_run(struct mddev *mddev)
>>  		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
>>  	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
>>  
>> -	if (mddev->sb_flags)
>> -		md_update_sb(mddev, 0);
>> +	md_update_sb(mddev, 0);
>>  
>>  	md_new_event(mddev);
>>  	sysfs_notify_dirent_safe(mddev->sysfs_state);
>> @@ -8643,17 +8653,14 @@ void md_check_recovery(struct mddev *mddev)
>>  
>>  	if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
>>  		return;
>> -	if ( ! (
>> -		(mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) ||
>> +	if ((
>>  		test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
>>  		test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
>>  		(mddev->external == 0 && mddev->safemode == 1) ||
>>  		(mddev->safemode == 2
>>  		 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
>> -		))
>> -		return;
>> -
>> -	if (mddev_trylock(mddev)) {
>> +		     ) &&
>> +	    mddev_trylock(mddev)) {
>>  		int spares = 0;
>>  
>>  		if (!mddev->external && mddev->safemode == 1)
>> @@ -8706,9 +8713,6 @@ void md_check_recovery(struct mddev *mddev)
>>  			spin_unlock(&mddev->lock);
>>  		}
>>  
>> -		if (mddev->sb_flags)
>> -			md_update_sb(mddev, 0);
>> -
>>  		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
>>  		    !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
>>  			/* resync/recovery still happening */
>> @@ -8786,6 +8790,7 @@ void md_check_recovery(struct mddev *mddev)
>>  		wake_up(&mddev->sb_wait);
>>  		mddev_unlock(mddev);
>>  	}
>> +	md_update_sb(mddev, 0);
>>  }
>>  EXPORT_SYMBOL(md_check_recovery);
>>  
>> diff --git a/drivers/md/md.h b/drivers/md/md.h
>> index 09db03455801..bc8633cf7c40 100644
>> --- a/drivers/md/md.h
>> +++ b/drivers/md/md.h
>> @@ -243,6 +243,7 @@ enum mddev_sb_flags {
>>  	MD_SB_CHANGE_CLEAN,	/* transition to or from 'clean' */
>>  	MD_SB_CHANGE_PENDING,	/* switch from 'clean' to 'active' in progress */
>>  	MD_SB_NEED_REWRITE,	/* metadata write needs to be repeated */
>> +	MD_SB_UPDATE_ACTIVE,	/* A thread is running in md_update_sb */
>>  };
>>  
>>  struct mddev {
>> diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
>> index 2dcbafa8e66c..76169dd8ff7c 100644
>> --- a/drivers/md/raid5-cache.c
>> +++ b/drivers/md/raid5-cache.c
>> @@ -1334,21 +1334,10 @@ static void r5l_write_super_and_discard_space(struct
>> r5l_log *log,
>>  	mddev = log->rdev->mddev;
>>  	/*
>>  	 * Discard could zero data, so before discard we must make sure
>> -	 * superblock is updated to new log tail. Updating superblock (either
>> -	 * directly call md_update_sb() or depend on md thread) must hold
>> -	 * reconfig mutex. On the other hand, raid5_quiesce is called with
>> -	 * reconfig_mutex hold. The first step of raid5_quiesce() is waitting
>> -	 * for all IO finish, hence waitting for reclaim thread, while reclaim
>> -	 * thread is calling this function and waitting for reconfig mutex. So
>> -	 * there is a deadlock. We workaround this issue with a trylock.
>> -	 * FIXME: we could miss discard if we can't take reconfig mutex
>> +	 * superblock is updated to new log tail.
>>  	 */
>> -	set_mask_bits(&mddev->sb_flags, 0,
>> -		BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
>> -	if (!mddev_trylock(mddev))
>> -		return;
>> +
>>  	md_update_sb(mddev, 1);
>> -	mddev_unlock(mddev);
>>  
>>  	/* discard IO error really doesn't matter, ignore it */
>>  	if (log->last_checkpoint < end) {
>> 
>
> Hi Neil
>
> The test have run for three days and the problem is fixed by this patch. 
> Thanks for the help.

Thanks for testing.  I'll look over the patch again and see if there is
any chance that the locking change could introduce other problems.

>
> Could you help to look at https://www.spinics.net/lists/raid/msg58918.html.
> The bug which is fixed by your patch was found when I try to reproduce that
> bug. I did a simply analysis, but I'm not sure whether I'm right or not.

It might be the same bug, but if it is there should be other processes
stuck in a D wait, one of them holding the reconfig_mutex and waiting
for the array to quiesce.

Were there any other processes in D wait?

NeilBrown

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: Stuck in md_write_start because MD_SB_CHANGE_PENDING can't be cleared
  2017-09-11  3:36                       ` NeilBrown
@ 2017-09-11  5:03                         ` Xiao Ni
  0 siblings, 0 replies; 14+ messages in thread
From: Xiao Ni @ 2017-09-11  5:03 UTC (permalink / raw)
  To: NeilBrown; +Cc: linux-raid, shli



----- Original Message -----
> From: "NeilBrown" <neilb@suse.com>
> To: "Xiao Ni" <xni@redhat.com>
> Cc: "linux-raid" <linux-raid@vger.kernel.org>, shli@kernel.org
> Sent: Monday, September 11, 2017 11:36:25 AM
> Subject: Re: Stuck in md_write_start because MD_SB_CHANGE_PENDING can't be cleared
> 
> On Sun, Sep 10 2017, Xiao Ni wrote:
> 
> > ----- Original Message -----
> >> From: "NeilBrown" <neilb@suse.com>
> >> To: "Xiao Ni" <xni@redhat.com>, "linux-raid" <linux-raid@vger.kernel.org>
> >> Cc: shli@kernel.org
> >> Sent: Thursday, September 7, 2017 1:37:45 PM
> >> Subject: Re: Stuck in md_write_start because MD_SB_CHANGE_PENDING can't be
> >> cleared
> >> 
> >> On Wed, Sep 06 2017, Xiao Ni wrote:
> >> 
> >> > ----- Original Message -----
> >> >> From: "Xiao Ni" <xni@redhat.com>
> >> >> To: "NeilBrown" <neilb@suse.com>, "linux-raid"
> >> >> <linux-raid@vger.kernel.org>
> >> >> Cc: shli@kernel.org
> >> >> Sent: Tuesday, September 5, 2017 10:15:00 AM
> >> >> Subject: Re: Stuck in md_write_start because MD_SB_CHANGE_PENDING can't
> >> >> be
> >> >> cleared
> >> >> 
> >> >> 
> >> >> 
> >> >> On 09/05/2017 09:36 AM, NeilBrown wrote:
> >> >> > On Mon, Sep 04 2017, Xiao Ni wrote:
> >> >> >
> >> >> >>
> >> >> >> In function handle_stripe:
> >> >> >> 4697         if (s.handle_bad_blocks ||
> >> >> >> 4698             test_bit(MD_SB_CHANGE_PENDING,
> >> >> >> &conf->mddev->sb_flags)) {
> >> >> >> 4699                 set_bit(STRIPE_HANDLE, &sh->state);
> >> >> >> 4700                 goto finish;
> >> >> >> 4701         }
> >> >> >>
> >> >> >> Because MD_SB_CHANGE_PENDING is set, so the stripes can't be
> >> >> >> handled.
> >> >> >>
> >> >> > Right, of course.  I see what is happening now.
> >> >> >
> >> >> > - raid5d cannot complete stripes until the metadata is written
> >> >> > - the metadata cannot be written until raid5d gets the mddev_lock
> >> >> > - mddev_lock is held by the write to suspend_hi
> >> >> > - the write to suspend_hi is waiting for raid5_quiesce
> >> >> > - raid5_quiesce is waiting for some stripes to complete.
> >> >> >
> >> >> > We could declare that ->quiesce(, 1) cannot be called while holding
> >> >> > the
> >> >> > lock.
> >> >> > We could possible allow it but only if md_update_sb() is called
> >> >> > first,
> >> >> > though that might still be racy.
> >> >> >
> >> >> > ->quiesce(, 1) is currently called from:
> >> >> >   mddev_suspend
> >> >> >   suspend_lo_store
> >> >> >   suspend_hi_store
> >> >> >   __md_stop_writes
> >> >> >   mddev_detach
> >> >> >   set_bitmap_file
> >> >> >   update_array_info (when setting/removing internal bitmap)
> >> >> >   md_do_sync
> >> >> >
> >> >> > and most of those are call with the lock held, or take the lock.
> >> >> >
> >> >> > Maybe we should *require* that mddev_lock is held when calling
> >> >> > ->quiesce() and have ->quiesce() do the metadata update.
> >> >> >
> >> >> > Something like the following maybe.  Can you test it?
> >> >> 
> >> >> Hi Neil
> >> >> 
> >> Thanks for the analysis. I need to think for a while :)
> >> >> I already added the patch and the test is running now. It usually needs
> >> >> more than 5
> >> >> hours to reproduce this problem. I'll let it run more than 24 hours.
> >> >> I'll update the test
> >> >> result later.
> >> >
> >> > Hi Neil
> >> >
> >> > The problem still exists. But it doesn't show calltrace this time. It
> >> > was stuck yesterday. I didn't notice that because there has no
> >> > calltrace.
> >> >
> >> > echo file raid5.c +p > /sys/kernel/debug/dynamic_debug/control
> >> >
> >> > It shows that raid5d is still spinning.
> >> 
> >> Thanks for testing. I've thought some more and I think there is a better
> >> approach.
> >> The fact that we need to take the mutex to write the super block has
> >> caused problems several times before and is a key part of the problem
> >> now.
> >> Maybe we should relax that.  Obviously we don't want two threads
> >> updating the metadata at the same time, but it should be safe to
> >> update it in parallel with other uses of reconfig_mutex.
> >> 
> >> Holding mddev->lock while copying data from the struct mddev to the
> >> superblock (which we do) should ensure that the superblock is internally
> >> consistent, and that should be enough.
> >> 
> >> So I propose the following patch.  It certainly needs review and
> >> testing, but I think it should make a big improvement.
> >> 
> >> Thanks,
> >> NeilBrown
> >> 
> >> 
> >> diff --git a/drivers/md/md.c b/drivers/md/md.c
> >> index b01e458d31e9..414a4c1a052d 100644
> >> --- a/drivers/md/md.c
> >> +++ b/drivers/md/md.c
> >> @@ -2388,6 +2388,15 @@ void md_update_sb(struct mddev *mddev, int
> >> force_change)
> >>  		return;
> >>  	}
> >>  
> >> +	if (!force_change && !(mddev->sb_flags & ~BIT(MD_SB_UPDATE_ACTIVE)))
> >> +		return;
> >> +
> >> +	wait_event(mddev->sb_wait,
> >> +		   !test_and_set_bit(MD_SB_UPDATE_ACTIVE, &mddev->sb_flags));
> >> +
> >> +	if (!force_change && !(mddev->sb_flags & ~BIT(MD_SB_UPDATE_ACTIVE)))
> >> +		goto out;
> >> +
> >>  repeat:
> >>  	if (mddev_is_clustered(mddev)) {
> >>  		if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
> >> @@ -2402,7 +2411,7 @@ void md_update_sb(struct mddev *mddev, int
> >> force_change)
> >>  			bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
> >>  							 BIT(MD_SB_CHANGE_DEVS) |
> >>  							 BIT(MD_SB_CHANGE_CLEAN));
> >> -			return;
> >> +			goto out;
> >>  		}
> >>  	}
> >>  
> >> @@ -2432,8 +2441,7 @@ void md_update_sb(struct mddev *mddev, int
> >> force_change)
> >>  				wake_up(&rdev->blocked_wait);
> >>  			}
> >>  		}
> >> -		wake_up(&mddev->sb_wait);
> >> -		return;
> >> +		goto out;
> >>  	}
> >>  
> >>  	spin_lock(&mddev->lock);
> >> @@ -2544,6 +2552,9 @@ void md_update_sb(struct mddev *mddev, int
> >> force_change)
> >>  			       BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN)))
> >>  		/* have to write it out again */
> >>  		goto repeat;
> >> +
> >> +out:
> >> +	clear_bit_unlock(MD_SB_UPDATE_ACTIVE, &mddev->sb_flags);
> >>  	wake_up(&mddev->sb_wait);
> >>  	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
> >>  		sysfs_notify(&mddev->kobj, NULL, "sync_completed");
> >> @@ -5606,8 +5617,7 @@ int md_run(struct mddev *mddev)
> >>  		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
> >>  	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
> >>  
> >> -	if (mddev->sb_flags)
> >> -		md_update_sb(mddev, 0);
> >> +	md_update_sb(mddev, 0);
> >>  
> >>  	md_new_event(mddev);
> >>  	sysfs_notify_dirent_safe(mddev->sysfs_state);
> >> @@ -8643,17 +8653,14 @@ void md_check_recovery(struct mddev *mddev)
> >>  
> >>  	if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
> >>  		return;
> >> -	if ( ! (
> >> -		(mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) ||
> >> +	if ((
> >>  		test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
> >>  		test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
> >>  		(mddev->external == 0 && mddev->safemode == 1) ||
> >>  		(mddev->safemode == 2
> >>  		 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
> >> -		))
> >> -		return;
> >> -
> >> -	if (mddev_trylock(mddev)) {
> >> +		     ) &&
> >> +	    mddev_trylock(mddev)) {
> >>  		int spares = 0;
> >>  
> >>  		if (!mddev->external && mddev->safemode == 1)
> >> @@ -8706,9 +8713,6 @@ void md_check_recovery(struct mddev *mddev)
> >>  			spin_unlock(&mddev->lock);
> >>  		}
> >>  
> >> -		if (mddev->sb_flags)
> >> -			md_update_sb(mddev, 0);
> >> -
> >>  		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
> >>  		    !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
> >>  			/* resync/recovery still happening */
> >> @@ -8786,6 +8790,7 @@ void md_check_recovery(struct mddev *mddev)
> >>  		wake_up(&mddev->sb_wait);
> >>  		mddev_unlock(mddev);
> >>  	}
> >> +	md_update_sb(mddev, 0);
> >>  }
> >>  EXPORT_SYMBOL(md_check_recovery);
> >>  
> >> diff --git a/drivers/md/md.h b/drivers/md/md.h
> >> index 09db03455801..bc8633cf7c40 100644
> >> --- a/drivers/md/md.h
> >> +++ b/drivers/md/md.h
> >> @@ -243,6 +243,7 @@ enum mddev_sb_flags {
> >>  	MD_SB_CHANGE_CLEAN,	/* transition to or from 'clean' */
> >>  	MD_SB_CHANGE_PENDING,	/* switch from 'clean' to 'active' in progress */
> >>  	MD_SB_NEED_REWRITE,	/* metadata write needs to be repeated */
> >> +	MD_SB_UPDATE_ACTIVE,	/* A thread is running in md_update_sb */
> >>  };
> >>  
> >>  struct mddev {
> >> diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
> >> index 2dcbafa8e66c..76169dd8ff7c 100644
> >> --- a/drivers/md/raid5-cache.c
> >> +++ b/drivers/md/raid5-cache.c
> >> @@ -1334,21 +1334,10 @@ static void
> >> r5l_write_super_and_discard_space(struct
> >> r5l_log *log,
> >>  	mddev = log->rdev->mddev;
> >>  	/*
> >>  	 * Discard could zero data, so before discard we must make sure
> >> -	 * superblock is updated to new log tail. Updating superblock (either
> >> -	 * directly call md_update_sb() or depend on md thread) must hold
> >> -	 * reconfig mutex. On the other hand, raid5_quiesce is called with
> >> -	 * reconfig_mutex hold. The first step of raid5_quiesce() is waitting
> >> -	 * for all IO finish, hence waitting for reclaim thread, while reclaim
> >> -	 * thread is calling this function and waitting for reconfig mutex. So
> >> -	 * there is a deadlock. We workaround this issue with a trylock.
> >> -	 * FIXME: we could miss discard if we can't take reconfig mutex
> >> +	 * superblock is updated to new log tail.
> >>  	 */
> >> -	set_mask_bits(&mddev->sb_flags, 0,
> >> -		BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
> >> -	if (!mddev_trylock(mddev))
> >> -		return;
> >> +
> >>  	md_update_sb(mddev, 1);
> >> -	mddev_unlock(mddev);
> >>  
> >>  	/* discard IO error really doesn't matter, ignore it */
> >>  	if (log->last_checkpoint < end) {
> >> 
> >
> > Hi Neil
> >
> > The test has run for three days and the problem is fixed by this patch.
> > Thanks for the help.
> 
> Thanks for testing.  I'll look over the patch again and see if there is
> any chance that the locking change could introduce other problems.

Ok, if you have new patches I can do the test. 

> 
> >
> > Could you help to look at https://www.spinics.net/lists/raid/msg58918.html.
> > The bug which is fixed by your patch was found when I try to reproduce that
> > bug. I did a simple analysis, but I'm not sure whether I'm right or not.
> 
> It might be the same bug, but if it is there should be other processes
> stuck in a D wait, one of them holding the reconfig_mutex and waiting
> for the array to quiesce.

Hmm but raid5d is stuck in this problem. It's stuck at md_check_recovery. 
And conf->quiesce is 0 in this problem. 

> 
> Were there any other processes in D wait?

Yes, there is another process in D state:

[440793.893432] INFO: task jbd2/md0-8:27647 blocked for more than 120 seconds.
[440793.900387] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[440793.908299] jbd2/md0-8      D ffff8801fe815e00     0 27647      2 0x00000080
[440793.908300]  ffff88021ae9fc90 0000000000000046 ffff88022084edd0 ffff88021ae9ffd8
[440793.915846]  ffff88021ae9ffd8 ffff88021ae9ffd8 ffff88022084edd0 ffff88021ae9fda0
[440793.923390]  ffff88005399e0c0 ffff88022084edd0 ffff88021ae9fd88 ffff8801fe815e00
[440793.930931] Call Trace:
[440793.933467]  [<ffffffff81689d39>] schedule+0x29/0x70
[440793.938522]  [<ffffffffa0a15238>] jbd2_journal_commit_transaction+0x248/0x19a0 [jbd2]
[440793.946434]  [<ffffffff810c780e>] ? account_entity_dequeue+0xae/0xd0
[440793.952876]  [<ffffffff810cb2fc>] ? dequeue_entity+0x11c/0x5d0
[440793.958797]  [<ffffffff81028569>] ? __switch_to+0xd9/0x4c0
[440793.964372]  [<ffffffff810af390>] ? wake_up_atomic_t+0x30/0x30
[440793.970293]  [<ffffffff810963ce>] ? try_to_del_timer_sync+0x5e/0x90
[440793.976645]  [<ffffffffa0a1b849>] kjournald2+0xc9/0x260 [jbd2]
[440793.982565]  [<ffffffff810af390>] ? wake_up_atomic_t+0x30/0x30
[440793.988485]  [<ffffffffa0a1b780>] ? commit_timeout+0x10/0x10 [jbd2]
[440793.994839]  [<ffffffff810ae2bf>] kthread+0xcf/0xe0
[440793.999803]  [<ffffffff810ae1f0>] ? kthread_create_on_node+0x140/0x140
[440794.006419]  [<ffffffff81695598>] ret_from_fork+0x58/0x90
[440794.011906]  [<ffffffff810ae1f0>] ? kthread_create_on_node+0x140/0x140

Regards
Xiao

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: Stuck in md_write_start because MD_SB_CHANGE_PENDING can't be cleared
  2017-09-05  1:36             ` NeilBrown
  2017-09-05  2:15               ` Xiao Ni
@ 2017-09-30  9:44               ` Xiao Ni
  1 sibling, 0 replies; 14+ messages in thread
From: Xiao Ni @ 2017-09-30  9:44 UTC (permalink / raw)
  To: NeilBrown, linux-raid; +Cc: shli



On 09/05/2017 09:36 AM, NeilBrown wrote:
> On Mon, Sep 04 2017, Xiao Ni wrote:
>
>>
>> In function handle_stripe:
>> 4697         if (s.handle_bad_blocks ||
>> 4698             test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
>> 4699                 set_bit(STRIPE_HANDLE, &sh->state);
>> 4700                 goto finish;
>> 4701         }
>>
>> Because MD_SB_CHANGE_PENDING is set, so the stripes can't be handled.
>>
> Right, of course.  I see what is happening now.
>
> - raid5d cannot complete stripes until the metadata is written
> - the metadata cannot be written until raid5d gets the mddev_lock
> - mddev_lock is held by the write to suspend_hi
> - the write to suspend_hi is waiting for raid5_quiesce
> - raid5_quiesce is waiting for some stripes to complete.
>
> We could declare that ->quiesce(, 1) cannot be called while holding the
> lock.
> We could possible allow it but only if md_update_sb() is called first,
> though that might still be racy.
>
> ->quiesce(, 1) is currently called from:
>   mddev_suspend
>   suspend_lo_store
>   suspend_hi_store
>   __md_stop_writes
>   mddev_detach
>   set_bitmap_file
>   update_array_info (when setting/removing internal bitmap)
>   md_do_sync
>
> and most of those are call with the lock held, or take the lock.
>
> Maybe we should *require* that mddev_lock is held when calling
> ->quiesce() and have ->quiesce() do the metadata update.
>
> Something like the following maybe.  Can you test it?
> Thanks,
> NeilBrown
>
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index b01e458d31e9..999ccf08c5db 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -5805,9 +5805,11 @@ void md_stop(struct mddev *mddev)
>   	/* stop the array and free an attached data structures.
>   	 * This is called from dm-raid
>   	 */
> +	mddev_lock_nointr(mddev);
>   	__md_stop(mddev);
>   	if (mddev->bio_set)
>   		bioset_free(mddev->bio_set);
> +	mddev_unlock(mddev);
>   }
>   
>   EXPORT_SYMBOL_GPL(md_stop);
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index 0fc2748aaf95..cde5a82eb404 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -4316,6 +4316,8 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
>   
>   			/* place all the copies on one channel */
>   			init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
> +			WARN_ON(sh2->dev[dd_idx].page != sh2->dev[dd_idx].orig_page);
> +			WARN_ON(sh->dev[i].page != sh->dev[i].orig_page);
>   			tx = async_memcpy(sh2->dev[dd_idx].page,
>   					  sh->dev[i].page, 0, 0, STRIPE_SIZE,
>   					  &submit);
> @@ -8031,7 +8033,10 @@ static void raid5_quiesce(struct mddev *mddev, int state)
>   		wait_event_cmd(conf->wait_for_quiescent,
>   				    atomic_read(&conf->active_stripes) == 0 &&
>   				    atomic_read(&conf->active_aligned_reads) == 0,
> -				    unlock_all_device_hash_locks_irq(conf),
> +				    ({unlock_all_device_hash_locks_irq(conf);
> +					if (mddev->sb_flags)
> +						md_update_sb(mddev, 0);
> +				    }),
>   				    lock_all_device_hash_locks_irq(conf));
>   		conf->quiesce = 1;
>   		unlock_all_device_hash_locks_irq(conf);

Hi Neil

I read this patch again. But I don't know why it can't work.
It calls md_update_sb when it waits for active_stripes. It should
clear MD_SB_CHANGE_CLEAN and MD_SB_CHANGE_PENDING.

Could you explain this?

Best Regards
Xiao

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2017-09-30  9:44 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <221835411.4473056.1504338574607.JavaMail.zimbra@redhat.com>
2017-09-02  8:01 ` Stuck in md_write_start because MD_SB_CHANGE_PENDING can't be cleared Xiao Ni
2017-09-04  2:16   ` NeilBrown
2017-09-04  2:45     ` Xiao Ni
2017-09-04  3:52       ` Xiao Ni
2017-09-04  5:34         ` NeilBrown
2017-09-04  7:36           ` Xiao Ni
2017-09-05  1:36             ` NeilBrown
2017-09-05  2:15               ` Xiao Ni
2017-09-07  1:37                 ` Xiao Ni
2017-09-07  5:37                   ` NeilBrown
2017-09-11  0:14                     ` Xiao Ni
2017-09-11  3:36                       ` NeilBrown
2017-09-11  5:03                         ` Xiao Ni
2017-09-30  9:44               ` Xiao Ni

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.