* [PATCH] md: create new workqueue for object destruction
@ 2017-10-17  5:04 NeilBrown
  2017-10-18  6:21 ` Shaohua Li
  0 siblings, 1 reply; 13+ messages in thread
From: NeilBrown @ 2017-10-17  5:04 UTC (permalink / raw)
  To: Shaohua Li; +Cc: Linux Raid



lockdep currently complains about a potential deadlock:
sysfs access takes reconfig_mutex and then waits for a
work queue to complete.

The cause is inappropriate overloading of work-items
on work-queues.

We currently have two work-queues: md_wq and md_misc_wq.
They service 5 different tasks:

  mddev->flush_work                       md_wq
  mddev->event_work (for dm-raid)         md_misc_wq
  mddev->del_work (mddev_delayed_delete)  md_misc_wq
  mddev->del_work (md_start_sync)         md_misc_wq
  rdev->del_work                          md_misc_wq

We need to call flush_workqueue() for md_start_sync and ->event_work
while holding reconfig_mutex, but mustn't hold it when
flushing mddev_delayed_delete or rdev->del_work.

md_wq is a bit special as it has WQ_MEM_RECLAIM so it is
best to leave that alone.

So create a new workqueue, md_del_wq, and a new work_struct,
mddev->sync_work, so we can keep two classes of work separate.

md_del_wq and ->del_work are used only for destroying rdev
and mddev.
md_misc_wq is used for event_work and sync_work.

Also document the purpose of each flush_workqueue() call.

This removes the lockdep warning.
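
To make the pattern concrete, here is a userspace sketch of the
same bug (illustration only, not md code: "flush" is modelled as a
join, and the indirect dependency through the sysfs node refcount
is collapsed into a single mutex; build with "cc -pthread"):

  #include <pthread.h>
  #include <stdio.h>

  static pthread_mutex_t reconfig = PTHREAD_MUTEX_INITIALIZER;

  /* Stands in for a queued work item whose completion depends,
   * directly or indirectly, on a lock the flushing thread holds. */
  static void *work_item(void *arg)
  {
          (void)arg;
          pthread_mutex_lock(&reconfig);
          pthread_mutex_unlock(&reconfig);
          return NULL;
  }

  int main(void)
  {
          pthread_t worker;

          pthread_mutex_lock(&reconfig);  /* "holding reconfig_mutex" */
          pthread_create(&worker, NULL, work_item, NULL);
          pthread_join(worker, NULL);     /* "flush_workqueue()": waits for
                                           * work that needs the lock we
                                           * hold -- deadlock */
          pthread_mutex_unlock(&reconfig);
          printf("never reached\n");
          return 0;
  }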

Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/md.c | 51 ++++++++++++++++++++++++++++-----------------------
 drivers/md/md.h |  1 +
 2 files changed, 29 insertions(+), 23 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index bf06ff017eda..a9f1352b3849 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -91,6 +91,7 @@ EXPORT_SYMBOL(md_cluster_mod);
 
 static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
 static struct workqueue_struct *md_wq;
+static struct workqueue_struct *md_del_wq;
 static struct workqueue_struct *md_misc_wq;
 
 static int remove_and_add_spares(struct mddev *mddev,
@@ -529,7 +530,7 @@ static void mddev_put(struct mddev *mddev)
 			 * succeed in waiting for the work to be done.
 			 */
 			INIT_WORK(&mddev->del_work, mddev_delayed_delete);
-			queue_work(md_misc_wq, &mddev->del_work);
+			queue_work(md_del_wq, &mddev->del_work);
 		} else
 			kfree(mddev);
 	}
@@ -2264,7 +2265,7 @@ static void unbind_rdev_from_array(struct md_rdev *rdev)
 	synchronize_rcu();
 	INIT_WORK(&rdev->del_work, md_delayed_delete);
 	kobject_get(&rdev->kobj);
-	queue_work(md_misc_wq, &rdev->del_work);
+	queue_work(md_del_wq, &rdev->del_work);
 }
 
 /*
@@ -4298,7 +4299,8 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len)
 	    minor != MINOR(dev))
 		return -EOVERFLOW;
 
-	flush_workqueue(md_misc_wq);
+	/* Ensure old devices are fully deleted (rdev->del_work) */
+	flush_workqueue(md_del_wq);
 
 	err = mddev_lock(mddev);
 	if (err)
@@ -4537,6 +4539,7 @@ action_store(struct mddev *mddev, const char *page, size_t len)
 			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
 		    mddev_lock(mddev) == 0) {
+			/* Ensure sync/recovery has fully started (mddev->sync_work) */
 			flush_workqueue(md_misc_wq);
 			if (mddev->sync_thread) {
 				set_bit(MD_RECOVERY_INTR, &mddev->recovery);
@@ -5280,9 +5283,9 @@ static int md_alloc(dev_t dev, char *name)
 	unit = MINOR(mddev->unit) >> shift;
 
 	/* wait for any previous instance of this device to be
-	 * completely removed (mddev_delayed_delete).
+	 * completely removed (mddev_delayed_delete, mddev->del_work).
 	 */
-	flush_workqueue(md_misc_wq);
+	flush_workqueue(md_del_wq);
 
 	mutex_lock(&disks_mutex);
 	error = -EEXIST;
@@ -5788,6 +5791,7 @@ static void md_clean(struct mddev *mddev)
 static void __md_stop_writes(struct mddev *mddev)
 {
 	set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+	/* Ensure any sync/recovery has fully started (mddev->sync_work) */
 	flush_workqueue(md_misc_wq);
 	if (mddev->sync_thread) {
 		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
@@ -7128,8 +7132,8 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
 	}
 
 	if (cmd == ADD_NEW_DISK)
-		/* need to ensure md_delayed_delete() has completed */
-		flush_workqueue(md_misc_wq);
+		/* need to ensure md_delayed_delete() has completed (rdev->del_work) */
+		flush_workqueue(md_del_wq);
 
 	if (cmd == HOT_REMOVE_DISK)
 		/* need to ensure recovery thread has run */
@@ -7383,8 +7387,8 @@ static int md_open(struct block_device *bdev, fmode_t mode)
 		 * bd_disk.
 		 */
 		mddev_put(mddev);
-		/* Wait until bdev->bd_disk is definitely gone */
-		flush_workqueue(md_misc_wq);
+		/* Wait until bdev->bd_disk is definitely gone (mddev->del_work) */
+		flush_workqueue(md_del_wq);
 		/* Then retry the open from the top */
 		return -ERESTARTSYS;
 	}
@@ -8631,7 +8635,7 @@ static int remove_and_add_spares(struct mddev *mddev,
 
 static void md_start_sync(struct work_struct *ws)
 {
-	struct mddev *mddev = container_of(ws, struct mddev, del_work);
+	struct mddev *mddev = container_of(ws, struct mddev, sync_work);
 
 	mddev->sync_thread = md_register_thread(md_do_sync,
 						mddev,
@@ -8823,8 +8827,8 @@ void md_check_recovery(struct mddev *mddev)
 				 */
 				bitmap_write_all(mddev->bitmap);
 			}
-			INIT_WORK(&mddev->del_work, md_start_sync);
-			queue_work(md_misc_wq, &mddev->del_work);
+			INIT_WORK(&mddev->sync_work, md_start_sync);
+			queue_work(md_misc_wq, &mddev->sync_work);
 			goto unlock;
 		}
 	not_running:
@@ -9018,15 +9022,13 @@ static int __init md_init(void)
 	int ret = -ENOMEM;
 
 	md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0);
-	if (!md_wq)
-		goto err_wq;
-
+	md_del_wq = alloc_workqueue("md_del", 0, 0);
 	md_misc_wq = alloc_workqueue("md_misc", 0, 0);
-	if (!md_misc_wq)
-		goto err_misc_wq;
+	if (!md_wq || !md_del_wq || !md_misc_wq)
+		goto err;
 
 	if ((ret = register_blkdev(MD_MAJOR, "md")) < 0)
-		goto err_md;
+		goto err;
 
 	if ((ret = register_blkdev(0, "mdp")) < 0)
 		goto err_mdp;
@@ -9045,11 +9047,13 @@ static int __init md_init(void)
 
 err_mdp:
 	unregister_blkdev(MD_MAJOR, "md");
-err_md:
-	destroy_workqueue(md_misc_wq);
-err_misc_wq:
-	destroy_workqueue(md_wq);
-err_wq:
+err:
+	if (md_wq)
+		destroy_workqueue(md_wq);
+	if (md_del_wq)
+		destroy_workqueue(md_del_wq);
+	if (md_misc_wq)
+		destroy_workqueue(md_misc_wq);
 	return ret;
 }
 
@@ -9304,6 +9308,7 @@ static __exit void md_exit(void)
 		 */
 	}
 	destroy_workqueue(md_misc_wq);
+	destroy_workqueue(md_del_wq);
 	destroy_workqueue(md_wq);
 }
 
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 03fc641e5da1..8c2158f3bd59 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -397,6 +397,7 @@ struct mddev {
 	struct kernfs_node		*sysfs_action;  /* handle for 'sync_action' */
 
 	struct work_struct del_work;	/* used for delayed sysfs removal */
+	struct work_struct sync_work;	/* used for async starting of md_do_sync */
 
 	/* "lock" protects:
 	 *   flush_bio transition from NULL to !NULL
-- 
2.14.0.rc0.dirty




* Re: [PATCH] md: create new workqueue for object destruction
  2017-10-17  5:04 [PATCH] md: create new workqueue for object destruction NeilBrown
@ 2017-10-18  6:21 ` Shaohua Li
  2017-10-18  7:29   ` NeilBrown
  0 siblings, 1 reply; 13+ messages in thread
From: Shaohua Li @ 2017-10-18  6:21 UTC (permalink / raw)
  To: NeilBrown; +Cc: Linux Raid

On Tue, Oct 17, 2017 at 04:04:52PM +1100, Neil Brown wrote:
> 
> lockdep currently complains about a potential deadlock:
> sysfs access takes reconfig_mutex and then waits for a
> work queue to complete.
> 
> The cause is inappropriate overloading of work-items
> on work-queues.
> 
> We currently have two work-queues: md_wq and md_misc_wq.
> They service 5 different tasks:
> 
>   mddev->flush_work                       md_wq
>   mddev->event_work (for dm-raid)         md_misc_wq
>   mddev->del_work (mddev_delayed_delete)  md_misc_wq
>   mddev->del_work (md_start_sync)         md_misc_wq
>   rdev->del_work                          md_misc_wq
> 
> We need to call flush_workqueue() for md_start_sync and ->event_work
> while holding reconfig_mutex, but mustn't hold it when
> flushing mddev_delayed_delete or rdev->del_work.
> 
> md_wq is a bit special as it has WQ_MEM_RECLAIM so it is
> best to leave that alone.
> 
> So create a new workqueue, md_del_wq, and a new work_struct,
> mddev->sync_work, so we can keep two classes of work separate.
> 
> md_del_wq and ->del_work are used only for destroying rdev
> and mddev.
> md_misc_wq is used for event_work and sync_work.
> 
> Also document the purpose of each flush_workqueue() call.
> 
> This removes the lockdep warning.

I had exactly the same patch queued internally, but the mdadm test suite still
shows a lockdep warning. I haven't had time to check further.

Thanks,
Shaohua 




* Re: [PATCH] md: create new workqueue for object destruction
  2017-10-18  6:21 ` Shaohua Li
@ 2017-10-18  7:29   ` NeilBrown
  2017-10-18 11:21     ` Artur Paszkiewicz
  0 siblings, 1 reply; 13+ messages in thread
From: NeilBrown @ 2017-10-18  7:29 UTC (permalink / raw)
  To: Shaohua Li; +Cc: Linux Raid


On Tue, Oct 17 2017, Shaohua Li wrote:

> On Tue, Oct 17, 2017 at 04:04:52PM +1100, Neil Brown wrote:
>>
>> lockdep currently complains about a potential deadlock:
>> sysfs access takes reconfig_mutex and then waits for a
>> work queue to complete.
>> [...]
>> This removes the lockdep warning.
>
> I had exactly the same patch queued internally,

Cool :-)

>                                                   but the mdadm test suite still
> shows a lockdep warning. I haven't had time to check further.
>

The only other lockdep I've seen later was some ext4 thing, though I
haven't tried the full test suite.  I might have a look tomorrow.
Thanks,
NeilBrown



* Re: [PATCH] md: create new workqueue for object destruction
  2017-10-18  7:29   ` NeilBrown
@ 2017-10-18 11:21     ` Artur Paszkiewicz
  2017-10-18 22:36       ` NeilBrown
  0 siblings, 1 reply; 13+ messages in thread
From: Artur Paszkiewicz @ 2017-10-18 11:21 UTC (permalink / raw)
  To: NeilBrown, Shaohua Li; +Cc: Linux Raid

On 10/18/2017 09:29 AM, NeilBrown wrote:
> On Tue, Oct 17 2017, Shaohua Li wrote:
> 
>> On Tue, Oct 17, 2017 at 04:04:52PM +1100, Neil Brown wrote:
>>>
>>> lockdep currently complains about a potential deadlock:
>>> sysfs access takes reconfig_mutex and then waits for a
>>> work queue to complete.
>>> [...]
>>> This removes the lockdep warning.
>>
>> I had exactly the same patch queued internally,
> 
> Cool :-)
> 
>>                                                   but the mdadm test suite still
>> shows a lockdep warning. I haven't had time to check further.
>>
> 
> The only other lockdep I've seen later was some ext4 thing, though I
> haven't tried the full test suite.  I might have a look tomorrow.

I'm also seeing a lockdep warning with or without this patch,
reproducible with:

export IMSM_NO_PLATFORM=1 # for platforms without IMSM
mdadm -C /dev/md/imsm0 -eimsm -n4 /dev/sd[a-d] -R
mdadm -C /dev/md/vol0 -l5 -n4 /dev/sd[a-d] -R --assume-clean
mdadm -Ss

[   42.489474] md/raid:md126: device sdd operational as raid disk 3
[   42.490731] md/raid:md126: device sdc operational as raid disk 2
[   42.492363] md/raid:md126: device sdb operational as raid disk 1
[   42.494177] md/raid:md126: device sda operational as raid disk 0
[   42.500864] md/raid:md126: raid level 5 active with 4 out of 4 devices, algorithm 0
[   42.503740] md126: detected capacity change from 0 to 3208642560
[   45.177525] md126: detected capacity change from 3208642560 to 0
[   45.179136] md: md126 still in use.
[   45.185846] md: md126 stopped.
[   45.211960] 
[   45.212081] ======================================================
[   45.212081] WARNING: possible circular locking dependency detected
[   45.212081] 4.14.0-rc3+ #391 Not tainted
[   45.212081] ------------------------------------------------------
[   45.212081] kworker/1:0/17 is trying to acquire lock:
[   45.212081]  (kn->count#89){++++}, at: [<ffffffff812514f3>] kernfs_remove+0x23/0x40
[   45.212081] 
[   45.212081] but task is already holding lock:
[   45.212081]  ((&mddev->del_work)){+.+.}, at: [<ffffffff810783d0>] process_one_work+0x1c0/0x630
[   45.212081] 
[   45.212081] which lock already depends on the new lock.
[   45.212081] 
[   45.212081] 
[   45.212081] the existing dependency chain (in reverse order) is:
[   45.212081] 
[   45.212081] -> #2 ((&mddev->del_work)){+.+.}:
[   45.212081]        __lock_acquire+0xc48/0x1140
[   45.212081]        lock_acquire+0x19d/0x1d0
[   45.212081]        process_one_work+0x212/0x630
[   45.212081]        worker_thread+0x211/0x400
[   45.212081]        kthread+0x172/0x180
[   45.212081]        ret_from_fork+0x27/0x40
[   45.212081] 
[   45.212081] -> #1 ("md_del"){+.+.}:
[   45.212081]        __lock_acquire+0xc48/0x1140
[   45.212081]        lock_acquire+0x19d/0x1d0
[   45.212081]        flush_workqueue+0xbb/0x460
[   45.212081]        new_dev_store+0xb2/0x1e0 [md_mod]
[   45.212081]        md_attr_store+0x90/0xc0 [md_mod]
[   45.212081]        sysfs_kf_write+0x42/0x50
[   45.212081]        kernfs_fop_write+0x119/0x180
[   45.212081]        __vfs_write+0x28/0x110
[   45.212081]        vfs_write+0xb4/0x1a0
[   45.212081]        SyS_write+0x49/0xa0
[   45.212081]        entry_SYSCALL_64_fastpath+0x18/0xad
[   45.212081] 
[   45.212081] -> #0 (kn->count#89){++++}:
[   45.212081]        check_prev_add+0x125/0x690
[   45.212081]        __lock_acquire+0xc48/0x1140
[   45.212081]        lock_acquire+0x19d/0x1d0
[   45.212081]        __kernfs_remove+0x15e/0x270
[   45.212081]        kernfs_remove+0x23/0x40
[   45.212081]        sysfs_remove_dir+0x53/0x60
[   45.212081]        kobject_del+0x18/0x50
[   45.212081]        mddev_delayed_delete+0x28/0x40 [md_mod]
[   45.212081]        process_one_work+0x330/0x630
[   45.212081]        worker_thread+0x211/0x400
[   45.212081]        kthread+0x172/0x180
[   45.212081]        ret_from_fork+0x27/0x40
[   45.212081] 
[   45.212081] other info that might help us debug this:
[   45.212081] 
[   45.212081] Chain exists of:
[   45.212081]   kn->count#89 --> "md_del" --> (&mddev->del_work)
[   45.212081] 
[   45.212081]  Possible unsafe locking scenario:
[   45.212081] 
[   45.212081]        CPU0                    CPU1
[   45.212081]        ----                    ----
[   45.212081]   lock((&mddev->del_work));
[   45.212081]                                lock("md_del");
[   45.212081]                                lock((&mddev->del_work));
[   45.212081]   lock(kn->count#89);
[   45.212081] 
[   45.212081]  *** DEADLOCK ***
[   45.212081] 
[   45.212081] 2 locks held by kworker/1:0/17:
[   45.212081]  #0:  ("md_del"){+.+.}, at: [<ffffffff810783d0>] process_one_work+0x1c0/0x630
[   45.212081]  #1:  ((&mddev->del_work)){+.+.}, at: [<ffffffff810783d0>] process_one_work+0x1c0/0x630
[   45.212081] 
[   45.212081] stack backtrace:
[   45.212081] CPU: 1 PID: 17 Comm: kworker/1:0 Not tainted 4.14.0-rc3+ #391
[   45.212081] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.0.0-prebuilt.qemu-project.org 04/01/2014
[   45.212081] Workqueue: md_del mddev_delayed_delete [md_mod]
[   45.212081] Call Trace:
[   45.212081]  dump_stack+0x70/0x9a
[   45.212081]  print_circular_bug+0x2d3/0x2f0
[   45.212081]  ? __print_lock_name+0x80/0x80
[   45.212081]  check_prev_add+0x125/0x690
[   45.212081]  ? ret_from_fork+0x27/0x40
[   45.212081]  __lock_acquire+0xc48/0x1140
[   45.212081]  ? __lock_acquire+0xc48/0x1140
[   45.212081]  ? __print_lock_name+0x80/0x80
[   45.212081]  lock_acquire+0x19d/0x1d0
[   45.212081]  ? kernfs_remove+0x23/0x40
[   45.212081]  __kernfs_remove+0x15e/0x270
[   45.212081]  ? kernfs_remove+0x23/0x40
[   45.212081]  kernfs_remove+0x23/0x40
[   45.212081]  sysfs_remove_dir+0x53/0x60
[   45.212081]  kobject_del+0x18/0x50
[   45.212081]  mddev_delayed_delete+0x28/0x40 [md_mod]
[   45.212081]  process_one_work+0x330/0x630
[   45.212081]  worker_thread+0x211/0x400
[   45.212081]  kthread+0x172/0x180
[   45.212081]  ? process_one_work+0x630/0x630
[   45.212081]  ? kthread_stop+0x250/0x250
[   45.212081]  ret_from_fork+0x27/0x40
[   45.295721] md: md127 stopped.

Thanks,
Artur


* Re: [PATCH] md: create new workqueue for object destruction
  2017-10-18 11:21     ` Artur Paszkiewicz
@ 2017-10-18 22:36       ` NeilBrown
  2017-10-19  8:27         ` Artur Paszkiewicz
  0 siblings, 1 reply; 13+ messages in thread
From: NeilBrown @ 2017-10-18 22:36 UTC (permalink / raw)
  To: Artur Paszkiewicz, Shaohua Li; +Cc: Linux Raid


On Wed, Oct 18 2017, Artur Paszkiewicz wrote:

> On 10/18/2017 09:29 AM, NeilBrown wrote:
>> On Tue, Oct 17 2017, Shaohua Li wrote:
>> 
>>> On Tue, Oct 17, 2017 at 04:04:52PM +1100, Neil Brown wrote:
>>>>
>>>> lockdep currently complains about a potential deadlock:
>>>> sysfs access takes reconfig_mutex and then waits for a
>>>> work queue to complete.
>>>> [...]
>>>> This removes the lockdep warning.
>>>
>>> I had exactly the same patch queued internally,
>> 
>> Cool :-)
>> 
>>>                                                   but the mdadm test suite still
>>> shows a lockdep warning. I haven't had time to check further.
>>>
>> 
>> The only other lockdep I've seen later was some ext4 thing, though I
>> haven't tried the full test suite.  I might have a look tomorrow.
>
> I'm also seeing a lockdep warning with or without this patch,
> reproducible with:
>

Thanks!
Looks like using one workqueue for mddev->del_work and rdev->del_work
causes problems.
Can you try with this addition please?

Thanks,
NeilBrown

diff --git a/drivers/md/md.c b/drivers/md/md.c
index d1dfc9879368..b3192943de7d 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -92,6 +92,7 @@ EXPORT_SYMBOL(md_cluster_mod);
 static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
 static struct workqueue_struct *md_wq;
 static struct workqueue_struct *md_del_wq;
+static struct workqueue_struct *rdev_del_wq;
 static struct workqueue_struct *md_misc_wq;
 
 static int remove_and_add_spares(struct mddev *mddev,
@@ -2265,7 +2266,7 @@ static void unbind_rdev_from_array(struct md_rdev *rdev)
 	synchronize_rcu();
 	INIT_WORK(&rdev->del_work, md_delayed_delete);
 	kobject_get(&rdev->kobj);
-	queue_work(md_del_wq, &rdev->del_work);
+	queue_work(rdev_del_wq, &rdev->del_work);
 }
 
 /*
@@ -4307,7 +4308,7 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len)
 		return -EOVERFLOW;
 
 	/* Ensure old devices are fully deleted (rdev->del_work) */
-	flush_workqueue(md_del_wq);
+	flush_workqueue(rdev_del_wq);
 
 	err = mddev_lock(mddev);
 	if (err)
@@ -7140,7 +7141,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
 
 	if (cmd == ADD_NEW_DISK)
 		/* need to ensure md_delayed_delete() has completed (rdev->del_work) */
-		flush_workqueue(md_del_wq);
+		flush_workqueue(rdev_del_wq);
 
 	if (cmd == HOT_REMOVE_DISK)
 		/* need to ensure recovery thread has run */
@@ -9033,8 +9034,9 @@ static int __init md_init(void)
 
 	md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0);
 	md_del_wq = alloc_workqueue("md_del", 0, 0);
+	rdev_del_wq = alloc_workqueue("md_rdev_del", 0, 0);
 	md_misc_wq = alloc_workqueue("md_misc", 0, 0);
-	if (!md_wq || !md_del_wq || !md_misc_wq)
+	if (!md_wq || !md_del_wq || !rdev_del_wq || !md_misc_wq)
 		goto err;
 
 	if ((ret = register_blkdev(MD_MAJOR, "md")) < 0)
@@ -9062,6 +9064,8 @@ static int __init md_init(void)
 		destroy_workqueue(md_wq);
 	if (md_del_wq)
 		destroy_workqueue(md_del_wq);
+	if (rdev_del_wq)
+		destroy_workqueue(rdev_del_wq);
 	if (md_misc_wq)
 		destroy_workqueue(md_misc_wq);
 	return ret;
@@ -9319,6 +9323,7 @@ static __exit void md_exit(void)
 	}
 	destroy_workqueue(md_misc_wq);
 	destroy_workqueue(md_del_wq);
+	destroy_workqueue(rdev_del_wq);
 	destroy_workqueue(md_wq);
 }
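
For the record, after both patches the work items would map as:

  mddev->flush_work                       md_wq
  mddev->event_work (for dm-raid)         md_misc_wq
  mddev->sync_work (md_start_sync)        md_misc_wq
  mddev->del_work (mddev_delayed_delete)  md_del_wq
  rdev->del_work (md_delayed_delete)      rdev_del_wq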
 



* Re: [PATCH] md: create new workqueue for object destruction
  2017-10-18 22:36       ` NeilBrown
@ 2017-10-19  8:27         ` Artur Paszkiewicz
  2017-10-19 22:28           ` NeilBrown
  0 siblings, 1 reply; 13+ messages in thread
From: Artur Paszkiewicz @ 2017-10-19  8:27 UTC (permalink / raw)
  To: NeilBrown, Shaohua Li; +Cc: Linux Raid

On 10/19/2017 12:36 AM, NeilBrown wrote:
> On Wed, Oct 18 2017, Artur Paszkiewicz wrote:
> 
>> On 10/18/2017 09:29 AM, NeilBrown wrote:
>>> On Tue, Oct 17 2017, Shaohua Li wrote:
>>>
>>>> On Tue, Oct 17, 2017 at 04:04:52PM +1100, Neil Brown wrote:
>>>>>
>>>>> lockdep currently complains about a potential deadlock:
>>>>> sysfs access takes reconfig_mutex and then waits for a
>>>>> work queue to complete.
>>>>> [...]
>>>>> This removes the lockdep warning.
>>>>
>>>> I had exactly the same patch queued internally,
>>>
>>> Cool :-)
>>>
>>>>                                                   but the mdadm test suite still
>>>> shows a lockdep warning. I haven't had time to check further.
>>>>
>>>
>>> The only other lockdep I've seen later was some ext4 thing, though I
>>> haven't tried the full test suite.  I might have a look tomorrow.
>>
>> I'm also seeing a lockdep warning with or without this patch,
>> reproducible with:
>>
> 
> Thanks!
> Looks like using one workqueue for mddev->del_work and rdev->del_work
> causes problems.
> Can you try with this addition please?

It helped for that case but now there is another warning triggered by:

export IMSM_NO_PLATFORM=1 # for platforms without IMSM
mdadm -C /dev/md/imsm0 -eimsm -n4 /dev/sd[a-d] -R
mdadm -C /dev/md/vol0 -l5 -n4 /dev/sd[a-d] -R --assume-clean
mdadm -If sda
mdadm -a /dev/md127 /dev/sda
mdadm -Ss

[  143.610826] ======================================================
[  143.611111] WARNING: possible circular locking dependency detected
[  143.611111] 4.14.0-rc3+ #391 Not tainted
[  143.611111] ------------------------------------------------------
[  143.611111] mdmon/3635 is trying to acquire lock:
[  143.611111]  ("md_del"){+.+.}, at: [<ffffffff810758a4>] flush_workqueue+0x94/0x460
[  143.611111]
[  143.611111] but task is already holding lock:
[  143.611111]  (&bdev->bd_mutex){+.+.}, at: [<ffffffff8120da08>] __blkdev_get+0x58/0x410
[  143.611111]
[  143.611111] which lock already depends on the new lock.
[  143.611111]
[  143.611111]
[  143.611111] the existing dependency chain (in reverse order) is:
[  143.611111]
[  143.611111] -> #4 (&bdev->bd_mutex){+.+.}:
[  143.611111]        __lock_acquire+0xc48/0x1140
[  143.611111]        lock_acquire+0x19d/0x1d0
[  143.611111]        __mutex_lock+0x70/0x8f0
[  143.611111]        mutex_lock_nested+0x1b/0x20
[  143.611111]        __blkdev_get+0x58/0x410
[  143.611111]        blkdev_get+0x2e3/0x370
[  143.611111]        blkdev_get_by_dev+0x36/0x50
[  143.611111]        lock_rdev+0x32/0x70 [md_mod]
[  143.611111]        md_import_device+0x83/0x1a0 [md_mod]
[  143.611111]        new_dev_store+0x15a/0x1e0 [md_mod]
[  143.611111]        md_attr_store+0x90/0xc0 [md_mod]
[  143.611111]        sysfs_kf_write+0x42/0x50
[  143.611111]        kernfs_fop_write+0x119/0x180
[  143.611111]        __vfs_write+0x28/0x110
[  143.611111]        vfs_write+0xb4/0x1a0
[  143.611111]        SyS_write+0x49/0xa0
[  143.611111]        entry_SYSCALL_64_fastpath+0x18/0xad
[  143.611111]
[  143.611111] -> #3 (&mddev->reconfig_mutex){+.+.}:
[  143.611111]        __lock_acquire+0xc48/0x1140
[  143.611111]        lock_acquire+0x19d/0x1d0
[  143.611111]        __mutex_lock+0x70/0x8f0
[  143.611111]        mutex_lock_interruptible_nested+0x1b/0x20
[  143.611111]        layout_store+0x3e/0x110 [md_mod]
[  143.611111]        md_attr_store+0x90/0xc0 [md_mod]
[  143.611111]        sysfs_kf_write+0x42/0x50
[  143.611111]        kernfs_fop_write+0x119/0x180
[  143.611111]        __vfs_write+0x28/0x110
[  143.611111]        vfs_write+0xb4/0x1a0
[  143.611111]        SyS_write+0x49/0xa0
[  143.611111]        entry_SYSCALL_64_fastpath+0x18/0xad
[  143.611111]
[  143.611111] -> #2 (kn->count#99){++++}:
[  143.611111]        __lock_acquire+0xc48/0x1140
[  143.611111]        lock_acquire+0x19d/0x1d0
[  143.611111]        __kernfs_remove+0x15e/0x270
[  143.611111]        kernfs_remove+0x23/0x40
[  143.611111]        sysfs_remove_dir+0x53/0x60
[  143.611111]        kobject_del+0x18/0x50
[  143.611111]        mddev_delayed_delete+0x28/0x40 [md_mod]
[  143.611111]        process_one_work+0x330/0x630
[  143.611111]        worker_thread+0x211/0x400
[  143.611111]        kthread+0x172/0x180
[  143.611111]        ret_from_fork+0x27/0x40
[  143.611111]
[  143.611111] -> #1 ((&mddev->del_work)){+.+.}:
[  143.611111]        __lock_acquire+0xc48/0x1140
[  143.611111]        lock_acquire+0x19d/0x1d0
[  143.611111]        process_one_work+0x212/0x630
[  143.611111]        worker_thread+0x211/0x400
[  143.611111]        kthread+0x172/0x180
[  143.611111]        ret_from_fork+0x27/0x40
[  143.611111]
[  143.611111] -> #0 ("md_del"){+.+.}:
[  143.611111]        check_prev_add+0x125/0x690
[  143.611111]        __lock_acquire+0xc48/0x1140
[  143.611111]        lock_acquire+0x19d/0x1d0
[  143.611111]        flush_workqueue+0xbb/0x460
[  143.611111]        md_open+0x4e/0xc0 [md_mod]
[  143.611111]        __blkdev_get+0xe0/0x410
[  143.611111]        blkdev_get+0x2e3/0x370
[  143.611111]        blkdev_open+0x9f/0xb0
[  143.611111]        do_dentry_open.isra.17+0x1b2/0x2e0
[  143.611111]        vfs_open+0x5f/0x70
[  143.611111]        path_openat+0x7d6/0xb80
[  143.611111]        do_filp_open+0x8e/0xe0
[  143.611111]        do_sys_open+0x183/0x220
[  143.611111]        SyS_open+0x1e/0x20
[  143.611111]        entry_SYSCALL_64_fastpath+0x18/0xad
[  143.611111]
[  143.611111] other info that might help us debug this:
[  143.611111]
[  143.611111] Chain exists of:
[  143.611111]   "md_del" --> &mddev->reconfig_mutex --> &bdev->bd_mutex
[  143.611111]
[  143.611111]  Possible unsafe locking scenario:
[  143.611111]
[  143.611111]        CPU0                    CPU1
[  143.611111]        ----                    ----
[  143.611111]   lock(&bdev->bd_mutex);
[  143.611111]                                lock(&mddev->reconfig_mutex);
[  143.611111]                                lock(&bdev->bd_mutex);
[  143.611111]   lock("md_del");
[  143.611111]
[  143.611111]  *** DEADLOCK ***
[  143.611111]
[  143.611111] 1 lock held by mdmon/3635:
[  143.611111]  #0:  (&bdev->bd_mutex){+.+.}, at: [<ffffffff8120da08>] __blkdev_get+0x58/0x410
[  143.611111]
[  143.611111] stack backtrace:
[  143.611111] CPU: 1 PID: 3635 Comm: mdmon Not tainted 4.14.0-rc3+ #391
[  143.611111] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.0.0-prebuilt.qemu-project.org 04/01/2014
[  143.611111] Call Trace:
[  143.611111]  dump_stack+0x70/0x9a
[  143.611111]  print_circular_bug+0x2d3/0x2f0
[  143.611111]  ? __print_lock_name+0x80/0x80
[  143.611111]  check_prev_add+0x125/0x690
[  143.611111]  ? __module_address+0x2c/0xe0
[  143.611111]  ? md_open+0x1a/0xc0 [md_mod]
[  143.611111]  ? md_open+0x1a/0xc0 [md_mod]
[  143.611111]  __lock_acquire+0xc48/0x1140
[  143.611111]  ? __lock_acquire+0xc48/0x1140
[  143.611111]  ? __print_lock_name+0x80/0x80
[  143.611111]  lock_acquire+0x19d/0x1d0
[  143.611111]  ? flush_workqueue+0x94/0x460
[  143.611111]  flush_workqueue+0xbb/0x460
[  143.611111]  ? flush_workqueue+0x94/0x460
[  143.611111]  md_open+0x4e/0xc0 [md_mod]
[  143.611111]  ? md_open+0x4e/0xc0 [md_mod]
[  143.611111]  __blkdev_get+0xe0/0x410
[  143.611111]  blkdev_get+0x2e3/0x370
[  143.611111]  ? bd_acquire+0xd0/0xd0
[  143.611111]  ? _raw_spin_unlock+0x27/0x40
[  143.611111]  ? bd_acquire+0xd0/0xd0
[  143.611111]  blkdev_open+0x9f/0xb0
[  143.611111]  do_dentry_open.isra.17+0x1b2/0x2e0
[  143.611111]  vfs_open+0x5f/0x70
[  143.611111]  path_openat+0x7d6/0xb80
[  143.611111]  do_filp_open+0x8e/0xe0
[  143.611111]  ? _raw_spin_unlock+0x27/0x40
[  143.611111]  ? __alloc_fd+0x1be/0x1e0
[  143.611111]  do_sys_open+0x183/0x220
[  143.611111]  ? do_sys_open+0x183/0x220
[  143.611111]  SyS_open+0x1e/0x20
[  143.611111]  entry_SYSCALL_64_fastpath+0x18/0xad
[  143.611111] RIP: 0033:0x7f5a8bcb91ad
[  143.611111] RSP: 002b:00007f5a8b900d10 EFLAGS: 00000293 ORIG_RAX: 0000000000000002
[  143.611111] RAX: ffffffffffffffda RBX: 000000000101f08c RCX: 00007f5a8bcb91ad
[  143.611111] RDX: 000000000000097f RSI: 0000000000004080 RDI: 00007f5a8b900d40
[  143.611111] RBP: 0000000000000006 R08: 00007f5a8b900d40 R09: 00007f5a8bca3440
[  143.611111] R10: 0000000000000000 R11: 0000000000000293 R12: 0000000000000006
[  143.611111] R13: 000000000101f08c R14: 0000000000000000 R15: 00007f5a8b901700

Thanks,
Artur


* Re: [PATCH] md: create new workqueue for object destruction
  2017-10-19  8:27         ` Artur Paszkiewicz
@ 2017-10-19 22:28           ` NeilBrown
  2017-10-20 14:00             ` Artur Paszkiewicz
  0 siblings, 1 reply; 13+ messages in thread
From: NeilBrown @ 2017-10-19 22:28 UTC (permalink / raw)
  To: Artur Paszkiewicz, Shaohua Li; +Cc: Linux Raid


On Thu, Oct 19 2017, Artur Paszkiewicz wrote:

> On 10/19/2017 12:36 AM, NeilBrown wrote:
>> On Wed, Oct 18 2017, Artur Paszkiewicz wrote:
>> 
>>> On 10/18/2017 09:29 AM, NeilBrown wrote:
>>>> On Tue, Oct 17 2017, Shaohua Li wrote:
>>>>
>>>>> On Tue, Oct 17, 2017 at 04:04:52PM +1100, Neil Brown wrote:
>>>>>>
>>>>>> lockdep currently complains about a potential deadlock:
>>>>>> sysfs access takes reconfig_mutex and then waits for a
>>>>>> work queue to complete.
>>>>>> [...]
>>>>>> This removes the lockdep warning.
>>>>>
>>>>> I had exactly the same patch queued internally,
>>>>
>>>> Cool :-)
>>>>
>>>>>                                                   but the mdadm test suite still
>>>>> shows a lockdep warning. I haven't had time to check further.
>>>>>
>>>>
>>>> The only other lockdep I've seen later was some ext4 thing, though I
>>>> haven't tried the full test suite.  I might have a look tomorrow.
>>>
>>> I'm also seeing a lockdep warning with or without this patch,
>>> reproducible with:
>>>
>> 
>> Thanks!
>> Looks like using one workqueue for mddev->del_work and rdev->del_work
>> causes problems.
>> Can you try with this addition please?
>
> It helped for that case but now there is another warning triggered by:
>
> export IMSM_NO_PLATFORM=1 # for platforms without IMSM
> mdadm -C /dev/md/imsm0 -eimsm -n4 /dev/sd[a-d] -R
> mdadm -C /dev/md/vol0 -l5 -n4 /dev/sd[a-d] -R --assume-clean
> mdadm -If sda
> mdadm -a /dev/md127 /dev/sda
> mdadm -Ss

I tried that ... and mdmon gets a SIGSEGV.
imsm_set_disk() calls get_imsm_disk() and gets a NULL back.
It then passes the NULL to mark_failure() and that dereferences it.
(even worse things happen if "CREATE names=yes" appears in mdadm.conf.
I should fix that).

I added
  if (!disk)
     return;

and ran it in a loop, and got the lockdep warning.
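
(The placement of that check isn't shown; from the description above
it presumably sits in imsm_set_disk() in super-intel.c, right after
the lookup that comes back NULL -- roughly:

  struct imsm_disk *disk = get_imsm_disk(super, ord_to_idx(ord));

  if (!disk)      /* device already removed from the super-disks list */
          return; /* don't hand a NULL disk to mark_failure() */

-- but treat the exact spot, and the lookup arguments, as a guess.)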

This patch gets rid of it for me.  I need to think about it some more
before I commit to it though.

Thanks,
NeilBrown

diff --git a/drivers/md/md.c b/drivers/md/md.c
index e1e7e8dc6878..874e4101721f 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5373,8 +5373,11 @@ static int md_alloc(dev_t dev, char *name)
 
 static struct kobject *md_probe(dev_t dev, int *part, void *data)
 {
-	if (create_on_open)
+	if (create_on_open) {
+		/* Wait until bdev->bd_disk is definitely gone (mddev->del_work) */
+		flush_workqueue(md_del_wq);
 		md_alloc(dev, NULL);
+	}
 	return NULL;
 }
 
@@ -7383,8 +7386,6 @@ static int md_open(struct block_device *bdev, fmode_t mode)
 		 * bd_disk.
 		 */
 		mddev_put(mddev);
-		/* Wait until bdev->bd_disk is definitely gone (mddev->del_work) */
-		flush_workqueue(md_del_wq);
 		/* Then retry the open from the top */
 		return -ERESTARTSYS;
 	}
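
(Presumably the point of the move: md_probe() is reached via
get_gendisk(), before __blkdev_get() takes bd_mutex, so the flush no
longer runs under bd_mutex and the bd_mutex -> "md_del" link seen in
the chain above goes away.)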



* Re: [PATCH] md: create new workqueue for object destruction
  2017-10-19 22:28           ` NeilBrown
@ 2017-10-20 14:00             ` Artur Paszkiewicz
  2017-10-22 23:31               ` NeilBrown
  0 siblings, 1 reply; 13+ messages in thread
From: Artur Paszkiewicz @ 2017-10-20 14:00 UTC (permalink / raw)
  To: NeilBrown, Shaohua Li; +Cc: Linux Raid

On 10/20/2017 12:28 AM, NeilBrown wrote:
> On Thu, Oct 19 2017, Artur Paszkiewicz wrote:
> 
>> On 10/19/2017 12:36 AM, NeilBrown wrote:
>>> On Wed, Oct 18 2017, Artur Paszkiewicz wrote:
>>>
>>>> On 10/18/2017 09:29 AM, NeilBrown wrote:
>>>>> On Tue, Oct 17 2017, Shaohua Li wrote:
>>>>>
>>>>>> On Tue, Oct 17, 2017 at 04:04:52PM +1100, Neil Brown wrote:
>>>>>>>
>>>>>>> lockdep currently complains about a potential deadlock:
>>>>>>> sysfs access takes reconfig_mutex and then waits for a
>>>>>>> work queue to complete.
>>>>>>> [...]
>>>>>>> This removes the lockdep warning.
>>>>>>
>>>>>> I had exactly the same patch queued internally,
>>>>>
>>>>> Cool :-)
>>>>>
>>>>>>                                                   but the mdadm test suite still
>>>>>> shows a lockdep warning. I haven't had time to check further.
>>>>>>
>>>>>
>>>>> The only other lockdep I've seen later was some ext4 thing, though I
>>>>> haven't tried the full test suite.  I might have a look tomorrow.
>>>>
>>>> I'm also seeing a lockdep warning with or without this patch,
>>>> reproducible with:
>>>>
>>>
>>> Thanks!
>>> Looks like using one workqueue for mddev->del_work and rdev->del_work
>>> causes problems.
>>> Can you try with this addition please?
>>
>> It helped for that case but now there is another warning triggered by:
>>
>> export IMSM_NO_PLATFORM=1 # for platforms without IMSM
>> mdadm -C /dev/md/imsm0 -eimsm -n4 /dev/sd[a-d] -R
>> mdadm -C /dev/md/vol0 -l5 -n4 /dev/sd[a-d] -R --assume-clean
>> mdadm -If sda
>> mdadm -a /dev/md127 /dev/sda
>> mdadm -Ss
> 
> I tried that ... and mdmon gets a SIGSEGV.
> imsm_set_disk() calls get_imsm_disk() and gets a NULL back.
> It then passes the NULL to mark_failure() and that dereferences it.

Interesting... I can't reproduce this. Can you show the output from
mdadm -E for all disks after mdmon crashes? And maybe a debug log from
mdmon?

> (even worse things happen if "CREATE names=yes" appears in mdadm.conf.
> I should fix that).
> 
> I added
>   if (!disk)
>      return;
> 
> and ran it in a loop, and got the lockdep warning.
> 
> This patch gets rid of it for me.  I need to think about it some more
> before I commit to it though.

This fixes the warning for me too. No other issues so far.

Thanks,
Artur


* Re: [PATCH] md: create new workqueue for object destruction
  2017-10-20 14:00             ` Artur Paszkiewicz
@ 2017-10-22 23:31               ` NeilBrown
  2017-10-27 10:44                 ` Artur Paszkiewicz
  0 siblings, 1 reply; 13+ messages in thread
From: NeilBrown @ 2017-10-22 23:31 UTC (permalink / raw)
  To: Artur Paszkiewicz, Shaohua Li; +Cc: Linux Raid


On Fri, Oct 20 2017, Artur Paszkiewicz wrote:

> On 10/20/2017 12:28 AM, NeilBrown wrote:
>> On Thu, Oct 19 2017, Artur Paszkiewicz wrote:
>> 
>>> On 10/19/2017 12:36 AM, NeilBrown wrote:
>>>> On Wed, Oct 18 2017, Artur Paszkiewicz wrote:
>>>>
>>>>> On 10/18/2017 09:29 AM, NeilBrown wrote:
>>>>>> On Tue, Oct 17 2017, Shaohua Li wrote:
>>>>>>
>>>>>>> On Tue, Oct 17, 2017 at 04:04:52PM +1100, Neil Brown wrote:
>>>>>>>>
>>>>>>>> lockdep currently complains about a potential deadlock:
>>>>>>>> sysfs access takes reconfig_mutex and then waits for a
>>>>>>>> work queue to complete.
>>>>>>>> [...]
>>>>>>>> This removes the lockdep warning.
>>>>>>>
>>>>>>> I had exactly the same patch queued internally,
>>>>>>
>>>>>> Cool :-)
>>>>>>
>>>>>>>                                                   but the mdadm test suite still
>>>>>>> shows a lockdep warning. I haven't had time to check further.
>>>>>>>
>>>>>>
>>>>>> The only other lockdep I've seen later was some ext4 thing, though I
>>>>>> haven't tried the full test suite.  I might have a look tomorrow.
>>>>>
>>>>> I'm also seeing a lockdep warning with or without this patch,
>>>>> reproducible with:
>>>>>
>>>>
>>>> Thanks!
>>>> Looks like using one workqueue for mddev->del_work and rdev->del_work
>>>> causes problems.
>>>> Can you try with this addition please?
>>>
>>> It helped for that case but now there is another warning triggered by:
>>>
>>> export IMSM_NO_PLATFORM=1 # for platforms without IMSM
>>> mdadm -C /dev/md/imsm0 -eimsm -n4 /dev/sd[a-d] -R
>>> mdadm -C /dev/md/vol0 -l5 -n4 /dev/sd[a-d] -R --assume-clean
>>> mdadm -If sda
>>> mdadm -a /dev/md127 /dev/sda
>>> mdadm -Ss
>> 
>> I tried that ... and mdmon gets a SIGSEGV.
>> imsm_set_disk() calls get_imsm_disk() and gets a NULL back.
>> It then passes the NULL to mark_failure() and that dereferences it.
>
> Interesting... I can't reproduce this. Can you show the output from
> mdadm -E for all disks after mdmon crashes? And maybe a debug log from
> mdmon?

The crash happens when I run "mdadm -If sda".
gdb tells me:

Thread 2 "mdmon" received signal SIGSEGV, Segmentation fault.
[Switching to Thread 0x7f5526c24700 (LWP 4757)]
0x000000000041601c in is_failed (disk=0x0) at super-intel.c:1324
1324		return (disk->status & FAILED_DISK) == FAILED_DISK;
(gdb) where
#0  0x000000000041601c in is_failed (disk=0x0) at super-intel.c:1324
#1  0x00000000004255a2 in mark_failure (super=0x65fa30, dev=0x660ba0, 
    disk=0x0, idx=0) at super-intel.c:7973
#2  0x00000000004260e8 in imsm_set_disk (a=0x6635d0, n=0, state=17)
    at super-intel.c:8357
#3  0x0000000000405069 in read_and_act (a=0x6635d0, fds=0x7f5526c23e10)
    at monitor.c:551
#4  0x0000000000405c8e in wait_and_act (container=0x65f010, nowait=0)
    at monitor.c:875
#5  0x0000000000405dc7 in do_monitor (container=0x65f010) at monitor.c:906
#6  0x0000000000403037 in run_child (v=0x65f010) at mdmon.c:85
#7  0x00007f5526fcb494 in start_thread (arg=0x7f5526c24700)
    at pthread_create.c:333
#8  0x00007f5526d0daff in clone ()
    at ../sysdeps/unix/sysv/linux/x86_64/clone.S:97

The super-disks list that get_imsm_dl_disk() looks through contains
sdc, sdd, sde, but not sda - so get_imsm_disk() returns NULL.
(The 4 devices I use are sda, sdc, sdd, sde.)
mdadm --examine output for sda and sdc after the crash is below.
mdmon debug output is below that.

Thanks,
NeilBrown


/dev/sda:
          Magic : Intel Raid ISM Cfg Sig.
        Version : 1.2.02
    Orig Family : 0a44d090
         Family : 0a44d090
     Generation : 00000002
     Attributes : All supported
           UUID : 9897925b:e497e1d9:9af0a04a:88429b8b
       Checksum : 56aeb059 correct
    MPB Sectors : 2
          Disks : 4
   RAID Devices : 1

[vol0]:
           UUID : 89a43a61:a39615db:fe4a4210:021acc13
     RAID Level : 5
        Members : 4
          Slots : [UUUU]
    Failed disk : none
      This Slot : ?
    Sector Size : 512
     Array Size : 36864 (18.00 MiB 18.87 MB)
   Per Dev Size : 12288 (6.00 MiB 6.29 MB)
  Sector Offset : 0
    Num Stripes : 48
     Chunk Size : 128 KiB
       Reserved : 0
  Migrate State : idle
      Map State : normal
    Dirty State : clean
     RWH Policy : off

  Disk00 Serial : 
          State : active
             Id : 00000000
    Usable Size : 36028797018957662

  Disk01 Serial : QM00002
          State : active
             Id : 01000100
    Usable Size : 14174 (6.92 MiB 7.26 MB)

  Disk02 Serial : QM00003
          State : active
             Id : 02000000
    Usable Size : 14174 (6.92 MiB 7.26 MB)

  Disk03 Serial : QM00004
          State : active
             Id : 02000100
    Usable Size : 14174 (6.92 MiB 7.26 MB)

/dev/sdc:
          Magic : Intel Raid ISM Cfg Sig.
        Version : 1.2.02
    Orig Family : 0a44d090
         Family : 0a44d090
     Generation : 00000004
     Attributes : All supported
           UUID : 9897925b:e497e1d9:9af0a04a:88429b8b
       Checksum : 56b1b08e correct
    MPB Sectors : 2
          Disks : 4
   RAID Devices : 1

  Disk01 Serial : QM00002
          State : active
             Id : 01000100
    Usable Size : 14174 (6.92 MiB 7.26 MB)

[vol0]:
           UUID : 89a43a61:a39615db:fe4a4210:021acc13
     RAID Level : 5
        Members : 4
          Slots : [_UUU]
    Failed disk : 0
      This Slot : 1
    Sector Size : 512
     Array Size : 36864 (18.00 MiB 18.87 MB)
   Per Dev Size : 12288 (6.00 MiB 6.29 MB)
  Sector Offset : 0
    Num Stripes : 48
     Chunk Size : 128 KiB
       Reserved : 0
  Migrate State : idle
      Map State : degraded
    Dirty State : clean
     RWH Policy : off

  Disk00 Serial : 0
          State : active failed
             Id : ffffffff
    Usable Size : 36028797018957662

  Disk02 Serial : QM00003
          State : active
             Id : 02000000
    Usable Size : 14174 (6.92 MiB 7.26 MB)

  Disk03 Serial : QM00004
          State : active
             Id : 02000100
    Usable Size : 14174 (6.92 MiB 7.26 MB)

mdmon: mdmon: starting mdmon for md127
mdmon: __prep_thunderdome: mpb from 8:0 prefer 8:48
mdmon: __prep_thunderdome: mpb from 8:32 matches 8:48
mdmon: __prep_thunderdome: mpb from 8:64 matches 8:32
monitor: wake ( )
monitor: wake ( )
....
monitor: wake ( )
monitor: wake ( )
monitor: wake ( )
mdmon: manage_new: inst: 0 action: 25 state: 26
mdmon: imsm_open_new: imsm: open_new 0

mdmon: wait_and_act: monitor: caught signal
mdmon: read_and_act: (0): 1508714952.508532 state:write-pending prev:inactive action:idle prev: idle start:18446744073709551615
mdmon: imsm_set_array_state: imsm: mark 'dirty'
mdmon: imsm_set_disk: imsm: set_disk 0:11

Thread 2 "mdmon" received signal SIGSEGV, Segmentation fault.
0x00000000004168f1 in is_failed (disk=0x0) at super-intel.c:1324
1324		return (disk->status & FAILED_DISK) == FAILED_DISK;
(gdb) where
#0  0x00000000004168f1 in is_failed (disk=0x0) at super-intel.c:1324
#1  0x0000000000426bec in mark_failure (super=0x667a30, dev=0x668ba0, 
    disk=0x0, idx=0) at super-intel.c:7973
#2  0x000000000042784b in imsm_set_disk (a=0x66b9b0, n=0, state=17)
    at super-intel.c:8357
#3  0x000000000040520c in read_and_act (a=0x66b9b0, fds=0x7ffff7617e10)
    at monitor.c:551
#4  0x00000000004061aa in wait_and_act (container=0x667010, nowait=0)
    at monitor.c:875
#5  0x00000000004062e3 in do_monitor (container=0x667010) at monitor.c:906
#6  0x0000000000403037 in run_child (v=0x667010) at mdmon.c:85
#7  0x00007ffff79bf494 in start_thread (arg=0x7ffff7618700)
    at pthread_create.c:333
#8  0x00007ffff7701aff in clone ()
    at ../sysdeps/unix/sysv/linux/x86_64/clone.S:97
(gdb) quit
A debugging session is active.

	Inferior 1 [process 5774] will be killed.

Quit anyway? (y or n) ty
Please answer y or n.
A debugging session is active.

	Inferior 1 [process 5774] will be killed.

Quit anyway? (y or n) y
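
The crash itself is a plain NULL dereference in is_failed().  A minimal
guard in super-intel.c would avoid the SIGSEGV; this is a hypothetical
hardening sketch only, since (as the follow-ups below work out) the real
problem is that sda has no SCSI serial number:

	/* sketch: tolerate the NULL disk seen in the backtrace above */
	static int is_failed(struct imsm_disk *disk)
	{
		if (!disk)
			return 0;	/* unknown disk: treat as not failed */
		return (disk->status & FAILED_DISK) == FAILED_DISK;
	}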


* Re: [PATCH] md: create new workqueue for object destruction
  2017-10-22 23:31               ` NeilBrown
@ 2017-10-27 10:44                 ` Artur Paszkiewicz
  2017-10-29 22:18                   ` NeilBrown
  0 siblings, 1 reply; 13+ messages in thread
From: Artur Paszkiewicz @ 2017-10-27 10:44 UTC (permalink / raw)
  To: NeilBrown, Shaohua Li; +Cc: Linux Raid

On 10/23/2017 01:31 AM, NeilBrown wrote:
> On Fri, Oct 20 2017, Artur Paszkiewicz wrote:
> 
>> On 10/20/2017 12:28 AM, NeilBrown wrote:
>>> On Thu, Oct 19 2017, Artur Paszkiewicz wrote:
>>>
>>>> On 10/19/2017 12:36 AM, NeilBrown wrote:
>>>>> On Wed, Oct 18 2017, Artur Paszkiewicz wrote:
>>>>>
>>>>>> On 10/18/2017 09:29 AM, NeilBrown wrote:
>>>>>>> On Tue, Oct 17 2017, Shaohua Li wrote:
>>>>>>>
>>>>>>>> On Tue, Oct 17, 2017 at 04:04:52PM +1100, Neil Brown wrote:
>>>>>>>>> [... original patch description trimmed; quoted in full at the
>>>>>>>>> head of the thread ...]
>>>>>>>>
>>>>>>>> I had exactly the same patch queued internally,
>>>>>>>
>>>>>>> Cool :-)
>>>>>>>
>>>>>>>>                                                   but the mdadm test suite still
>>>>>>>> shows a lockdep warning. I haven't had time to check further.
>>>>>>>>
>>>>>>>
>>>>>>> The only other lockdep warning I've seen lately was some ext4 thing, though I
>>>>>>> haven't tried the full test suite.  I might have a look tomorrow.
>>>>>>
>>>>>> I'm also seeing a lockdep warning with or without this patch,
>>>>>> reproducible with:
>>>>>>
>>>>>
>>>>> Thanks!
>>>>> Looks like using one workqueue for mddev->del_work and rdev->del_work
>>>>> causes problems.
>>>>> Can you try with this addition please?
>>>>
>>>> It helped for that case but now there is another warning triggered by:
>>>>
>>>> export IMSM_NO_PLATFORM=1 # for platforms without IMSM
>>>> mdadm -C /dev/md/imsm0 -eimsm -n4 /dev/sd[a-d] -R
>>>> mdadm -C /dev/md/vol0 -l5 -n4 /dev/sd[a-d] -R --assume-clean
>>>> mdadm -If sda
>>>> mdadm -a /dev/md127 /dev/sda
>>>> mdadm -Ss
>>>
>>> I tried that ... and mdmon gets a SIGSEGV.
>>> imsm_set_disk() calls get_imsm_disk() and gets a NULL back.
>>> It then passes the NULL to mark_failure() and that dereferences it.
>>
>> Interesting... I can't reproduce this. Can you show the output from
>> mdadm -E for all disks after mdmon crashes? And maybe a debug log from
>> mdmon?
> 
> The crash happens when I run "mdadm -If sda".
> gdb tells me:
> 
> Thread 2 "mdmon" received signal SIGSEGV, Segmentation fault.
> [Switching to Thread 0x7f5526c24700 (LWP 4757)]
> 0x000000000041601c in is_failed (disk=0x0) at super-intel.c:1324
> 1324		return (disk->status & FAILED_DISK) == FAILED_DISK;
> (gdb) where
> #0  0x000000000041601c in is_failed (disk=0x0) at super-intel.c:1324
> #1  0x00000000004255a2 in mark_failure (super=0x65fa30, dev=0x660ba0, 
>     disk=0x0, idx=0) at super-intel.c:7973
> #2  0x00000000004260e8 in imsm_set_disk (a=0x6635d0, n=0, state=17)
>     at super-intel.c:8357
> #3  0x0000000000405069 in read_and_act (a=0x6635d0, fds=0x7f5526c23e10)
>     at monitor.c:551
> #4  0x0000000000405c8e in wait_and_act (container=0x65f010, nowait=0)
>     at monitor.c:875
> #5  0x0000000000405dc7 in do_monitor (container=0x65f010) at monitor.c:906
> #6  0x0000000000403037 in run_child (v=0x65f010) at mdmon.c:85
> #7  0x00007f5526fcb494 in start_thread (arg=0x7f5526c24700)
>     at pthread_create.c:333
> #8  0x00007f5526d0daff in clone ()
>     at ../sysdeps/unix/sysv/linux/x86_64/clone.S:97
> 
> The super-disks list that get_imsm_dl_disk() looks through contains
> sdc, sdd, sde, but not sda - so get_imsm_disk() returns NULL.
> (the 4 devices I use are sda sdc sdd sde).
>  mdadm --examine output for sda and sdc after the crash is below.
>  mdmon debug output is below that.

Thank you for the information. The metadata output shows that there is
something wrong with sda. Is there anything different about this device?
The other disks are 10M QEMU SCSI drives; is sda the same? Can you
check its serial, e.g. with sg_inq?
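
For example, something like this (assuming sg3_utils is installed;
option names may vary between versions):

  sg_inq /dev/sda                # standard INQUIRY: vendor/product
  sg_inq --page=0x80 /dev/sda    # VPD page 0x80: unit serial number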

Thanks,
Artur

> 
> Thanks,
> NeilBrown
> 
> [... mdadm --examine output for sda and sdc, mdmon debug log, and gdb
> session trimmed; shown in full in the message above ...]


* Re: [PATCH] md: create new workqueue for object destruction
  2017-10-27 10:44                 ` Artur Paszkiewicz
@ 2017-10-29 22:18                   ` NeilBrown
  2017-10-30 13:02                     ` Artur Paszkiewicz
  0 siblings, 1 reply; 13+ messages in thread
From: NeilBrown @ 2017-10-29 22:18 UTC (permalink / raw)
  To: Artur Paszkiewicz, Shaohua Li; +Cc: Linux Raid

On Fri, Oct 27 2017, Artur Paszkiewicz wrote:

> On 10/23/2017 01:31 AM, NeilBrown wrote:
>> [... earlier discussion, reproduction steps, and crash analysis
>> trimmed; quoted in full in the previous message ...]
>
> Thank you for the information. The metadata output shows that there is
> something wrong with sda. Is there anything different about this device?
> The other disks are 10M QEMU SCSI drives; is sda the same? Can you
> check its serial, e.g. with sg_inq?

sdc, sdd, and sde are specified to qemu with

       -hdb /var/tmp/mdtest10 \
       -hdc /var/tmp/mdtest11 \
       -hdd /var/tmp/mdtest12 \

sda comes from
       -drive file=/var/tmp/mdtest13,if=scsi,index=3,media=disk -s

/var/tmp/mdtest* are simple raw images, 10M each.
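
(Such images can be created with, e.g., "qemu-img create -f raw
/var/tmp/mdtest10 10M"; the exact command here is an illustrative
assumption, any 10M raw file behaves the same.)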

sg_inq reports sd[cde] as
  Vendor: ATA
  Product: QEMU HARDDISK
  Serial: QM0000[234]

sda is
  Vendor: QEMU
  Product: QEMU HARDDISK
  no serial number.


If I change my script to use
       -drive file=/var/tmp/mdtest13,if=scsi,index=3,serial=QM00009,media=disk -s

for sda, mdmon doesn't crash.  It may well be reasonable to refuse to
work with a device that has no serial number.  It is not very friendly
to crash :-(

Thanks,
NeilBrown

>
> Thanks,
> Artur
>
>> 
>> Thanks,
>> NeilBrown
>> 
>> [... mdadm --examine output, mdmon debug log, and gdb session
>> trimmed; shown in full in the first message above ...]


* Re: [PATCH] md: create new workqueue for object destruction
  2017-10-29 22:18                   ` NeilBrown
@ 2017-10-30 13:02                     ` Artur Paszkiewicz
  2017-11-01  3:57                       ` NeilBrown
  0 siblings, 1 reply; 13+ messages in thread
From: Artur Paszkiewicz @ 2017-10-30 13:02 UTC (permalink / raw)
  To: NeilBrown, Shaohua Li; +Cc: Linux Raid

On 10/29/2017 11:18 PM, NeilBrown wrote:
> On Fri, Oct 27 2017, Artur Paszkiewicz wrote:
> 
>> [... earlier discussion trimmed; quoted in full in the previous
>> messages ...]
>> Thank you for the information. The metadata output shows that there is
>> something wrong with sda. Is there anything different about this device?
>> The other disks are 10M QEMU SCSI drives; is sda the same? Can you
>> check its serial, e.g. with sg_inq?
> 
> sdc, sdd, and sde are specified to qemu with
> 
>        -hdb /var/tmp/mdtest10 \
>        -hdc /var/tmp/mdtest11 \
>        -hdd /var/tmp/mdtest12 \
> 
> sda comes from
>        -drive file=/var/tmp/mdtest13,if=scsi,index=3,media=disk -s
> 
> /var/tmp/mdtest* are simple raw images, 10M each.
> 
> sg_inq reports sd[cde] as
>   Vendor: ATA
>   Product: QEMU HARDDISK
>   Serial: QM0000[234]
> 
> sda is
>   Vendor: QEMU
>   Product: QEMU HARDDISK
>   no serial number.
> 
> 
> If I change my script to use
>        -drive file=/var/tmp/mdtest13,if=scsi,index=3,serial=QM00009,media=disk -s
> 
> for sda, mdmon doesn't crash.  It may well be reasonable to refuse to
> work with a device that has no serial number.  It is not very friendly
> to crash :-(

OK, this explains a lot. Can you try the same with this patch? It looks
like there was insufficient error checking when retrieving the SCSI
serial. Mdadm should now abort when creating the container.
IMSM_DEVNAME_AS_SERIAL can be used to create an array with disks that
don't have a serial number.
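
For example (an illustrative invocation, mirroring the reproduction
steps quoted earlier in the thread):

  IMSM_DEVNAME_AS_SERIAL=1 mdadm -C /dev/md/imsm0 -eimsm -n4 /dev/sd[a-d] -R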

Thanks,
Artur

diff --git a/sg_io.c b/sg_io.c
index 42c91e1e..7889a95e 100644
--- a/sg_io.c
+++ b/sg_io.c
@@ -46,6 +46,9 @@ int scsi_get_serial(int fd, void *buf, size_t buf_len)
        if (rv)
                return rv;

+       if ((io_hdr.info & SG_INFO_OK_MASK) != SG_INFO_OK)
+               return -1;
+
        rsp_len = rsp_buf[3];

        if (!rsp_len || buf_len < rsp_len)
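
For context, the added check sits right after the SG_IO ioctl returns.
A self-contained sketch of the mechanism (illustrative only; the
function name and buffer handling here are assumptions and differ from
mdadm's actual scsi_get_serial()):

#include <string.h>
#include <sys/ioctl.h>
#include <scsi/sg.h>

static int inquiry_serial(int fd, unsigned char *rsp_buf, unsigned char rsp_len)
{
	/* INQUIRY with EVPD=1 for VPD page 0x80 (unit serial number) */
	unsigned char cdb[6] = { 0x12, 0x01, 0x80, 0, rsp_len, 0 };
	unsigned char sense[32];
	struct sg_io_hdr io_hdr;

	memset(&io_hdr, 0, sizeof(io_hdr));
	io_hdr.interface_id = 'S';
	io_hdr.cmdp = cdb;
	io_hdr.cmd_len = sizeof(cdb);
	io_hdr.sbp = sense;
	io_hdr.mx_sb_len = sizeof(sense);
	io_hdr.dxfer_direction = SG_DXFER_FROM_DEV;
	io_hdr.dxferp = rsp_buf;
	io_hdr.dxfer_len = rsp_len;
	io_hdr.timeout = 5000;	/* milliseconds */

	if (ioctl(fd, SG_IO, &io_hdr) < 0)
		return -1;
	/* The ioctl succeeding only means the command was delivered;
	 * the device may still have failed it (e.g. no serial page).
	 * This is what the added SG_INFO_OK test catches. */
	if ((io_hdr.info & SG_INFO_OK_MASK) != SG_INFO_OK)
		return -1;
	return 0;
}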


* Re: [PATCH] md: create new workqueue for object destruction
  2017-10-30 13:02                     ` Artur Paszkiewicz
@ 2017-11-01  3:57                       ` NeilBrown
  0 siblings, 0 replies; 13+ messages in thread
From: NeilBrown @ 2017-11-01  3:57 UTC (permalink / raw)
  To: Artur Paszkiewicz, Shaohua Li; +Cc: Linux Raid

On Mon, Oct 30 2017, Artur Paszkiewicz wrote:

> On 10/29/2017 11:18 PM, NeilBrown wrote:
>> [... discussion of the missing sda serial number trimmed; quoted in
>> full in the previous messages ...]
>
> OK, this explains a lot. Can you try the same with this patch? It looks
> like there was insufficient error checking when retrieving the SCSI
> serial. Mdadm should now abort when creating the container.
> IMSM_DEVNAME_AS_SERIAL can be used to create an array with disks that
> don't have a serial number.
>
> Thanks,
> Artur
>
> diff --git a/sg_io.c b/sg_io.c
> index 42c91e1e..7889a95e 100644
> --- a/sg_io.c
> +++ b/sg_io.c
> @@ -46,6 +46,9 @@ int scsi_get_serial(int fd, void *buf, size_t buf_len)
>         if (rv)
>                 return rv;
>
> +       if ((io_hdr.info & SG_INFO_OK_MASK) != SG_INFO_OK)
> +               return -1;
> +
>         rsp_len = rsp_buf[3];
>
>         if (!rsp_len || buf_len < rsp_len)

Thanks. That does seem to make a useful difference.
It doesn't crash now.  I need IMSM_DEVNAME_AS_SERIAL=1 to create the
array, but then it runs smoothly.

Thanks,
NeilBrown


end of thread

Thread overview: 13+ messages
2017-10-17  5:04 [PATCH] md: create new workqueue for object destruction NeilBrown
2017-10-18  6:21 ` Shaohua Li
2017-10-18  7:29   ` NeilBrown
2017-10-18 11:21     ` Artur Paszkiewicz
2017-10-18 22:36       ` NeilBrown
2017-10-19  8:27         ` Artur Paszkiewicz
2017-10-19 22:28           ` NeilBrown
2017-10-20 14:00             ` Artur Paszkiewicz
2017-10-22 23:31               ` NeilBrown
2017-10-27 10:44                 ` Artur Paszkiewicz
2017-10-29 22:18                   ` NeilBrown
2017-10-30 13:02                     ` Artur Paszkiewicz
2017-11-01  3:57                       ` NeilBrown
