All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/2] dm-bdev-rename-suspended_bdev-to-bdev.patch
@ 2009-04-28 16:14 Mikulas Patocka
  2009-04-28 16:14 ` [PATCH 2/2] dm-bdev-keep-bdev-always-referenced.patch Mikulas Patocka
  0 siblings, 1 reply; 12+ messages in thread
From: Mikulas Patocka @ 2009-04-28 16:14 UTC (permalink / raw)
  To: Alasdair G Kergon; +Cc: dm-devel

Hi

This is the first patch that keeps md->bdev. Note that you must apply both 
patches, if you apply only this one, barriers will compile but don't work.

Mikulas

---

Rename suspended_bdev to bdev.

This patch doesn't change any functionality, just renames the variable.
In the next patch, the variable will be used even for non-suspended device.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>

---
 drivers/md/dm.c |   36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

Index: linux-2.6.30-rc2-devel/drivers/md/dm.c
===================================================================
--- linux-2.6.30-rc2-devel.orig/drivers/md/dm.c	2009-04-15 16:23:23.000000000 +0200
+++ linux-2.6.30-rc2-devel/drivers/md/dm.c	2009-04-15 16:24:19.000000000 +0200
@@ -168,7 +168,7 @@ struct mapped_device {
 	 * freeze/thaw support require holding onto a super block
 	 */
 	struct super_block *frozen_sb;
-	struct block_device *suspended_bdev;
+	struct block_device *bdev;
 
 	/* forced geometry settings */
 	struct hd_geometry geometry;
@@ -1217,9 +1217,9 @@ static void free_dev(struct mapped_devic
 {
 	int minor = MINOR(disk_devt(md->disk));
 
-	if (md->suspended_bdev) {
+	if (md->bdev) {
 		unlock_fs(md);
-		bdput(md->suspended_bdev);
+		bdput(md->bdev);
 	}
 	destroy_workqueue(md->wq);
 	mempool_destroy(md->tio_pool);
@@ -1262,9 +1262,9 @@ static void __set_size(struct mapped_dev
 {
 	set_capacity(md->disk, size);
 
-	mutex_lock(&md->suspended_bdev->bd_inode->i_mutex);
-	i_size_write(md->suspended_bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
-	mutex_unlock(&md->suspended_bdev->bd_inode->i_mutex);
+	mutex_lock(&md->bdev->bd_inode->i_mutex);
+	i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
+	mutex_unlock(&md->bdev->bd_inode->i_mutex);
 }
 
 static int __bind(struct mapped_device *md, struct dm_table *t)
@@ -1280,7 +1280,7 @@ static int __bind(struct mapped_device *
 	if (size != get_capacity(md->disk))
 		memset(&md->geometry, 0, sizeof(md->geometry));
 
-	if (md->suspended_bdev)
+	if (md->bdev)
 		__set_size(md, size);
 
 	if (!size) {
@@ -1524,7 +1524,7 @@ int dm_swap_table(struct mapped_device *
 		goto out;
 
 	/* without bdev, the device size cannot be changed */
-	if (!md->suspended_bdev)
+	if (!md->bdev)
 		if (get_capacity(md->disk) != dm_table_get_size(table))
 			goto out;
 
@@ -1546,7 +1546,7 @@ static int lock_fs(struct mapped_device 
 
 	WARN_ON(md->frozen_sb);
 
-	md->frozen_sb = freeze_bdev(md->suspended_bdev);
+	md->frozen_sb = freeze_bdev(md->bdev);
 	if (IS_ERR(md->frozen_sb)) {
 		r = PTR_ERR(md->frozen_sb);
 		md->frozen_sb = NULL;
@@ -1566,7 +1566,7 @@ static void unlock_fs(struct mapped_devi
 	if (!test_bit(DMF_FROZEN, &md->flags))
 		return;
 
-	thaw_bdev(md->suspended_bdev, md->frozen_sb);
+	thaw_bdev(md->bdev, md->frozen_sb);
 	md->frozen_sb = NULL;
 	clear_bit(DMF_FROZEN, &md->flags);
 }
@@ -1606,8 +1606,8 @@ int dm_suspend(struct mapped_device *md,
 
 	/* bdget() can stall if the pending I/Os are not flushed */
 	if (!noflush) {
-		md->suspended_bdev = bdget_disk(md->disk, 0);
-		if (!md->suspended_bdev) {
+		md->bdev = bdget_disk(md->disk, 0);
+		if (!md->bdev) {
 			DMWARN("bdget failed in dm_suspend");
 			r = -ENOMEM;
 			goto out;
@@ -1678,9 +1678,9 @@ int dm_suspend(struct mapped_device *md,
 	set_bit(DMF_SUSPENDED, &md->flags);
 
 out:
-	if (r && md->suspended_bdev) {
-		bdput(md->suspended_bdev);
-		md->suspended_bdev = NULL;
+	if (r && md->bdev) {
+		bdput(md->bdev);
+		md->bdev = NULL;
 	}
 
 	dm_table_put(map);
@@ -1711,9 +1711,9 @@ int dm_resume(struct mapped_device *md)
 
 	unlock_fs(md);
 
-	if (md->suspended_bdev) {
-		bdput(md->suspended_bdev);
-		md->suspended_bdev = NULL;
+	if (md->bdev) {
+		bdput(md->bdev);
+		md->bdev = NULL;
 	}
 
 	clear_bit(DMF_SUSPENDED, &md->flags);

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH 2/2] dm-bdev-keep-bdev-always-referenced.patch
  2009-04-28 16:14 [PATCH 1/2] dm-bdev-rename-suspended_bdev-to-bdev.patch Mikulas Patocka
@ 2009-04-28 16:14 ` Mikulas Patocka
  2009-04-29  6:26   ` Mikulas Patocka
  0 siblings, 1 reply; 12+ messages in thread
From: Mikulas Patocka @ 2009-04-28 16:14 UTC (permalink / raw)
  To: Alasdair G Kergon; +Cc: dm-devel

Always keep the reference to structure block_device.

Reference to block_device is obtained with function bdget_disk (and is released
with bdput.

bdget_disk was called while the device was being suspended, in dm_suspend() ---
however at this point, there could be another devices already suspended.

bdget_disk can wait for IO and allocate memory and this could result in waiting
for already suspended device, resulting in deadlock.

It caused bug https://bugzilla.redhat.com/show_bug.cgi?id=179786

This patch changes the code so that it gets the reference to struct block_device
when struct mapped_device is allocated and initialized in (alloc_dev) and drops
the reference when it is destroyed in free_dev. Thus, there is no call to
bdget_disk, while any device is suspended.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>

---
 drivers/md/dm.c |   37 +++++++++----------------------------
 1 file changed, 9 insertions(+), 28 deletions(-)

Index: linux-2.6.30-rc2-devel/drivers/md/dm.c
===================================================================
--- linux-2.6.30-rc2-devel.orig/drivers/md/dm.c	2009-04-15 16:24:19.000000000 +0200
+++ linux-2.6.30-rc2-devel/drivers/md/dm.c	2009-04-15 16:24:25.000000000 +0200
@@ -1183,6 +1183,10 @@ static struct mapped_device *alloc_dev(i
 	if (!md->wq)
 		goto bad_thread;
 
+	md->bdev = bdget_disk(md->disk, 0);
+	if (!md->bdev)
+		goto bad_bdev;
+
 	/* Populate the mapping, nobody knows we exist yet */
 	spin_lock(&_minor_lock);
 	old_md = idr_replace(&_minor_idr, md, minor);
@@ -1192,6 +1196,8 @@ static struct mapped_device *alloc_dev(i
 
 	return md;
 
+bad_bdev:
+	destroy_workqueue(md->wq);
 bad_thread:
 	put_disk(md->disk);
 bad_disk:
@@ -1217,10 +1223,8 @@ static void free_dev(struct mapped_devic
 {
 	int minor = MINOR(disk_devt(md->disk));
 
-	if (md->bdev) {
-		unlock_fs(md);
-		bdput(md->bdev);
-	}
+	unlock_fs(md);
+	bdput(md->bdev);
 	destroy_workqueue(md->wq);
 	mempool_destroy(md->tio_pool);
 	mempool_destroy(md->io_pool);
@@ -1280,8 +1284,7 @@ static int __bind(struct mapped_device *
 	if (size != get_capacity(md->disk))
 		memset(&md->geometry, 0, sizeof(md->geometry));
 
-	if (md->bdev)
-		__set_size(md, size);
+	__set_size(md, size);
 
 	if (!size) {
 		dm_table_destroy(t);
@@ -1523,11 +1526,6 @@ int dm_swap_table(struct mapped_device *
 	if (!dm_suspended(md))
 		goto out;
 
-	/* without bdev, the device size cannot be changed */
-	if (!md->bdev)
-		if (get_capacity(md->disk) != dm_table_get_size(table))
-			goto out;
-
 	__unbind(md);
 	r = __bind(md, table);
 
@@ -1606,13 +1604,6 @@ int dm_suspend(struct mapped_device *md,
 
 	/* bdget() can stall if the pending I/Os are not flushed */
 	if (!noflush) {
-		md->bdev = bdget_disk(md->disk, 0);
-		if (!md->bdev) {
-			DMWARN("bdget failed in dm_suspend");
-			r = -ENOMEM;
-			goto out;
-		}
-
 		/*
 		 * Flush I/O to the device. noflush supersedes do_lockfs,
 		 * because lock_fs() needs to flush I/Os.
@@ -1678,11 +1669,6 @@ int dm_suspend(struct mapped_device *md,
 	set_bit(DMF_SUSPENDED, &md->flags);
 
 out:
-	if (r && md->bdev) {
-		bdput(md->bdev);
-		md->bdev = NULL;
-	}
-
 	dm_table_put(map);
 
 out_unlock:
@@ -1711,11 +1697,6 @@ int dm_resume(struct mapped_device *md)
 
 	unlock_fs(md);
 
-	if (md->bdev) {
-		bdput(md->bdev);
-		md->bdev = NULL;
-	}
-
 	clear_bit(DMF_SUSPENDED, &md->flags);
 
 	dm_table_unplug_all(map);

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH 2/2] dm-bdev-keep-bdev-always-referenced.patch
  2009-04-28 16:14 ` [PATCH 2/2] dm-bdev-keep-bdev-always-referenced.patch Mikulas Patocka
@ 2009-04-29  6:26   ` Mikulas Patocka
  2009-05-12  8:32     ` Jun'ichi Nomura
  0 siblings, 1 reply; 12+ messages in thread
From: Mikulas Patocka @ 2009-04-29  6:26 UTC (permalink / raw)
  To: Alasdair G Kergon; +Cc: dm-devel

Hi

This is the second version of the patch, I removed those two bogus 
comments and added more explanations to the description.

Mikulas

---

Always keep the reference to structure block_device.

Reference to block_device is obtained with function bdget_disk (and is released
with bdput.

bdget_disk was called while the device was being suspended, in dm_suspend() ---
however at this point, there could be another devices already suspended.

bdget_disk can wait for IO and allocate memory and this could result in waiting
for already suspended device, resulting in deadlock.

It caused bug https://bugzilla.redhat.com/show_bug.cgi?id=179786

This patch changes the code so that it gets the reference to struct block_device
when struct mapped_device is allocated and initialized in (alloc_dev) and drops
the reference when it is destroyed in free_dev. Thus, there is no call to
bdget_disk, while any device is suspended.

This patch removes the comment "/* bdget() can stall if the pending I/Os are
not flushed */". This comment and a relevant condition was likely some
attempt to make a workaround (not a correct fix) for the deadlock referenced
above. After this patch, bdget is called only on device-creation path, without
any suspension, where the code can allocate memory or wait for any I/O.

It also removes the comment "/* don't bdput right now, we don't want the bdev
to go away while it is locked. */". With this patch, bdev is held all the time.

The call unlockfs() is called unconditionally (previously it was called only
if bdev was held), unlockfs() checks at its beginning whether the filesystem
is frozen, if it isn't, it does nothing. So there is no adverse effect from
calling unlockfs() superflously.

This patch allows to change the device size in noflushsuspend, before it was
not allowed because bdev was not held. But there is no adverse effect of this.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>

---
 drivers/md/dm.c |   41 +++++++++--------------------------------
 1 file changed, 9 insertions(+), 32 deletions(-)

Index: linux-2.6.30-rc2-devel/drivers/md/dm.c
===================================================================
--- linux-2.6.30-rc2-devel.orig/drivers/md/dm.c	2009-04-29 08:01:16.000000000 +0200
+++ linux-2.6.30-rc2-devel/drivers/md/dm.c	2009-04-29 08:05:51.000000000 +0200
@@ -1183,6 +1183,10 @@ static struct mapped_device *alloc_dev(i
 	if (!md->wq)
 		goto bad_thread;
 
+	md->bdev = bdget_disk(md->disk, 0);
+	if (!md->bdev)
+		goto bad_bdev;
+
 	/* Populate the mapping, nobody knows we exist yet */
 	spin_lock(&_minor_lock);
 	old_md = idr_replace(&_minor_idr, md, minor);
@@ -1192,6 +1196,8 @@ static struct mapped_device *alloc_dev(i
 
 	return md;
 
+bad_bdev:
+	destroy_workqueue(md->wq);
 bad_thread:
 	put_disk(md->disk);
 bad_disk:
@@ -1217,10 +1223,8 @@ static void free_dev(struct mapped_devic
 {
 	int minor = MINOR(disk_devt(md->disk));
 
-	if (md->bdev) {
-		unlock_fs(md);
-		bdput(md->bdev);
-	}
+	unlock_fs(md);
+	bdput(md->bdev);
 	destroy_workqueue(md->wq);
 	mempool_destroy(md->tio_pool);
 	mempool_destroy(md->io_pool);
@@ -1280,8 +1284,7 @@ static int __bind(struct mapped_device *
 	if (size != get_capacity(md->disk))
 		memset(&md->geometry, 0, sizeof(md->geometry));
 
-	if (md->bdev)
-		__set_size(md, size);
+	__set_size(md, size);
 
 	if (!size) {
 		dm_table_destroy(t);
@@ -1523,11 +1526,6 @@ int dm_swap_table(struct mapped_device *
 	if (!dm_suspended(md))
 		goto out;
 
-	/* without bdev, the device size cannot be changed */
-	if (!md->bdev)
-		if (get_capacity(md->disk) != dm_table_get_size(table))
-			goto out;
-
 	__unbind(md);
 	r = __bind(md, table);
 
@@ -1555,9 +1553,6 @@ static int lock_fs(struct mapped_device 
 
 	set_bit(DMF_FROZEN, &md->flags);
 
-	/* don't bdput right now, we don't want the bdev
-	 * to go away while it is locked.
-	 */
 	return 0;
 }
 
@@ -1604,15 +1599,7 @@ int dm_suspend(struct mapped_device *md,
 	/* This does not get reverted if there's an error later. */
 	dm_table_presuspend_targets(map);
 
-	/* bdget() can stall if the pending I/Os are not flushed */
 	if (!noflush) {
-		md->bdev = bdget_disk(md->disk, 0);
-		if (!md->bdev) {
-			DMWARN("bdget failed in dm_suspend");
-			r = -ENOMEM;
-			goto out;
-		}
-
 		/*
 		 * Flush I/O to the device. noflush supersedes do_lockfs,
 		 * because lock_fs() needs to flush I/Os.
@@ -1678,11 +1665,6 @@ int dm_suspend(struct mapped_device *md,
 	set_bit(DMF_SUSPENDED, &md->flags);
 
 out:
-	if (r && md->bdev) {
-		bdput(md->bdev);
-		md->bdev = NULL;
-	}
-
 	dm_table_put(map);
 
 out_unlock:
@@ -1711,11 +1693,6 @@ int dm_resume(struct mapped_device *md)
 
 	unlock_fs(md);
 
-	if (md->bdev) {
-		bdput(md->bdev);
-		md->bdev = NULL;
-	}
-
 	clear_bit(DMF_SUSPENDED, &md->flags);
 
 	dm_table_unplug_all(map);

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 2/2] dm-bdev-keep-bdev-always-referenced.patch
  2009-04-29  6:26   ` Mikulas Patocka
@ 2009-05-12  8:32     ` Jun'ichi Nomura
  2009-05-18 15:34       ` Mikulas Patocka
                         ` (2 more replies)
  0 siblings, 3 replies; 12+ messages in thread
From: Jun'ichi Nomura @ 2009-05-12  8:32 UTC (permalink / raw)
  To: device-mapper development, Mikulas Patocka; +Cc: Alasdair G Kergon

Mikulas Patocka wrote:
> @@ -1280,8 +1284,7 @@ static int __bind(struct mapped_device *
>  	if (size != get_capacity(md->disk))
>  		memset(&md->geometry, 0, sizeof(md->geometry));
>  
> -	if (md->bdev)
> -		__set_size(md, size);
> +	__set_size(md, size);
>  
>  	if (!size) {
>  		dm_table_destroy(t);
> @@ -1523,11 +1526,6 @@ int dm_swap_table(struct mapped_device *
>  	if (!dm_suspended(md))
>  		goto out;
>  
> -	/* without bdev, the device size cannot be changed */
> -	if (!md->bdev)
> -		if (get_capacity(md->disk) != dm_table_get_size(table))
> -			goto out;
> -
>  	__unbind(md);
>  	r = __bind(md, table);

When the device is suspended with noflush,
can __set_size() wait forever on i_mutex
if somebody is waiting for I/O flushing with i_mutex held (e.g. fsync)?

md->bdev was also used as a marker to tell whether the device was
suspended with noflush.
Sorry, the original comment in the code was perhaps not adequate.

Thanks,
-- 
Jun'ichi Nomura, NEC Corporation

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 2/2] dm-bdev-keep-bdev-always-referenced.patch
  2009-05-12  8:32     ` Jun'ichi Nomura
@ 2009-05-18 15:34       ` Mikulas Patocka
  2009-05-19  1:36         ` Jun'ichi Nomura
  2009-05-18 15:35       ` [PATCH] use i_size_read() instead of i_size Mikulas Patocka
  2009-07-23  8:52       ` [PATCH 2/2] dm-bdev-keep-bdev-always-referenced.patch Jun'ichi Nomura
  2 siblings, 1 reply; 12+ messages in thread
From: Mikulas Patocka @ 2009-05-18 15:34 UTC (permalink / raw)
  To: Jun'ichi Nomura; +Cc: device-mapper development, Alasdair G Kergon

On Tue, 12 May 2009, Jun'ichi Nomura wrote:

> Mikulas Patocka wrote:
> > @@ -1280,8 +1284,7 @@ static int __bind(struct mapped_device *
> >  	if (size != get_capacity(md->disk))
> >  		memset(&md->geometry, 0, sizeof(md->geometry));
> >  
> > -	if (md->bdev)
> > -		__set_size(md, size);
> > +	__set_size(md, size);
> >  
> >  	if (!size) {
> >  		dm_table_destroy(t);
> > @@ -1523,11 +1526,6 @@ int dm_swap_table(struct mapped_device *
> >  	if (!dm_suspended(md))
> >  		goto out;
> >  
> > -	/* without bdev, the device size cannot be changed */
> > -	if (!md->bdev)
> > -		if (get_capacity(md->disk) != dm_table_get_size(table))
> > -			goto out;
> > -
> >  	__unbind(md);
> >  	r = __bind(md, table);
> 
> When the device is suspended with noflush,
> can __set_size() wait forever on i_mutex
> if somebody is waiting for I/O flushing with i_mutex held (e.g. fsync)?
> 
> md->bdev was also used as a marker to tell whether the device was
> suspended with noflush.
> Sorry, the original comment in the code was perhaps not adequate.
> 
> Thanks,
> -- 
> Jun'ichi Nomura, NEC Corporation

Hi

Thanks for reviewing it. Your concern is correct. But maybe that 
mutex_lock in __set_size is not needed at all --- because no one can 
change size simultaneously. What do you think?

BTW. I have found another problem --- a few places in dm don't use 
i_size_read and read i_size directly, which is wrong, see the next patch.

Mikulas

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH] use i_size_read() instead of i_size
  2009-05-12  8:32     ` Jun'ichi Nomura
  2009-05-18 15:34       ` Mikulas Patocka
@ 2009-05-18 15:35       ` Mikulas Patocka
  2009-07-23  8:52       ` [PATCH 2/2] dm-bdev-keep-bdev-always-referenced.patch Jun'ichi Nomura
  2 siblings, 0 replies; 12+ messages in thread
From: Mikulas Patocka @ 2009-05-18 15:35 UTC (permalink / raw)
  To: Jun'ichi Nomura; +Cc: device-mapper development, Alasdair G Kergon

Use i_size_read() instead of reading i_size.

If someone changes the size of the device simultaneously, i_size_read
is guaranteed to return a valid value (either the old one or the new one).

i_size can return some intermediate invalid value (on 32-bit computers
with 64-bit i_size, the reads to both halves of i_size can be interleaved
with updates to i_size, resulting in garbage being returned).

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>

---
 drivers/md/dm-exception-store.h |    2 +-
 drivers/md/dm-log.c             |    2 +-
 drivers/md/dm-table.c           |    2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

Index: linux-2.6.30-rc5-fast/drivers/md/dm-exception-store.h
===================================================================
--- linux-2.6.30-rc5-fast.orig/drivers/md/dm-exception-store.h	2009-05-18 17:29:01.000000000 +0200
+++ linux-2.6.30-rc5-fast/drivers/md/dm-exception-store.h	2009-05-18 17:29:24.000000000 +0200
@@ -156,7 +156,7 @@ static inline void dm_consecutive_chunk_
  */
 static inline sector_t get_dev_size(struct block_device *bdev)
 {
-	return bdev->bd_inode->i_size >> SECTOR_SHIFT;
+	return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
 }
 
 static inline chunk_t sector_to_chunk(struct dm_exception_store *store,
Index: linux-2.6.30-rc5-fast/drivers/md/dm-log.c
===================================================================
--- linux-2.6.30-rc5-fast.orig/drivers/md/dm-log.c	2009-05-18 17:29:32.000000000 +0200
+++ linux-2.6.30-rc5-fast/drivers/md/dm-log.c	2009-05-18 17:29:42.000000000 +0200
@@ -431,7 +431,7 @@ static int create_log_context(struct dm_
 		buf_size = dm_round_up((LOG_OFFSET << SECTOR_SHIFT) +
 				       bitset_size, ti->limits.hardsect_size);
 
-		if (buf_size > dev->bdev->bd_inode->i_size) {
+		if (buf_size > i_size_read(dev->bdev->bd_inode)) {
 			DMWARN("log device %s too small: need %llu bytes",
 				dev->name, (unsigned long long)buf_size);
 			kfree(lc);
Index: linux-2.6.30-rc5-fast/drivers/md/dm-table.c
===================================================================
--- linux-2.6.30-rc5-fast.orig/drivers/md/dm-table.c	2009-05-18 17:29:46.000000000 +0200
+++ linux-2.6.30-rc5-fast/drivers/md/dm-table.c	2009-05-18 17:29:57.000000000 +0200
@@ -387,7 +387,7 @@ static void close_dev(struct dm_dev_inte
 static int device_area_is_valid(struct dm_target *ti, struct block_device *bdev,
 			     sector_t start, sector_t len)
 {
-	sector_t dev_size = bdev->bd_inode->i_size >> SECTOR_SHIFT;
+	sector_t dev_size = i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
 	unsigned short hardsect_size_sectors = ti->limits.hardsect_size >>
 					       SECTOR_SHIFT;
 	char b[BDEVNAME_SIZE];

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 2/2] dm-bdev-keep-bdev-always-referenced.patch
  2009-05-18 15:34       ` Mikulas Patocka
@ 2009-05-19  1:36         ` Jun'ichi Nomura
  0 siblings, 0 replies; 12+ messages in thread
From: Jun'ichi Nomura @ 2009-05-19  1:36 UTC (permalink / raw)
  To: Mikulas Patocka; +Cc: device-mapper development, Alasdair G Kergon

Mikulas Patocka wrote:
> On Tue, 12 May 2009, Jun'ichi Nomura wrote:
> 
>> Mikulas Patocka wrote:
>>> @@ -1280,8 +1284,7 @@ static int __bind(struct mapped_device *
>>>  	if (size != get_capacity(md->disk))
>>>  		memset(&md->geometry, 0, sizeof(md->geometry));
>>>  
>>> -	if (md->bdev)
>>> -		__set_size(md, size);
>>> +	__set_size(md, size);
>>>  
>>>  	if (!size) {
>>>  		dm_table_destroy(t);
>>> @@ -1523,11 +1526,6 @@ int dm_swap_table(struct mapped_device *
>>>  	if (!dm_suspended(md))
>>>  		goto out;
>>>  
>>> -	/* without bdev, the device size cannot be changed */
>>> -	if (!md->bdev)
>>> -		if (get_capacity(md->disk) != dm_table_get_size(table))
>>> -			goto out;
>>> -
>>>  	__unbind(md);
>>>  	r = __bind(md, table);
>> When the device is suspended with noflush,
>> can __set_size() wait forever on i_mutex
>> if somebody is waiting for I/O flushing with i_mutex held (e.g. fsync)?
>>
>> md->bdev was also used as a marker to tell whether the device was
>> suspended with noflush.
>> Sorry, the original comment in the code was perhaps not adequate.
> 
> Hi
> 
> Thanks for reviewing it. Your concern is correct. But maybe that 
> mutex_lock in __set_size is not needed at all --- because no one can 
> change size simultaneously. What do you think?

I think a lock is still needed.

bd_set_size() can be called concurrently and overwrite
the i_size with old capacity:

  __set_size() (in dm.c)       __blkdev_get()
  -----------------------------------------------------------
                               get_capacity
  set_capacity
  i_size_write
                               bd_set_size


I don't think check_disk_size_change() is called for dm device
but if somebody decides to use it on dm device, the following race is
also possible:

  __set_size() (in dm.c)       check_disk_size_change()
  -----------------------------------------------------------
  set_capacity
                               get_capacity
                               i_size_read
  i_size_write                 i_size_write

and the concurrent i_size_write could break i_size_seqcount on 32bit SMP.


Given that functions like check_disk_size_change() and bd_set_size()
are protected by bd_mutex, using bd_mutex in __set_size() might be a fix.
However, I suspect a holder of bd_mutex can still block on
the I/O on the device.


So perhaps the right solution is deferring the i_size_write
until the process can safely wait for bd_mutex
or introducing a new spinlock to protect the i_size_write on bd_inode.


> BTW. I have found another problem --- a few places in dm don't use 
> i_size_read and read i_size directly, which is wrong, see the next patch.

I think those changes are ok.

Thanks,
-- 
Jun'ichi Nomura, NEC Corporation

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 2/2] dm-bdev-keep-bdev-always-referenced.patch
  2009-05-12  8:32     ` Jun'ichi Nomura
  2009-05-18 15:34       ` Mikulas Patocka
  2009-05-18 15:35       ` [PATCH] use i_size_read() instead of i_size Mikulas Patocka
@ 2009-07-23  8:52       ` Jun'ichi Nomura
  2009-07-27 23:44         ` [PATCH] drop mutex in __set_size Mikulas Patocka
  2 siblings, 1 reply; 12+ messages in thread
From: Jun'ichi Nomura @ 2009-07-23  8:52 UTC (permalink / raw)
  To: device-mapper development; +Cc: Mikulas Patocka, Alasdair G Kergon

Hi Alasdair, Mikulas,

I found 2.6.31-rc includes the following commit:
  commit 32a926da5a16c01a8213331e5764472ce2f14a8d
  Author: Mikulas Patocka <mpatocka@redhat.com>
  Date:   Mon Jun 22 10:12:17 2009 +0100

which will introduce a deadlock problem described here:
https://www.redhat.com/archives/dm-devel/2009-May/msg00097.html
(or see below)

Was the problem fixed/worked around somehow?

Jun'ichi Nomura wrote:
> Mikulas Patocka wrote:
>> @@ -1280,8 +1284,7 @@ static int __bind(struct mapped_device *
>>  	if (size != get_capacity(md->disk))
>>  		memset(&md->geometry, 0, sizeof(md->geometry));
>>  
>> -	if (md->bdev)
>> -		__set_size(md, size);
>> +	__set_size(md, size);
>>  
>>  	if (!size) {
>>  		dm_table_destroy(t);
>> @@ -1523,11 +1526,6 @@ int dm_swap_table(struct mapped_device *
>>  	if (!dm_suspended(md))
>>  		goto out;
>>  
>> -	/* without bdev, the device size cannot be changed */
>> -	if (!md->bdev)
>> -		if (get_capacity(md->disk) != dm_table_get_size(table))
>> -			goto out;
>> -
>>  	__unbind(md);
>>  	r = __bind(md, table);
> 
> When the device is suspended with noflush,
> can __set_size() wait forever on i_mutex
> if somebody is waiting for I/O flushing with i_mutex held (e.g. fsync)?
> 
> md->bdev was also used as a marker to tell whether the device was
> suspended with noflush.
> Sorry, the original comment in the code was perhaps not adequate.

Thanks,
-- 
Jun'ichi Nomura, NEC Corporation

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH] drop mutex in __set_size
  2009-07-23  8:52       ` [PATCH 2/2] dm-bdev-keep-bdev-always-referenced.patch Jun'ichi Nomura
@ 2009-07-27 23:44         ` Mikulas Patocka
  2009-07-29  0:14           ` Jun'ichi Nomura
  0 siblings, 1 reply; 12+ messages in thread
From: Mikulas Patocka @ 2009-07-27 23:44 UTC (permalink / raw)
  To: Jun'ichi Nomura; +Cc: device-mapper development, Alasdair G Kergon



On Thu, 23 Jul 2009, Jun'ichi Nomura wrote:

> Hi Alasdair, Mikulas,
> 
> I found 2.6.31-rc includes the following commit:
>   commit 32a926da5a16c01a8213331e5764472ce2f14a8d
>   Author: Mikulas Patocka <mpatocka@redhat.com>
>   Date:   Mon Jun 22 10:12:17 2009 +0100
> 
> which will introduce a deadlock problem described here:
> https://www.redhat.com/archives/dm-devel/2009-May/msg00097.html
> (or see below)
> 
> Was the problem fixed/worked around somehow?

Hi.

I'd submit this. What do others think about it?

Mikulas

---

Drop the mutex.

It doesn't make sense to lock it for a single assignment, this code can't
be executed concurrently. The size should be read with i_size_read which
is automatically protected against concurrent i_size_write.

This locking can lead to a deadlock, if someone holds i_mutex while
the device is being suspended, described by Jun'ichi Nomura at
https://www.redhat.com/archives/dm-devel/2009-May/msg00097.html

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>

---
 drivers/md/dm.c |    2 --
 1 file changed, 2 deletions(-)

Index: linux-2.6.31-rc3-devel/drivers/md/dm.c
===================================================================
--- linux-2.6.31-rc3-devel.orig/drivers/md/dm.c	2009-07-28 01:20:17.000000000 +0200
+++ linux-2.6.31-rc3-devel/drivers/md/dm.c	2009-07-28 01:20:27.000000000 +0200
@@ -1901,9 +1901,7 @@ static void __set_size(struct mapped_dev
 {
 	set_capacity(md->disk, size);
 
-	mutex_lock(&md->bdev->bd_inode->i_mutex);
 	i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
-	mutex_unlock(&md->bdev->bd_inode->i_mutex);
 }
 
 static int __bind(struct mapped_device *md, struct dm_table *t,

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] drop mutex in __set_size
  2009-07-27 23:44         ` [PATCH] drop mutex in __set_size Mikulas Patocka
@ 2009-07-29  0:14           ` Jun'ichi Nomura
  2009-07-30 15:17             ` Mikulas Patocka
  0 siblings, 1 reply; 12+ messages in thread
From: Jun'ichi Nomura @ 2009-07-29  0:14 UTC (permalink / raw)
  To: Mikulas Patocka; +Cc: device-mapper development, Alasdair G Kergon

Hi Mikulas,

Mikulas Patocka wrote:
> Drop the mutex.
> 
> It doesn't make sense to lock it for a single assignment, this code can't
> be executed concurrently. The size should be read with i_size_read which
> is automatically protected against concurrent i_size_write.

But it seems there are codes which depend on i_mutex for
protected access to i_size: e.g. block_write_begin().

> This locking can lead to a deadlock, if someone holds i_mutex while
> the device is being suspended, described by Jun'ichi Nomura at
> https://www.redhat.com/archives/dm-devel/2009-May/msg00097.html

And while my description only mentioned noflush suspend,
with 2.6.31-rc, the deadlock can occur under normal suspend
because the new barrier code may push bios to deferred queue.

> Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
> 
> ---
>  drivers/md/dm.c |    2 --
>  1 file changed, 2 deletions(-)
> 
> Index: linux-2.6.31-rc3-devel/drivers/md/dm.c
> ===================================================================
> --- linux-2.6.31-rc3-devel.orig/drivers/md/dm.c	2009-07-28 01:20:17.000000000 +0200
> +++ linux-2.6.31-rc3-devel/drivers/md/dm.c	2009-07-28 01:20:27.000000000 +0200
> @@ -1901,9 +1901,7 @@ static void __set_size(struct mapped_dev
>  {
>  	set_capacity(md->disk, size);
>  
> -	mutex_lock(&md->bdev->bd_inode->i_mutex);
>  	i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
> -	mutex_unlock(&md->bdev->bd_inode->i_mutex);
>  }
>  
>  static int __bind(struct mapped_device *md, struct dm_table *t,

-- 
Jun'ichi Nomura, NEC Corporation

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] drop mutex in __set_size
  2009-07-29  0:14           ` Jun'ichi Nomura
@ 2009-07-30 15:17             ` Mikulas Patocka
  2009-07-30 23:49               ` Jun'ichi Nomura
  0 siblings, 1 reply; 12+ messages in thread
From: Mikulas Patocka @ 2009-07-30 15:17 UTC (permalink / raw)
  To: Jun'ichi Nomura; +Cc: device-mapper development, Alasdair G Kergon



On Wed, 29 Jul 2009, Jun'ichi Nomura wrote:

> Hi Mikulas,
> 
> Mikulas Patocka wrote:
> > Drop the mutex.
> > 
> > It doesn't make sense to lock it for a single assignment, this code can't
> > be executed concurrently. The size should be read with i_size_read which
> > is automatically protected against concurrent i_size_write.
> 
> But it seems there are codes which depend on i_mutex for
> protected access to i_size: e.g. block_write_begin().

You are right, but I don't know what to do with it. Catch such uses and 
convert them to i_size_read?

> And while my description only mentioned noflush suspend,
> with 2.6.31-rc, the deadlock can occur under normal suspend
> because the new barrier code may push bios to deferred queue.

It can deadlock with any code that takes i_mutex and submits or waits for 
i/os, regardless of barriers.

Mikulas

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] drop mutex in __set_size
  2009-07-30 15:17             ` Mikulas Patocka
@ 2009-07-30 23:49               ` Jun'ichi Nomura
  0 siblings, 0 replies; 12+ messages in thread
From: Jun'ichi Nomura @ 2009-07-30 23:49 UTC (permalink / raw)
  To: Mikulas Patocka; +Cc: device-mapper development, Alasdair G Kergon

Hi Mikulas,

Mikulas Patocka wrote:
> On Wed, 29 Jul 2009, Jun'ichi Nomura wrote:
>> Mikulas Patocka wrote:
>>> Drop the mutex.
>>>
>>> It doesn't make sense to lock it for a single assignment, this code can't
>>> be executed concurrently. The size should be read with i_size_read which
>>> is automatically protected against concurrent i_size_write.
>> But it seems there are codes which depend on i_mutex for
>> protected access to i_size: e.g. block_write_begin().
> 
> You are right, but I don't know what to do with it. Catch such uses and 
> convert them to i_size_read?

Or move the i_size_write outside of dm suspended state?
Not deeply examined, but I don't think it doesn't have
to be in the suspended code path.

>> And while my description only mentioned noflush suspend,
>> with 2.6.31-rc, the deadlock can occur under normal suspend
>> because the new barrier code may push bios to deferred queue.
> 
> It can deadlock with any code that takes i_mutex and submits or waits for 
> i/os, regardless of barriers.

Yes, you are right.

I attached a reproducer script for easy testing.

-- 
Jun'ichi Nomura, NEC Corporation

#!/bin/sh -x

# --------------------------------------------------------------------
#
# Reproducer for i_mutex deadlock
#
# Usage:
#   1. Edit variables in the config section (especially DEV and MAP)
#   2. Run this script and wait
#   3. The script should stall with the following output:
#        + dmsetup resume ${MAP}
#
# What this scripts do:
#   2 proceeses do the test.
#   One creates a dm dev and repeat resizing it.
#   The other writes to the dm dev and fsync, repeatedly.
#   If the dm resume is performed during the fsync() holding i_mutex,
#   the processes deadlock.
#
# --------------------------------------------------------------------

# --------------------------------------------------------------------
# Config section

# Device to use for testing. Contents will be destroyed.
DEV=/dev/sdX
# dm table name for testing. ${MAP} is overlaid on ${DEV}
MAP=map1
# SIZE1 and SIZE2 are the sizes (in sectors) of the dm dev ${MAP}
SIZE1=1000000
SIZE2=2000000
# Options for 'dmsetup suspend'
SUSPEND_OPTS="--nolockfs --noflush"

# Name of generated test program. Files ${TP}.c and ${TP} are generated.
TP=./fsync-io
# Total I/O size (in MB) that ${TP} should perform before fsync().
IOSIZE=400

## You don't need to edit codes below.
## But you might want to change the length of sleeps and/or
## print_table function for other target type.

# --------------------------------------------------------------------
# Create a test dev
#

function print_table() {
  local sz=$1
#  echo "0 ${sz} multipath 1 queue_if_no_path 0 1 1 round-robin 0 1 1 ${DEV} 2"
  echo "0 ${sz} linear ${DEV} 0"
}

print_table ${SIZE1} | dmsetup create ${MAP} || exit 1

# --------------------------------------------------------------------
# Repeat write+fsync on dm dev
#

lineno=$(cat $0 | sed -n '/^==BEGIN C CODE==/=')
cat $0 | sed -n "$((lineno + 1)),\$p" > ${TP}.c
gcc -o ${TP} ${TP}.c || exit 1

# Wait a while for the resizing routine to come up and start testing
(while [ -b /dev/mapper/${MAP} ] && ${TP} /dev/mapper/${MAP} ${IOSIZE}; do :; done) &

# --------------------------------------------------------------------
# Repeat resizing dm dev
#

while sleep 1; do
  for sz in $SIZE1 $SIZE2; do
    print_table ${sz} | dmsetup load ${MAP} \
	&& dmsetup suspend ${SUSPEND_OPTS} ${MAP} \
	&& sleep 1 && dmsetup resume ${MAP}
  done
done

exit

# --------------------------------------------------------------------
#
#
==BEGIN C CODE========================================================
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>

char buf[1024*1024];

int
main(int argc, char** argv)
{
	int fd, i, size, cnt = 0;
	char *fname;

	if (argc < 3) {
		printf("Usage: <this prog> <dev> <size in MB>\n");
		exit(1);
	}

	fname = argv[1];
	size = atoi(argv[2]);

	fd = open(fname, O_RDWR);
	if (fd < 0) {
		perror("open");
		exit(1);
	}

	for (i = 0; i < size; i++)
		cnt += write(fd, buf, sizeof(buf));

	printf("%d Mbytes written\n", cnt >> 20);

	fsync(fd);
	printf("fsync done.\n");

	close(fd);
	return 0;
}

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2009-07-30 23:49 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-04-28 16:14 [PATCH 1/2] dm-bdev-rename-suspended_bdev-to-bdev.patch Mikulas Patocka
2009-04-28 16:14 ` [PATCH 2/2] dm-bdev-keep-bdev-always-referenced.patch Mikulas Patocka
2009-04-29  6:26   ` Mikulas Patocka
2009-05-12  8:32     ` Jun'ichi Nomura
2009-05-18 15:34       ` Mikulas Patocka
2009-05-19  1:36         ` Jun'ichi Nomura
2009-05-18 15:35       ` [PATCH] use i_size_read() instead of i_size Mikulas Patocka
2009-07-23  8:52       ` [PATCH 2/2] dm-bdev-keep-bdev-always-referenced.patch Jun'ichi Nomura
2009-07-27 23:44         ` [PATCH] drop mutex in __set_size Mikulas Patocka
2009-07-29  0:14           ` Jun'ichi Nomura
2009-07-30 15:17             ` Mikulas Patocka
2009-07-30 23:49               ` Jun'ichi Nomura

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.