linux-kernel.vger.kernel.org archive mirror
* [REPOST, PATCH 0/3] dio: serialise unaligned direct IO
@ 2010-11-08  7:40 Dave Chinner
  2010-11-08  7:40 ` [PATCH 1/3] dio: track and " Dave Chinner
                   ` (2 more replies)
  0 siblings, 3 replies; 12+ messages in thread
From: Dave Chinner @ 2010-11-08  7:40 UTC (permalink / raw)
  To: linux-fsdevel; +Cc: linux-kernel

Repost of the last version of the series I wrote to prevent data corruption
caused by concurrent sub-block direct IO to the same block in the filesystem.
The problem still exists and xfstest 240 still triggers it, so we still
need to fix it.




* [PATCH 1/3] dio: track and serialise unaligned direct IO
  2010-11-08  7:40 [REPOST, PATCH 0/3] dio: serialise unaligned direct IO Dave Chinner
@ 2010-11-08  7:40 ` Dave Chinner
  2010-11-08 15:28   ` Jeff Moyer
  2010-11-08  7:40 ` [PATCH 2/3] dio: scale unaligned IO tracking via multiple lists Dave Chinner
  2010-11-08  7:40 ` [PATCH 3/3] dio: add a mempool for the unaligned block structures Dave Chinner
  2 siblings, 1 reply; 12+ messages in thread
From: Dave Chinner @ 2010-11-08  7:40 UTC (permalink / raw)
  To: linux-fsdevel; +Cc: linux-kernel

From: Dave Chinner <dchinner@redhat.com>

If we get two unaligned direct IOs to the same filesystem block
that is marked as a new allocation (i.e. buffer_new), then both IOs
will zero the portion of the block they are not writing data to. As
a result, when the IOs complete there will be a portion of the block
that contains zeros from the last IO to complete rather than the
data that should be there.

This is easily manifested by qemu using aio+dio with an unaligned
guest filesystem - every IO is unaligned and filesystem corruption is
encountered in the guest filesystem. xfstest 240 (from Eric Sandeen)
is also a simple reproducer.
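
For illustration, a minimal user-space sketch of that access pattern might
look like the following (assuming a 4096 byte filesystem block size and
512 byte direct IO alignment; this is not xfstest 240, just a hand-rolled
approximation with made-up names, built with -pthread):

#define _GNU_SOURCE
#include <fcntl.h>
#include <pthread.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

static int fd;

/* each thread does one 512 byte O_DIRECT write into the same 4k block */
static void *unaligned_write(void *arg)
{
	long idx = (long)arg;
	void *buf;

	if (posix_memalign(&buf, 512, 512))
		return NULL;
	memset(buf, 'A' + idx, 512);
	pwrite(fd, buf, 512, idx * 512);
	free(buf);
	return NULL;
}

int main(void)
{
	pthread_t t[2];
	long i;

	/* both writes land in the same newly allocated filesystem block */
	fd = open("dio-race-test", O_CREAT | O_TRUNC | O_RDWR | O_DIRECT, 0644);
	if (fd < 0)
		return 1;
	for (i = 0; i < 2; i++)
		pthread_create(&t[i], NULL, unaligned_write, (void *)i);
	for (i = 0; i < 2; i++)
		pthread_join(t[i], NULL);
	/* without serialisation, one 512 byte region can read back as zeros
	   because the other IO zeroed the rest of the new block */
	close(fd);
	return 0;
}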

To avoid this problem, track unaligned IO that triggers sub-block
zeroing and check new incoming unaligned IO that requires sub-block
zeroing against that list. If we get an overlap where the start and
end of unaligned IOs hit the same filesystem block, then we need to
block the incoming IOs until the IO that is zeroing the block
completes. The blocked IO can then continue without needing to do
any zeroing and hence won't overwrite valid data with zeros.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/direct-io.c |  152 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 files changed, 146 insertions(+), 6 deletions(-)

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 85882f6..1a69efd 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -71,6 +71,9 @@ struct dio {
 	unsigned start_zero_done;	/* flag: sub-blocksize zeroing has
 					   been performed at the start of a
 					   write */
+#define LAST_SECTOR ((sector_t)-1LL)
+	sector_t zero_block_front;	/* fs block we are zeroing at front */
+	sector_t zero_block_rear;	/* fs block we are zeroing at rear */
 	int pages_in_io;		/* approximate total IO pages */
 	size_t	size;			/* total request size (doesn't change)*/
 	sector_t block_in_file;		/* Current offset into the underlying
@@ -135,6 +138,101 @@ struct dio {
 	struct page *pages[DIO_PAGES];	/* page buffer */
 };
 
+
+/*
+ * record fs blocks we are doing zeroing on in a zero block list.
+ * unaligned IO is not very performant and so is relatively uncommon,
+ * so a global list should be sufficient to track them.
+ */
+struct dio_zero_block {
+	struct list_head dio_list;	/* list of io in progress */
+	sector_t	zero_block;	/* block being zeroed */
+	struct dio	*dio;		/* owner dio */
+	wait_queue_head_t wq;		/* New IO block here */
+	atomic_t	ref;		/* reference count */
+};
+
+static DEFINE_SPINLOCK(dio_zero_block_lock);
+static LIST_HEAD(dio_zero_block_list);
+
+/*
+ * Add a filesystem block to the list of blocks we are tracking.
+ */
+static void
+dio_start_zero_block(struct dio *dio, sector_t zero_block)
+{
+	struct dio_zero_block *zb;
+
+	zb = kmalloc(sizeof(*zb), GFP_NOIO);
+	if (!zb)
+		return;
+	INIT_LIST_HEAD(&zb->dio_list);
+	init_waitqueue_head(&zb->wq);
+	zb->zero_block = zero_block;
+	zb->dio = dio;
+	atomic_set(&zb->ref, 1);
+
+	spin_lock(&dio_zero_block_lock);
+	list_add(&zb->dio_list, &dio_zero_block_list);
+	spin_unlock(&dio_zero_block_lock);
+}
+
+static void
+dio_drop_zero_block(struct dio_zero_block *zb)
+{
+	if (atomic_dec_and_test(&zb->ref))
+		kfree(zb);
+}
+
+/*
+ * Check whether a filesystem block is currently being zeroed, and if it is
+ * wait for it to complete before returning. If we waited for a block being
+ * zeroed, return 1 to indicate that the block is already initialised,
+ * otherwise return 0 to indicate that it needs zeroing.
+ */
+static int
+dio_wait_zero_block(struct dio *dio, sector_t zero_block)
+{
+	struct dio_zero_block *zb;
+
+	spin_lock(&dio_zero_block_lock);
+	list_for_each_entry(zb, &dio_zero_block_list, dio_list) {
+		if (zb->dio->inode != dio->inode)
+			continue;
+		if (zb->zero_block != zero_block)
+			continue;
+		atomic_inc(&zb->ref);
+		spin_unlock(&dio_zero_block_lock);
+		wait_event(zb->wq, (list_empty(&zb->dio_list)));
+		dio_drop_zero_block(zb);
+		return 1;
+	}
+	spin_unlock(&dio_zero_block_lock);
+	return 0;
+}
+
+/*
+ * Complete a block zeroing and wake up anyone waiting for it.
+ */
+static void dio_end_zero_block(struct dio *dio, sector_t zero_block)
+{
+	struct dio_zero_block *zb;
+
+	spin_lock(&dio_zero_block_lock);
+	list_for_each_entry(zb, &dio_zero_block_list, dio_list) {
+		if (zb->dio->inode != dio->inode)
+			continue;
+		if (zb->zero_block != zero_block)
+			continue;
+		list_del_init(&zb->dio_list);
+		spin_unlock(&dio_zero_block_lock);
+		wake_up(&zb->wq);
+		dio_drop_zero_block(zb);
+		return;
+	}
+	spin_unlock(&dio_zero_block_lock);
+}
+
 /*
  * How many pages are in the queue?
  */
@@ -253,6 +351,11 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is
 		aio_complete(dio->iocb, ret, 0);
 	}
 
+	if (dio->zero_block_front != LAST_SECTOR)
+		dio_end_zero_block(dio, dio->zero_block_front);
+	if (dio->zero_block_rear != LAST_SECTOR)
+		dio_end_zero_block(dio, dio->zero_block_rear);
+
 	if (dio->flags & DIO_LOCKING)
 		/* lockdep: non-owner release */
 		up_read_non_owner(&dio->inode->i_alloc_sem);
@@ -777,6 +880,12 @@ static void clean_blockdev_aliases(struct dio *dio)
  * block with zeros. This happens only if user-buffer, fileoffset or
  * io length is not filesystem block-size multiple.
  *
+ * We need to track the blocks we are zeroing. If we have concurrent IOs that hit
+ * the same start or end block, we do not want all the IOs to zero the portion
+ * they are not writing data to as that will overwrite data from the other IOs.
+ * Hence we need to block until the first unaligned IO completes before we can
+ * continue (without executing any zeroing).
+ *
  * `end' is zero if we're doing the start of the IO, 1 at the end of the
  * IO.
  */
@@ -784,8 +893,8 @@ static void dio_zero_block(struct dio *dio, int end)
 {
 	unsigned dio_blocks_per_fs_block;
 	unsigned this_chunk_blocks;	/* In dio_blocks */
-	unsigned this_chunk_bytes;
 	struct page *page;
+	sector_t fsblock;
 
 	dio->start_zero_done = 1;
 	if (!dio->blkfactor || !buffer_new(&dio->map_bh))
@@ -797,17 +906,41 @@ static void dio_zero_block(struct dio *dio, int end)
 	if (!this_chunk_blocks)
 		return;
 
+	if (end)
+		this_chunk_blocks = dio_blocks_per_fs_block - this_chunk_blocks;
+
 	/*
 	 * We need to zero out part of an fs block.  It is either at the
-	 * beginning or the end of the fs block.
+	 * beginning or the end of the fs block, but first we need to check if
+	 * there is already a zeroing being run on this block.
+	 *
+	 * If we are doing a sub-block IO (i.e. zeroing both front and rear of
+	 * the same block) we don't need to wait or set a guard for the rear of
+	 * the block as we already have one set.
 	 */
-	if (end) 
-		this_chunk_blocks = dio_blocks_per_fs_block - this_chunk_blocks;
+	fsblock = dio->block_in_file >> dio->blkfactor;
+	if (!end || dio->zero_block_front != fsblock) {
 
-	this_chunk_bytes = this_chunk_blocks << dio->blkbits;
+		/* wait for any zeroing already in progress */
+		if (dio_wait_zero_block(dio, fsblock)) {
+			/* skip the range we would have zeroed. */
+			dio->next_block_for_io += this_chunk_blocks;
+			return;
+		}
+
+		/*
+		 * we are going to zero stuff now, so set a guard to catch
+		 * others that might want to zero the same block.
+		 */
+		dio_start_zero_block(dio, fsblock);
+		if (end)
+			dio->zero_block_rear = fsblock;
+		else
+			dio->zero_block_front = fsblock;
+	}
 
 	page = ZERO_PAGE(0);
-	if (submit_page_section(dio, page, 0, this_chunk_bytes, 
+	if (submit_page_section(dio, page, 0, this_chunk_blocks << dio->blkbits,
 				dio->next_block_for_io))
 		return;
 
@@ -1210,6 +1343,13 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	 */
 	memset(dio, 0, offsetof(struct dio, pages));
 
+	/*
+	 * zero_blocks need to initialised to largest value to avoid
+	 * matching the zero block accidentally.
+	 */
+	dio->zero_block_front = LAST_SECTOR;
+	dio->zero_block_rear = LAST_SECTOR;
+
 	dio->flags = flags;
 	if (dio->flags & DIO_LOCKING) {
 		/* watch out for a 0 len io from a tricksy fs */
-- 
1.7.2.3

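The serialisation pattern the patch introduces can also be illustrated with
a small user-space analogue (a pthread-based sketch with made-up names; it
is not the kernel code, and unlike the patch it folds the lookup and the
insert into a single critical section):

/* Track filesystem blocks that are being zeroed; later arrivals for the
 * same block wait until the first caller has finished with it. */
#include <pthread.h>
#include <stdbool.h>
#include <stdlib.h>

struct zero_guard {
	struct zero_guard *next;
	unsigned long long block;	/* fs block being zeroed */
	bool done;
	pthread_cond_t done_cv;
};

static pthread_mutex_t guard_lock = PTHREAD_MUTEX_INITIALIZER;
static struct zero_guard *guard_list;

/* Returns true if another caller zeroed the block (we waited for it),
 * false if the caller is now responsible for zeroing it. */
static bool zero_block_wait_or_start(unsigned long long block)
{
	struct zero_guard *g;

	pthread_mutex_lock(&guard_lock);
	for (g = guard_list; g; g = g->next) {
		if (g->block == block) {
			while (!g->done)
				pthread_cond_wait(&g->done_cv, &guard_lock);
			pthread_mutex_unlock(&guard_lock);
			return true;	/* block already initialised */
		}
	}
	/* not found: add a guard under the same lock hold */
	g = calloc(1, sizeof(*g));
	if (!g) {
		/* no guard; caller still zeroes, matching the patch's
		 * behaviour when the allocation fails */
		pthread_mutex_unlock(&guard_lock);
		return false;
	}
	g->block = block;
	pthread_cond_init(&g->done_cv, NULL);
	g->next = guard_list;
	guard_list = g;
	pthread_mutex_unlock(&guard_lock);
	return false;			/* caller must zero the block */
}

/* Called by the owner once its IO (and hence the zeroing) has completed. */
static void zero_block_end(unsigned long long block)
{
	struct zero_guard *g, **pp;

	pthread_mutex_lock(&guard_lock);
	for (pp = &guard_list; (g = *pp) != NULL; pp = &g->next) {
		if (g->block == block) {
			*pp = g->next;
			g->done = true;
			pthread_cond_broadcast(&g->done_cv);
			break;
		}
	}
	pthread_mutex_unlock(&guard_lock);
	/* the guard is leaked here for brevity; the patch refcounts it and
	 * frees it once every waiter has dropped its reference */
}

The kernel version splits this across dio_wait_zero_block(),
dio_start_zero_block() and dio_end_zero_block(), keys the lookup on the
inode as well as the block number, and wakes waiters through a wait queue
rather than a condition variable.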


* [PATCH 2/3] dio: scale unaligned IO tracking via multiple lists
  2010-11-08  7:40 [REPOST, PATCH 0/3] dio: serialise unaligned direct IO Dave Chinner
  2010-11-08  7:40 ` [PATCH 1/3] dio: track and " Dave Chinner
@ 2010-11-08  7:40 ` Dave Chinner
  2010-11-08 15:36   ` Jeff Moyer
  2010-11-08  7:40 ` [PATCH 3/3] dio: add a mempool for the unaligned block structures Dave Chinner
  2 siblings, 1 reply; 12+ messages in thread
From: Dave Chinner @ 2010-11-08  7:40 UTC (permalink / raw)
  To: linux-fsdevel; +Cc: linux-kernel

From: Dave Chinner <dchinner@redhat.com>

To avoid concerns that a single list and lock tracking the unaligned
IOs will not scale appropriately, create multiple lists and locks
and choose them by hashing the unaligned block being zeroed.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/direct-io.c |   49 ++++++++++++++++++++++++++++++++++++-------------
 1 files changed, 36 insertions(+), 13 deletions(-)

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 1a69efd..353ac52 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -152,8 +152,28 @@ struct dio_zero_block {
 	atomic_t	ref;		/* reference count */
 };
 
-static DEFINE_SPINLOCK(dio_zero_block_lock);
-static LIST_HEAD(dio_zero_block_list);
+#define DIO_ZERO_BLOCK_NR	37LL
+struct dio_zero_block_head {
+	struct list_head	list;
+	spinlock_t		lock;
+};
+
+static struct dio_zero_block_head dio_zero_blocks[DIO_ZERO_BLOCK_NR];
+#define to_dio_zero_head(zb)	(&dio_zero_blocks[zb % DIO_ZERO_BLOCK_NR])
+
+
+static int __init
+dio_init_zero_block(void)
+{
+	int i;
+
+	for (i = 0; i < DIO_ZERO_BLOCK_NR; i++) {
+		spin_lock_init(&dio_zero_blocks[i].lock);
+		INIT_LIST_HEAD(&dio_zero_blocks[i].list);
+	}
+	return 0;
+}
+subsys_initcall(dio_init_zero_block);
 
 /*
  * Add a filesystem block to the list of blocks we are tracking.
@@ -161,6 +181,7 @@ static LIST_HEAD(dio_zero_block_list);
 static void
 dio_start_zero_block(struct dio *dio, sector_t zero_block)
 {
+	struct dio_zero_block_head *zbh = to_dio_zero_head(zero_block);
 	struct dio_zero_block *zb;
 
 	zb = kmalloc(sizeof(*zb), GFP_NOIO);
@@ -172,9 +193,9 @@ dio_start_zero_block(struct dio *dio, sector_t zero_block)
 	zb->dio = dio;
 	atomic_set(&zb->ref, 1);
 
-	spin_lock(&dio_zero_block_lock);
-	list_add(&zb->dio_list, &dio_zero_block_list);
-	spin_unlock(&dio_zero_block_lock);
+	spin_lock(&zbh->lock);
+	list_add(&zb->dio_list, &zbh->list);
+	spin_unlock(&zbh->lock);
 }
 
 static void
@@ -193,21 +214,22 @@ dio_drop_zero_block(struct dio_zero_block *zb)
 static int
 dio_wait_zero_block(struct dio *dio, sector_t zero_block)
 {
+	struct dio_zero_block_head *zbh = to_dio_zero_head(zero_block);
 	struct dio_zero_block *zb;
 
-	spin_lock(&dio_zero_block_lock);
-	list_for_each_entry(zb, &dio_zero_block_list, dio_list) {
+	spin_lock(&zbh->lock);
+	list_for_each_entry(zb, &zbh->list, dio_list) {
 		if (zb->dio->inode != dio->inode)
 			continue;
 		if (zb->zero_block != zero_block)
 			continue;
 		atomic_inc(&zb->ref);
-		spin_unlock(&dio_zero_block_lock);
+		spin_unlock(&zbh->lock);
 		wait_event(zb->wq, (list_empty(&zb->dio_list)));
 		dio_drop_zero_block(zb);
 		return 1;
 	}
-	spin_unlock(&dio_zero_block_lock);
+	spin_unlock(&zbh->lock);
 	return 0;
 }
 
@@ -216,21 +238,22 @@ dio_wait_zero_block(struct dio *dio, sector_t zero_block)
  */
 static void dio_end_zero_block(struct dio *dio, sector_t zero_block)
 {
+	struct dio_zero_block_head *zbh = to_dio_zero_head(zero_block);
 	struct dio_zero_block *zb;
 
-	spin_lock(&dio_zero_block_lock);
-	list_for_each_entry(zb, &dio_zero_block_list, dio_list) {
+	spin_lock(&zbh->lock);
+	list_for_each_entry(zb, &zbh->list, dio_list) {
 		if (zb->dio->inode != dio->inode)
 			continue;
 		if (zb->zero_block != zero_block)
 			continue;
 		list_del_init(&zb->dio_list);
-		spin_unlock(&dio_zero_block_lock);
+		spin_unlock(&zbh->lock);
 		wake_up(&zb->wq);
 		dio_drop_zero_block(zb);
 		return;
 	}
-	spin_unlock(&dio_zero_block_lock);
+	spin_unlock(&zbh->lock);
 }
 
 /*
-- 
1.7.2.3



* [PATCH 3/3] dio: add a mempool for the unaligned block structures
  2010-11-08  7:40 [REPOST, PATCH 0/3] dio: serialise unaligned direct IO Dave Chinner
  2010-11-08  7:40 ` [PATCH 1/3] dio: track and " Dave Chinner
  2010-11-08  7:40 ` [PATCH 2/3] dio: scale unaligned IO tracking via multiple lists Dave Chinner
@ 2010-11-08  7:40 ` Dave Chinner
  2010-11-08 15:40   ` Jeff Moyer
  2 siblings, 1 reply; 12+ messages in thread
From: Dave Chinner @ 2010-11-08  7:40 UTC (permalink / raw)
  To: linux-fsdevel; +Cc: linux-kernel

From: Dave Chinner <dchinner@redhat.com>

We need the zero block tracking structure allocation to succeed.
Silently failing and potentially allowing data corruption is not an
option. Add a mempool to ensure this allocation will always succeed.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/direct-io.c |   14 +++++++++++---
 1 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 353ac52..f8d7f6d 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -161,12 +161,22 @@ struct dio_zero_block_head {
 static struct dio_zero_block_head dio_zero_blocks[DIO_ZERO_BLOCK_NR];
 #define to_dio_zero_head(zb)	(&dio_zero_blocks[zb % DIO_ZERO_BLOCK_NR])
 
+/* a small mempool just to guarantee progress. */
+#define DIO_ZERO_BLOCK_ENTRIES	2
+static mempool_t *dio_zero_block_pool;
+
 
 static int __init
 dio_init_zero_block(void)
 {
 	int i;
 
+	dio_zero_block_pool = mempool_create_kmalloc_pool(
+					DIO_ZERO_BLOCK_ENTRIES,
+					sizeof(struct dio_zero_block));
+	if (!dio_zero_block_pool)
+		panic("dio: can't create zero block pool");
+
 	for (i = 0; i < DIO_ZERO_BLOCK_NR; i++) {
 		spin_lock_init(&dio_zero_blocks[i].lock);
 		INIT_LIST_HEAD(&dio_zero_blocks[i].list);
@@ -184,9 +194,7 @@ dio_start_zero_block(struct dio *dio, sector_t zero_block)
 	struct dio_zero_block_head *zbh = to_dio_zero_head(zero_block);
 	struct dio_zero_block *zb;
 
-	zb = kmalloc(sizeof(*zb), GFP_NOIO);
-	if (!zb)
-		return;
+	zb = mempool_alloc(dio_zero_block_pool, GFP_NOIO);
 	INIT_LIST_HEAD(&zb->dio_list);
 	init_waitqueue_head(&zb->wq);
 	zb->zero_block = zero_block;
-- 
1.7.2.3

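For reference, the mempool pattern being relied on above looks roughly like
the following (an illustrative sketch of the API usage with made-up names,
not part of the patch):

/* A small pre-allocated reserve guarantees forward progress: with a gfp
 * mask that allows sleeping, mempool_alloc() waits for an object to be
 * returned to the pool rather than failing. */
#include <linux/errno.h>
#include <linux/mempool.h>
#include <linux/slab.h>

struct example_item {
	int payload;
};

static mempool_t *example_pool;

static int example_pool_init(void)
{
	/* keep at least two objects in reserve */
	example_pool = mempool_create_kmalloc_pool(2,
					sizeof(struct example_item));
	return example_pool ? 0 : -ENOMEM;
}

static void example_pool_use(void)
{
	struct example_item *item;

	item = mempool_alloc(example_pool, GFP_NOIO);
	/* ... use item ... */
	mempool_free(item, example_pool);
}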


* Re: [PATCH 1/3] dio: track and serialise unaligned direct IO
  2010-11-08  7:40 ` [PATCH 1/3] dio: track and " Dave Chinner
@ 2010-11-08 15:28   ` Jeff Moyer
  2010-11-08 22:55     ` Dave Chinner
  0 siblings, 1 reply; 12+ messages in thread
From: Jeff Moyer @ 2010-11-08 15:28 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-fsdevel, linux-kernel

Dave Chinner <david@fromorbit.com> writes:

> +struct dio_zero_block {
> +	struct list_head dio_list;	/* list of io in progress */
> +	sector_t	zero_block;	/* block being zeroed */
> +	struct dio	*dio;		/* owner dio */
> +	wait_queue_head_t wq;		/* New IO block here */
                      New IOs block here, or new IO blocks here?

> +/*
> + * Add a filesystem block to the list of blocks we are tracking.
> + */
> +static void
> +dio_start_zero_block(struct dio *dio, sector_t zero_block)
> +{
> +	struct dio_zero_block *zb;
> +
> +	zb = kmalloc(sizeof(*zb), GFP_NOIO);
> +	if (!zb)
> +		return;
> +	INIT_LIST_HEAD(&zb->dio_list);
> +	init_waitqueue_head(&zb->wq);
> +	zb->zero_block = zero_block;
> +	zb->dio = dio;
> +	atomic_set(&zb->ref, 1);
> +
> +	spin_lock(&dio_zero_block_lock);
> +	list_add(&zb->dio_list, &dio_zero_block_list);
> +	spin_unlock(&dio_zero_block_lock);

What protects from two processes getting here at the same time, and
hence adding the same block to the list?  i_mutex?

> @@ -1210,6 +1343,13 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
>  	 */
>  	memset(dio, 0, offsetof(struct dio, pages));
>  
> +	/*
> +	 * zero_blocks need to initialised to largest value to avoid
                              ^ be


* Re: [PATCH 2/3] dio: scale unaligned IO tracking via multiple lists
  2010-11-08  7:40 ` [PATCH 2/3] dio: scale unaligned IO tracking via multiple lists Dave Chinner
@ 2010-11-08 15:36   ` Jeff Moyer
  2010-11-08 23:12     ` Dave Chinner
  0 siblings, 1 reply; 12+ messages in thread
From: Jeff Moyer @ 2010-11-08 15:36 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-fsdevel, linux-kernel

Dave Chinner <david@fromorbit.com> writes:

> From: Dave Chinner <dchinner@redhat.com>
>
> To avoid concerns that a single list and lock tracking the unaligned
> IOs will not scale appropriately, create multiple lists and locks
> and chose them by hashing the unaligned block being zeroed.
>
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---
>  fs/direct-io.c |   49 ++++++++++++++++++++++++++++++++++++-------------
>  1 files changed, 36 insertions(+), 13 deletions(-)
>
> diff --git a/fs/direct-io.c b/fs/direct-io.c
> index 1a69efd..353ac52 100644
> --- a/fs/direct-io.c
> +++ b/fs/direct-io.c
> @@ -152,8 +152,28 @@ struct dio_zero_block {
>  	atomic_t	ref;		/* reference count */
>  };
>  
> -static DEFINE_SPINLOCK(dio_zero_block_lock);
> -static LIST_HEAD(dio_zero_block_list);
> +#define DIO_ZERO_BLOCK_NR	37LL

I'm always curious to know how these numbers are derived.  Why 37?

Cheers,
Jeff


* Re: [PATCH 3/3] dio: add a mempool for the unaligned block structures
  2010-11-08  7:40 ` [PATCH 3/3] dio: add a mempool for the unaligned block structures Dave Chinner
@ 2010-11-08 15:40   ` Jeff Moyer
  0 siblings, 0 replies; 12+ messages in thread
From: Jeff Moyer @ 2010-11-08 15:40 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-fsdevel, linux-kernel

Dave Chinner <david@fromorbit.com> writes:

> From: Dave Chinner <dchinner@redhat.com>
>
> We need the zero block tracking structure allocation to succeed.
> Silently failing and potentially allowing data corruption is not an
> option. Add a mempool to ensure this allocation will always succeed.
>
> Signed-off-by: Dave Chinner <dchinner@redhat.com>

Acked-by: Jeff Moyer <jmoyer@redhat.com>


* Re: [PATCH 1/3] dio: track and serialise unaligned direct IO
  2010-11-08 15:28   ` Jeff Moyer
@ 2010-11-08 22:55     ` Dave Chinner
  0 siblings, 0 replies; 12+ messages in thread
From: Dave Chinner @ 2010-11-08 22:55 UTC (permalink / raw)
  To: Jeff Moyer; +Cc: linux-fsdevel, linux-kernel

On Mon, Nov 08, 2010 at 10:28:29AM -0500, Jeff Moyer wrote:
> Dave Chinner <david@fromorbit.com> writes:
> 
> > +struct dio_zero_block {
> > +	struct list_head dio_list;	/* list of io in progress */
> > +	sector_t	zero_block;	/* block being zeroed */
> > +	struct dio	*dio;		/* owner dio */
> > +	wait_queue_head_t wq;		/* New IO block here */
>                       New IOs block here, or new IO blocks here?
> 
> > +/*
> > + * Add a filesystem block to the list of blocks we are tracking.
> > + */
> > +static void
> > +dio_start_zero_block(struct dio *dio, sector_t zero_block)
> > +{
> > +	struct dio_zero_block *zb;
> > +
> > +	zb = kmalloc(sizeof(*zb), GFP_NOIO);
> > +	if (!zb)
> > +		return;
> > +	INIT_LIST_HEAD(&zb->dio_list);
> > +	init_waitqueue_head(&zb->wq);
> > +	zb->zero_block = zero_block;
> > +	zb->dio = dio;
> > +	atomic_set(&zb->ref, 1);
> > +
> > +	spin_lock(&dio_zero_block_lock);
> > +	list_add(&zb->dio_list, &dio_zero_block_list);
> > +	spin_unlock(&dio_zero_block_lock);
> 
> What protects from two processes getting here at the same time, and
> hence adding the same block to the list?  i_mutex?

The wait done in dio_zero_block(), before this function is called, is
supposed to serialise them, but now that you point it out, it's not
an atomic wait-and-add so there is a small window where two IOs
could pass through here. Easy enough to fix by combining the search
and insert - I'll restructure it along those lines.

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com


* Re: [PATCH 2/3] dio: scale unaligned IO tracking via multiple lists
  2010-11-08 15:36   ` Jeff Moyer
@ 2010-11-08 23:12     ` Dave Chinner
  2010-11-09 21:04       ` Jeff Moyer
  0 siblings, 1 reply; 12+ messages in thread
From: Dave Chinner @ 2010-11-08 23:12 UTC (permalink / raw)
  To: Jeff Moyer; +Cc: linux-fsdevel, linux-kernel

On Mon, Nov 08, 2010 at 10:36:06AM -0500, Jeff Moyer wrote:
> Dave Chinner <david@fromorbit.com> writes:
> 
> > From: Dave Chinner <dchinner@redhat.com>
> >
> > To avoid concerns that a single list and lock tracking the unaligned
> > IOs will not scale appropriately, create multiple lists and locks
> > and chose them by hashing the unaligned block being zeroed.
> >
> > Signed-off-by: Dave Chinner <dchinner@redhat.com>
> > ---
> >  fs/direct-io.c |   49 ++++++++++++++++++++++++++++++++++++-------------
> >  1 files changed, 36 insertions(+), 13 deletions(-)
> >
> > diff --git a/fs/direct-io.c b/fs/direct-io.c
> > index 1a69efd..353ac52 100644
> > --- a/fs/direct-io.c
> > +++ b/fs/direct-io.c
> > @@ -152,8 +152,28 @@ struct dio_zero_block {
> >  	atomic_t	ref;		/* reference count */
> >  };
> >  
> > -static DEFINE_SPINLOCK(dio_zero_block_lock);
> > -static LIST_HEAD(dio_zero_block_list);
> > +#define DIO_ZERO_BLOCK_NR	37LL
> 
> I'm always curious to know how these numbers are derived.  Why 37?

It's a prime number large enough to give enough lists to minimise
contention whilst providing decent distribution for 8 byte aligned
addresses with low overhead. XFS uses the same sort of waitqueue
hashing for global IO completion wait queues used by truncation
and inode eviction (see xfs_ioend_wait()).

Seemed reasonable (and simple!) just to copy that design pattern
for another global IO completion wait queue....
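
To make that concrete, a throwaway user-space sketch (illustrative only,
nothing to do with the patch itself) of how 8-aligned block numbers spread
over a prime versus a power-of-two bucket count:

/* Same "block % nr_buckets" scheme as the patch, counting how many of the
 * buckets actually get used for keys with a stride of 8. */
#include <stdio.h>

int main(void)
{
	int hits37[37] = { 0 }, hits32[32] = { 0 };
	unsigned long long block;
	int i, used37 = 0, used32 = 0;

	for (block = 0; block < 8 * 4096; block += 8) {
		hits37[block % 37]++;
		hits32[block % 32]++;
	}
	for (i = 0; i < 37; i++)
		used37 += (hits37[i] != 0);
	for (i = 0; i < 32; i++)
		used32 += (hits32[i] != 0);
	printf("buckets used: %d/37 (prime), %d/32 (power of two)\n",
	       used37, used32);
	/* prints 37/37 and 4/32: the prime modulus spreads stride-8 keys
	 * over every list, the power-of-two modulus over only four */
	return 0;
}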

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com


* Re: [PATCH 2/3] dio: scale unaligned IO tracking via multiple lists
  2010-11-08 23:12     ` Dave Chinner
@ 2010-11-09 21:04       ` Jeff Moyer
  2010-11-09 23:06         ` Dave Chinner
  0 siblings, 1 reply; 12+ messages in thread
From: Jeff Moyer @ 2010-11-09 21:04 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-fsdevel, linux-kernel

Dave Chinner <david@fromorbit.com> writes:

> On Mon, Nov 08, 2010 at 10:36:06AM -0500, Jeff Moyer wrote:
>> Dave Chinner <david@fromorbit.com> writes:
>> 
>> > From: Dave Chinner <dchinner@redhat.com>
>> >
>> > To avoid concerns that a single list and lock tracking the unaligned
>> > IOs will not scale appropriately, create multiple lists and locks
>> > and chose them by hashing the unaligned block being zeroed.
>> >
>> > Signed-off-by: Dave Chinner <dchinner@redhat.com>
>> > ---
>> >  fs/direct-io.c |   49 ++++++++++++++++++++++++++++++++++++-------------
>> >  1 files changed, 36 insertions(+), 13 deletions(-)
>> >
>> > diff --git a/fs/direct-io.c b/fs/direct-io.c
>> > index 1a69efd..353ac52 100644
>> > --- a/fs/direct-io.c
>> > +++ b/fs/direct-io.c
>> > @@ -152,8 +152,28 @@ struct dio_zero_block {
>> >  	atomic_t	ref;		/* reference count */
>> >  };
>> >  
>> > -static DEFINE_SPINLOCK(dio_zero_block_lock);
>> > -static LIST_HEAD(dio_zero_block_list);
>> > +#define DIO_ZERO_BLOCK_NR	37LL
>> 
>> I'm always curious to know how these numbers are derived.  Why 37?
>
> It's a prime number large enough to give enough lists to minimise
> contention whilst providing decent distribution for 8 byte aligned
> addresses with low overhead. XFS uses the same sort of waitqueue
> hashing for global IO completion wait queues used by truncation
> and inode eviction (see xfs_ioend_wait()).
>
> Seemed reasonable (and simple!) just to copy that design pattern
> for another global IO completion wait queue....

OK.  I just had our performance team record some statistics for me on an
unmodified kernel during an OLTP-type workload.  I've attached the
systemtap script that I had them run.  I wanted to see just how common
the sub-page-block zeroing was, and I was frightened to find that, in a
10 minute period, over 1.2 million calls were recorded.  If we're
lucky, my script is buggy.  Please give it a look-see.

I'm all ears for next steps.  We can check to see how deep the hash
chains get.  We could also ask the folks at Intel to run this through
their database testing rig to get a quantification of the overhead.

What do you think?

Cheers,
Jeff


#! /usr/bin/env stap
#
# This file is free software. You can redistribute it and/or modify it under 
# the terms of the GNU General Public License (GPL); either version 2, or (at
# your option) any later version.

global zeroes = 0

probe kernel.function("dio_zero_block") {
	BH_New = 1 << 6;

	if ($dio->blkfactor != 0 && !($dio->map_bh->b_state & BH_New)) {
		zeroes++;
	}
}

probe end {
	printf("zeroes performed: %d\n", zeroes);
}



* Re: [PATCH 2/3] dio: scale unaligned IO tracking via multiple lists
  2010-11-09 21:04       ` Jeff Moyer
@ 2010-11-09 23:06         ` Dave Chinner
  2010-11-11 15:32           ` Jeff Moyer
  0 siblings, 1 reply; 12+ messages in thread
From: Dave Chinner @ 2010-11-09 23:06 UTC (permalink / raw)
  To: Jeff Moyer; +Cc: linux-fsdevel, linux-kernel

On Tue, Nov 09, 2010 at 04:04:41PM -0500, Jeff Moyer wrote:
> Dave Chinner <david@fromorbit.com> writes:
> 
> > On Mon, Nov 08, 2010 at 10:36:06AM -0500, Jeff Moyer wrote:
> >> Dave Chinner <david@fromorbit.com> writes:
> >> 
> >> > From: Dave Chinner <dchinner@redhat.com>
> >> >
> >> > To avoid concerns that a single list and lock tracking the unaligned
> >> > IOs will not scale appropriately, create multiple lists and locks
> >> > and chose them by hashing the unaligned block being zeroed.
> >> >
> >> > Signed-off-by: Dave Chinner <dchinner@redhat.com>
> >> > ---
> >> >  fs/direct-io.c |   49 ++++++++++++++++++++++++++++++++++++-------------
> >> >  1 files changed, 36 insertions(+), 13 deletions(-)
> >> >
> >> > diff --git a/fs/direct-io.c b/fs/direct-io.c
> >> > index 1a69efd..353ac52 100644
> >> > --- a/fs/direct-io.c
> >> > +++ b/fs/direct-io.c
> >> > @@ -152,8 +152,28 @@ struct dio_zero_block {
> >> >  	atomic_t	ref;		/* reference count */
> >> >  };
> >> >  
> >> > -static DEFINE_SPINLOCK(dio_zero_block_lock);
> >> > -static LIST_HEAD(dio_zero_block_list);
> >> > +#define DIO_ZERO_BLOCK_NR	37LL
> >> 
> >> I'm always curious to know how these numbers are derived.  Why 37?
> >
> > It's a prime number large enough to give enough lists to minimise
> > contention whilst providing decent distribution for 8 byte aligned
> > addresses with low overhead. XFS uses the same sort of waitqueue
> > hashing for global IO completion wait queues used by truncation
> > and inode eviction (see xfs_ioend_wait()).
> >
> > Seemed reasonable (and simple!) just to copy that design pattern
> > for another global IO completion wait queue....
> 
> OK.  I just had our performance team record some statistics for me on an
> unmodified kernel during an OLTP-type workload.  I've attached the
> systemtap script that I had them run.  I wanted to see just how common
> the sub-page-block zeroing was, and I was frightened to find that, in a
> 10 minute period , over 1.2 million calls were recorded.  If we're
> lucky, my script is buggy.  Please give it a look-see.

Well, it's just checking how many blocks are candidates for zeroing
inside the dio_zero_block() function call. i.e. the function gets
called on every newly allocated block at the start of an IO. Your
result implies that there were 1.2 million IOs requiring allocation
in ten minutes, because the next check in dio_zero_block():

        dio_blocks_per_fs_block = 1 << dio->blkfactor;
        this_chunk_blocks = dio->block_in_file & (dio_blocks_per_fs_block - 1);

        if (!this_chunk_blocks)
                return;

determines if the IO is unaligned and zeroing is really necessary or
not. Your script needs to take this into account, not just count the
number of times the function is called with a new buffer.

> I'm all ears for next steps.  We can check to see how deep the hash
> chains get.  We could also ask the folks at Intel to run this through
> their database testing rig to get a quantification of the overhead.
> 
> What do you think?

Let's run a fixed script first - if databases are really doing so
much unaligned sub-block IO, then they need to be fixed as a matter
of major priority because they are doing far more IO than they need
to be....

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com


* Re: [PATCH 2/3] dio: scale unaligned IO tracking via multiple lists
  2010-11-09 23:06         ` Dave Chinner
@ 2010-11-11 15:32           ` Jeff Moyer
  0 siblings, 0 replies; 12+ messages in thread
From: Jeff Moyer @ 2010-11-11 15:32 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-fsdevel, linux-kernel

Dave Chinner <david@fromorbit.com> writes:

> On Tue, Nov 09, 2010 at 04:04:41PM -0500, Jeff Moyer wrote:
>> Dave Chinner <david@fromorbit.com> writes:
>> 
>> > On Mon, Nov 08, 2010 at 10:36:06AM -0500, Jeff Moyer wrote:
>> >> Dave Chinner <david@fromorbit.com> writes:
>> >> 
>> >> > From: Dave Chinner <dchinner@redhat.com>
>> >> >
>> >> > To avoid concerns that a single list and lock tracking the unaligned
>> >> > IOs will not scale appropriately, create multiple lists and locks
>> >> > and chose them by hashing the unaligned block being zeroed.
>> >> >
>> >> > Signed-off-by: Dave Chinner <dchinner@redhat.com>
>> >> > ---
>> >> >  fs/direct-io.c |   49 ++++++++++++++++++++++++++++++++++++-------------
>> >> >  1 files changed, 36 insertions(+), 13 deletions(-)
>> >> >
>> >> > diff --git a/fs/direct-io.c b/fs/direct-io.c
>> >> > index 1a69efd..353ac52 100644
>> >> > --- a/fs/direct-io.c
>> >> > +++ b/fs/direct-io.c
>> >> > @@ -152,8 +152,28 @@ struct dio_zero_block {
>> >> >  	atomic_t	ref;		/* reference count */
>> >> >  };
>> >> >  
>> >> > -static DEFINE_SPINLOCK(dio_zero_block_lock);
>> >> > -static LIST_HEAD(dio_zero_block_list);
>> >> > +#define DIO_ZERO_BLOCK_NR	37LL
>> >> 
>> >> I'm always curious to know how these numbers are derived.  Why 37?
>> >
>> > It's a prime number large enough to give enough lists to minimise
>> > contention whilst providing decent distribution for 8 byte aligned
>> > addresses with low overhead. XFS uses the same sort of waitqueue
>> > hashing for global IO completion wait queues used by truncation
>> > and inode eviction (see xfs_ioend_wait()).
>> >
>> > Seemed reasonable (and simple!) just to copy that design pattern
>> > for another global IO completion wait queue....
>> 
>> OK.  I just had our performance team record some statistics for me on an
>> unmodified kernel during an OLTP-type workload.  I've attached the
>> systemtap script that I had them run.  I wanted to see just how common
>> the sub-page-block zeroing was, and I was frightened to find that, in a
>> 10 minute period , over 1.2 million calls were recorded.  If we're
>> lucky, my script is buggy.  Please give it a look-see.
>
> Well, it's just checking how many blocks are candidates for zeroing
> inside the dio_zero_block() function call. i.e. the function gets
> called on every newly allocated block at the start of an IO. Your
> result implies that there were 1.2 million IOs requiring allocation
> in ten minutes, because the next check in the dio_zero_block():

It's still surprising to me that the database log wasn't preallocated.
Perhaps they just use fallocate now.

>         dio_blocks_per_fs_block = 1 << dio->blkfactor;
>         this_chunk_blocks = dio->block_in_file & (dio_blocks_per_fs_block - 1);
>
>         if (!this_chunk_blocks)
>                 return;
>
> determines if the IO is unaligned and zeroing is really necessary or
> not. Your script needs to take this into account, not just count the
> number of times the function is called with a new buffer.

Yeah, I can't believe I missed that.  FWIW, I was told that the
database log needs to force out commits of various sizes, so it can't
always issue a fixed-size/aligned I/O.  Anyway, I'll have them re-run
the test with the attached script.  Thanks for pointing out this obvious
stupidity.  ;-)

Dave, can you CC me and akpm on your next patch posting?  The dio
changes typically trickle in through Andrew's tree.

Cheers,
Jeff

#! /usr/bin/env stap
#
# This file is free software. You can redistribute it and/or modify it under 
# the terms of the GNU General Public License (GPL); either version 2, or (at
# your option) any later version.

global zeroes = 0
global start_time = 0

probe kernel.function("dio_zero_block") {
	BH_New = 1 << 6;

	dio_blocks_per_fs_block = 1 << $dio->blkfactor;
	this_chunk_blocks = $dio->block_in_file & (dio_blocks_per_fs_block - 1);

	if ($dio->blkfactor != 0 && !($dio->map_bh->b_state & BH_New) &&
	    this_chunk_blocks != 0) {
		zeroes++;
	}
}

probe begin {
	start_time=gettimeofday_s();
}
probe end {
	printf("%d zeroes performed in %d seconds\n", zeroes, gettimeofday_s() - start_time);
}


