* [PATCH] dio: track and serialise unaligned direct IO
@ 2010-07-29 22:45 ` Dave Chinner
  0 siblings, 0 replies; 28+ messages in thread
From: Dave Chinner @ 2010-07-29 22:45 UTC (permalink / raw)
  To: linux-fsdevel; +Cc: xfs, sandeen

From: Dave Chinner <dchinner@redhat.com>

If we get two unaligned direct IOs to the same filesystem block
that is marked as a new allocation (i.e. buffer_new), then both IOs will
zero the portion of the block they are not writing data to. As a
result, when the IOs complete there will be a portion of the block
that contains zeros from the last IO to complete rather than the
data that should be there.

This is easily manifested by qemu using aio+dio with an unaligned
guest filesystem - every IO is unaligned and filesystem corruption is
encountered in the guest filesystem. xfstest 240 (from Eric Sandeen)
is also a simple reproducer.

To avoid this problem, track unaligned IOs that trigger sub-block zeroing and
check new incoming unaligned IOs that require sub-block zeroing against that
list. If we get an overlap where the start and end of unaligned IOs hit the
same filesystem block, then we need to block the incoming IOs until the IO that
is zeroing the block completes. The blocked IO can then continue without
needing to do any zeroing and hence won't overwrite valid data with zeros.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/direct-io.c |  152 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 files changed, 146 insertions(+), 6 deletions(-)

diff --git a/fs/direct-io.c b/fs/direct-io.c
index a10cb91..611524e 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -71,6 +71,9 @@ struct dio {
 	unsigned start_zero_done;	/* flag: sub-blocksize zeroing has
 					   been performed at the start of a
 					   write */
+#define LAST_SECTOR ((sector_t)-1LL)
+	sector_t zero_block_front;	/* fs block we are zeroing at front */
+	sector_t zero_block_rear;	/* fs block we are zeroing at rear */
 	int pages_in_io;		/* approximate total IO pages */
 	size_t	size;			/* total request size (doesn't change)*/
 	sector_t block_in_file;		/* Current offset into the underlying
@@ -135,6 +138,101 @@ struct dio {
 	struct page *pages[DIO_PAGES];	/* page buffer */
 };
 
+
+/*
+ * Record the fs blocks we are zeroing in a zero block list.
+ * Unaligned IO is not very performant and so is relatively uncommon,
+ * so a global list should be sufficient to track them.
+ */
+struct dio_zero_block {
+	struct list_head dio_list;	/* list of io in progress */
+	sector_t	zero_block;	/* block being zeroed */
+	struct dio	*dio;		/* owner dio */
+	wait_queue_head_t wq;		/* New IO block here */
+	atomic_t	ref;		/* reference count */
+};
+
+DEFINE_SPINLOCK(dio_zero_block_lock);
+LIST_HEAD(dio_zero_block_list);
+
+/*
+ * Add a filesystem block to the list of blocks we are tracking.
+ */
+static void
+dio_start_zero_block(struct dio *dio, sector_t zero_block)
+{
+	struct dio_zero_block *zb;
+
+	zb = kmalloc(sizeof(*zb), GFP_NOIO);
+	if (!zb)
+		return;
+	INIT_LIST_HEAD(&zb->dio_list);
+	init_waitqueue_head(&zb->wq);
+	zb->zero_block = zero_block;
+	zb->dio = dio;
+	atomic_set(&zb->ref, 1);
+
+	spin_lock(&dio_zero_block_lock);
+	list_add(&zb->dio_list, &dio_zero_block_list);
+	spin_unlock(&dio_zero_block_lock);
+}
+
+static void
+dio_drop_zero_block(struct dio_zero_block *zb)
+{
+	if (atomic_dec_and_test(&zb->ref))
+		kfree(zb);
+}
+
+/*
+ * Check whether a filesystem block is currently being zeroed, and if it is
+ * wait for it to complete before returning. If we waited for a block being
+ * zeroed, return 1 to indicate that the block is already initialised,
+ * otherwise return 0 to indicate that it needs zeroing.
+ */
+static int
+dio_wait_zero_block(struct dio *dio, sector_t zero_block)
+{
+	struct dio_zero_block *zb;
+
+	spin_lock(&dio_zero_block_lock);
+	list_for_each_entry(zb, &dio_zero_block_list, dio_list) {
+		if (zb->dio->inode != dio->inode)
+			continue;
+		if (zb->zero_block != zero_block)
+			continue;
+		atomic_inc(&zb->ref);
+		spin_unlock(&dio_zero_block_lock);
+		wait_event(zb->wq, (list_empty(&zb->dio_list)));
+		dio_drop_zero_block(zb);
+		return 1;
+	}
+	spin_unlock(&dio_zero_block_lock);
+	return 0;
+}
+
+/*
+ * Complete a block zeroing and wake up anyone waiting for it.
+ */
+static void dio_end_zero_block(struct dio *dio, sector_t zero_block)
+{
+	struct dio_zero_block *zb;
+
+	spin_lock(&dio_zero_block_lock);
+	list_for_each_entry(zb, &dio_zero_block_list, dio_list) {
+		if (zb->dio->inode != dio->inode)
+			continue;
+		if (zb->zero_block != zero_block)
+			continue;
+		list_del_init(&zb->dio_list);
+		spin_unlock(&dio_zero_block_lock);
+		wake_up(&zb->wq);
+		dio_drop_zero_block(zb);
+		return;
+	}
+	spin_unlock(&dio_zero_block_lock);
+}
+
 /*
  * How many pages are in the queue?
  */
@@ -253,6 +351,11 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret, bool is_async)
 		aio_complete(dio->iocb, ret, 0);
 	}
 
+	if (dio->zero_block_front != LAST_SECTOR)
+		dio_end_zero_block(dio, dio->zero_block_front);
+	if (dio->zero_block_rear != LAST_SECTOR)
+		dio_end_zero_block(dio, dio->zero_block_rear);
+
 	if (dio->flags & DIO_LOCKING)
 		/* lockdep: non-owner release */
 		up_read_non_owner(&dio->inode->i_alloc_sem);
@@ -777,6 +880,12 @@ static void clean_blockdev_aliases(struct dio *dio)
  * block with zeros. This happens only if user-buffer, fileoffset or
  * io length is not filesystem block-size multiple.
  *
+ * We need to track the blocks we are zeroing. If we have concurrent IOs that hit
+ * the same start or end block, we do not want all the IOs to zero the portion
+ * they are not writing data to as that will overwrite data from the other IOs.
+ * Hence we need to block until the first unaligned IO completes before we can
+ * continue (without executing any zeroing).
+ *
  * `end' is zero if we're doing the start of the IO, 1 at the end of the
  * IO.
  */
@@ -784,8 +893,8 @@ static void dio_zero_block(struct dio *dio, int end)
 {
 	unsigned dio_blocks_per_fs_block;
 	unsigned this_chunk_blocks;	/* In dio_blocks */
-	unsigned this_chunk_bytes;
 	struct page *page;
+	sector_t fsblock;
 
 	dio->start_zero_done = 1;
 	if (!dio->blkfactor || !buffer_new(&dio->map_bh))
@@ -797,17 +906,41 @@ static void dio_zero_block(struct dio *dio, int end)
 	if (!this_chunk_blocks)
 		return;
 
+	if (end)
+		this_chunk_blocks = dio_blocks_per_fs_block - this_chunk_blocks;
+
 	/*
 	 * We need to zero out part of an fs block.  It is either at the
-	 * beginning or the end of the fs block.
+	 * beginning or the end of the fs block, but first we need to check if
+	 * there is already a zeroing being run on this block.
+	 *
+	 * If we are doing a sub-block IO (i.e. zeroing both front and rear of
+	 * the same block) we don't need to wait or set a guard for the rear of
+	 * the block as we already have one set.
 	 */
-	if (end) 
-		this_chunk_blocks = dio_blocks_per_fs_block - this_chunk_blocks;
+	fsblock = dio->block_in_file >> dio->blkfactor;
+	if (!end || dio->zero_block_front != fsblock) {
 
-	this_chunk_bytes = this_chunk_blocks << dio->blkbits;
+		/* wait for any zeroing already in progress */
+		if (dio_wait_zero_block(dio, fsblock)) {
+			/* skip the range we would have zeroed. */
+			dio->next_block_for_io += this_chunk_blocks;
+			return;
+		}
+
+		/*
+		 * we are going to zero stuff now, so set a guard to catch
+		 * others that might want to zero the same block.
+		 */
+		dio_start_zero_block(dio, fsblock);
+		if (end)
+			dio->zero_block_rear = fsblock;
+		else
+			dio->zero_block_front = fsblock;
+	}
 
 	page = ZERO_PAGE(0);
-	if (submit_page_section(dio, page, 0, this_chunk_bytes, 
+	if (submit_page_section(dio, page, 0, this_chunk_blocks << dio->blkbits,
 				dio->next_block_for_io))
 		return;
 
@@ -1191,6 +1324,13 @@ __blockdev_direct_IO_newtrunc(int rw, struct kiocb *iocb, struct inode *inode,
 	 */
 	memset(dio, 0, offsetof(struct dio, pages));
 
+	/*
+	 * zero_blocks need to be initialised to the largest value to avoid
+	 * matching the zero block accidentally.
+	 */
+	dio->zero_block_front = LAST_SECTOR;
+	dio->zero_block_rear = LAST_SECTOR;
+
 	dio->flags = flags;
 	if (dio->flags & DIO_LOCKING) {
 		/* watch out for a 0 len io from a tricksy fs */
-- 
1.7.1
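
The failure mode described in the commit message can be seen in miniature with a
small userspace sketch. This is a hypothetical illustration only - it is not
xfstest 240 and not kernel code - but it mimics the interleaving of two unaligned
writers into one newly allocated 4096 byte block, each zeroing the half it does
not own:

/*
 * Hypothetical illustration of the sub-block zeroing race.  Two "unaligned
 * direct IOs" land in the same new 4096 byte filesystem block: IO A writes
 * bytes 0-2047, IO B writes bytes 2048-4095.  Each zeroes the half it is not
 * writing, as dio_zero_block() does for buffer_new blocks.  If B's zeroing
 * runs after A's data has landed, A's data is destroyed.
 */
#include <stdio.h>
#include <string.h>

#define FS_BLOCK 4096

static unsigned char block[FS_BLOCK];   /* the newly allocated fs block */

static void io_write(size_t off, size_t len, unsigned char fill)
{
        memset(block + off, fill, len); /* the data this IO carries */
}

static void io_zero_other_half(size_t off, size_t len)
{
        /* zero everything in the block this IO does not cover */
        memset(block, 0, off);
        memset(block + off + len, 0, FS_BLOCK - off - len);
}

int main(void)
{
        /* IO A: zero its unwritten half, then write its data */
        io_zero_other_half(0, FS_BLOCK / 2);
        io_write(0, FS_BLOCK / 2, 0xAA);

        /* IO B: its zeroing pass runs after A's data has already landed */
        io_zero_other_half(FS_BLOCK / 2, FS_BLOCK / 2);
        io_write(FS_BLOCK / 2, FS_BLOCK / 2, 0xBB);

        printf("byte 0    = 0x%02x (expected 0xaa)\n", block[0]);
        printf("byte %d = 0x%02x (expected 0xbb)\n", FS_BLOCK - 1,
               block[FS_BLOCK - 1]);
        return 0;
}

Running it prints byte 0 as 0x00 rather than 0xaa: the second IO's zeroing pass
has overwritten the first IO's data, which is exactly the corruption the patch
prevents.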
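To see how the three helpers the patch adds fit together without reading the
whole diff, here is a minimal userspace model of the same serialisation scheme.
It is a sketch under stated assumptions, not the kernel implementation: a
pthread mutex and condition variable stand in for dio_zero_block_lock and the
per-entry wait queue, plain integers stand in for the inode and fs block, and
the names start_zero_block/wait_zero_block/end_zero_block mirror - but are
not - the functions in the patch:

/*
 * Userspace model of the zero-block tracking the patch adds to fs/direct-io.c.
 * The first unaligned IO to touch a block registers it and does the zeroing;
 * later IOs that hit the same block wait for it to finish, then skip their
 * own zeroing.
 */
#include <pthread.h>
#include <stdlib.h>

struct zero_entry {
        struct zero_entry *next;
        long    inode;                  /* stand-in for the struct inode */
        long    block;                  /* fs block being zeroed */
        int     done;                   /* zeroing IO has completed */
};

static struct zero_entry *zero_list;
static pthread_mutex_t zero_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t zero_cond = PTHREAD_COND_INITIALIZER;

/* analogue of dio_start_zero_block(): record that we are zeroing this block */
static struct zero_entry *start_zero_block(long inode, long block)
{
        struct zero_entry *ze = calloc(1, sizeof(*ze));

        if (!ze)                /* the patch simply skips tracking on ENOMEM */
                return NULL;
        ze->inode = inode;
        ze->block = block;
        pthread_mutex_lock(&zero_lock);
        ze->next = zero_list;
        zero_list = ze;
        pthread_mutex_unlock(&zero_lock);
        return ze;
}

/*
 * analogue of dio_wait_zero_block(): returns 1 if another IO zeroed the block
 * and we waited for it, 0 if the caller must zero the block itself.
 */
static int wait_zero_block(long inode, long block)
{
        struct zero_entry *ze;

        pthread_mutex_lock(&zero_lock);
        for (ze = zero_list; ze; ze = ze->next) {
                if (ze->inode != inode || ze->block != block)
                        continue;
                while (!ze->done)
                        pthread_cond_wait(&zero_cond, &zero_lock);
                pthread_mutex_unlock(&zero_lock);
                return 1;       /* block already initialised, skip zeroing */
        }
        pthread_mutex_unlock(&zero_lock);
        return 0;               /* caller zeroes the block and registers it */
}

/* analogue of dio_end_zero_block(): zeroing finished, wake any waiters */
static void end_zero_block(struct zero_entry *ze)
{
        struct zero_entry **pp;

        pthread_mutex_lock(&zero_lock);
        for (pp = &zero_list; *pp; pp = &(*pp)->next) {
                if (*pp == ze) {
                        *pp = ze->next;
                        break;
                }
        }
        ze->done = 1;
        pthread_cond_broadcast(&zero_cond);
        pthread_mutex_unlock(&zero_lock);
        /*
         * The patch refcounts the entry and frees it on the last put; this
         * model leaks it so waiters can still read ze->done safely.
         */
}

A writer that gets 0 back from wait_zero_block() zeroes the sub-block range,
registers the block with start_zero_block() and calls end_zero_block() from its
completion path, mirroring the dio_zero_block()/dio_complete() hooks in the
patch; a writer that gets 1 back skips the zeroing entirely, which is what stops
it overwriting the first writer's data.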


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [PATCH] dio: track and serialise unaligned direct IO
  2010-07-29 22:45 ` Dave Chinner
@ 2010-07-30  2:53   ` Matthew Wilcox
  -1 siblings, 0 replies; 28+ messages in thread
From: Matthew Wilcox @ 2010-07-30  2:53 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-fsdevel, xfs, sandeen

On Fri, Jul 30, 2010 at 08:45:16AM +1000, Dave Chinner wrote:
> If we get two unaligned direct IO's to the same filesystem block
> that is marked as a new allocation (i.e. buffer_new), then both IOs will
> zero the portion of the block they are not writing data to. As a
> result, when the IOs complete there will be a portion of the block
> that contains zeros from the last IO to complete rather than the
> data that should be there.

Urgh.  Yuck.

> This is easily manifested by qemu using aio+dio with an unaligned
> guest filesystem - every IO is unaligned and fileystem corruption is
> encountered in the guest filesystem. xfstest 240 (from Eric Sandeen)
> is also a simple reproducer.
> 
> To avoid this problem, track unaligned IO that triggers sub-block zeroing and
> check new incoming unaligned IO that require sub-block zeroing against that
> list. If we get an overlap where the start and end of unaligned IOs hit the
> same filesystem block, then we need to block the incoming IOs until the IO that
> is zeroing the block completes. The blocked IO can then continue without
> needing to do any zeroing and hence won't overwrite valid data with zeros.

Urgh.  Yuck.

Could we perhaps handle this by making an IO instantiate a page cache
page for partial writes, and forcing that portion of the IO through the
page cache?  The second IO would hit the same page and use the existing
O_DIRECT vs page cache paths.

-- 
Matthew Wilcox				Intel Open Source Technology Centre
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours.  We can't possibly take such
a retrograde step."

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] dio: track and serialise unaligned direct IO
  2010-07-30  2:53   ` Matthew Wilcox
@ 2010-07-30  4:53     ` Dave Chinner
  -1 siblings, 0 replies; 28+ messages in thread
From: Dave Chinner @ 2010-07-30  4:53 UTC (permalink / raw)
  To: Matthew Wilcox; +Cc: linux-fsdevel, xfs, sandeen

On Thu, Jul 29, 2010 at 08:53:24PM -0600, Matthew Wilcox wrote:
> On Fri, Jul 30, 2010 at 08:45:16AM +1000, Dave Chinner wrote:
> > If we get two unaligned direct IO's to the same filesystem block
> > that is marked as a new allocation (i.e. buffer_new), then both IOs will
> > zero the portion of the block they are not writing data to. As a
> > result, when the IOs complete there will be a portion of the block
> > that contains zeros from the last IO to complete rather than the
> > data that should be there.
> 
> Urgh.  Yuck.
> 
> > This is easily manifested by qemu using aio+dio with an unaligned
> > guest filesystem - every IO is unaligned and fileystem corruption is
> > encountered in the guest filesystem. xfstest 240 (from Eric Sandeen)
> > is also a simple reproducer.
> > 
> > To avoid this problem, track unaligned IO that triggers sub-block zeroing and
> > check new incoming unaligned IO that require sub-block zeroing against that
> > list. If we get an overlap where the start and end of unaligned IOs hit the
> > same filesystem block, then we need to block the incoming IOs until the IO that
> > is zeroing the block completes. The blocked IO can then continue without
> > needing to do any zeroing and hence won't overwrite valid data with zeros.
> 
> Urgh.  Yuck.

It's better than silent data corruption.

> Could we perhaps handle this by making an IO instantiate a page cache
> page for partial writes, and forcing that portion of the IO through the
> page cache?  The second IO would hit the same page and use the existing
> O_DIRECT vs page cache paths.

I don't want any direct IO for XFS to go through the page cache -
unaligned or not. Using the page cache for the unaligned blocks
would also be much worse for performance than this method because it
turns unaligned direct IO into 3 IOs - the unaligned head block, the
aligned body and the unaligned tail block. It would also be a
performance hit you take on every single dio, whereas this way the
hit is only taken when an overlap is detected.

And besides, such decisions on whether to use buffered IO have to be
made high up in the filesystem when we are deciding how to lock the
inode for the dio - buffered IO requires exclusive locking, not the
shared locking we do for dio writes. That further reduces the
performance of unaligned direct IO even when there are no overlaps,
and it's a solution that every filesystem needs to implement
itself in some way.

I've looked at a couple of different ways to fix this (e.g. passing
the unaligned state to get_blocks() to allow the filesystem to
serialise there) but they've all died a horrible death of dodgy
locking and hacky IO completion detection, not to mention
requiring a different solution in every filesystem.

This way may be a bit ugly, but it works, is isolated and seems to
me to be the least-worst way of solving the problem. It could be
made to scale better by making the tracking per-inode, but I don't
think that growing the struct inode for this corner case is a good
tradeoff, either...

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] dio: track and serialise unaligned direct IO
  2010-07-29 22:45 ` Dave Chinner
@ 2010-07-30 17:43   ` Badari Pulavarty
  -1 siblings, 0 replies; 28+ messages in thread
From: Badari Pulavarty @ 2010-07-30 17:43 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-fsdevel, xfs, sandeen

On Fri, 2010-07-30 at 08:45 +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> If we get two unaligned direct IO's to the same filesystem block
> that is marked as a new allocation (i.e. buffer_new), then both IOs will
> zero the portion of the block they are not writing data to. As a
> result, when the IOs complete there will be a portion of the block
> that contains zeros from the last IO to complete rather than the
> data that should be there.
> 
> This is easily manifested by qemu using aio+dio with an unaligned
> guest filesystem - every IO is unaligned and fileystem corruption is
> encountered in the guest filesystem. xfstest 240 (from Eric Sandeen)
> is also a simple reproducer.
> 
> To avoid this problem, track unaligned IO that triggers sub-block zeroing and
> check new incoming unaligned IO that require sub-block zeroing against that
> list. If we get an overlap where the start and end of unaligned IOs hit the
> same filesystem block, then we need to block the incoming IOs until the IO that
> is zeroing the block completes. The blocked IO can then continue without
> needing to do any zeroing and hence won't overwrite valid data with zeros.
> 
> Signed-off-by: Dave Chinner <dchinner@redhat.com>

I can confirm that it fixes corruption of my VM images when using AIO
+DIO (cache=none,aio=native). I haven't reviewed the patch closely, but

1) Can we do this only for the AIO+DIO combination? For regular DIO, we
should have all the IOs serialized by i_mutex anyway..

2) Having a single global list (for all devices) might cause scaling
issues.

3) Are you dropping i_mutex when you are waiting for the zero-out to
finish?


Thanks,
Badari
> ---
>  fs/direct-io.c |  152 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
>  1 files changed, 146 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/direct-io.c b/fs/direct-io.c
> index a10cb91..611524e 100644
> --- a/fs/direct-io.c
> +++ b/fs/direct-io.c
> @@ -71,6 +71,9 @@ struct dio {
>  	unsigned start_zero_done;	/* flag: sub-blocksize zeroing has
>  					   been performed at the start of a
>  					   write */
> +#define LAST_SECTOR ((sector_t)-1LL)
> +	sector_t zero_block_front;	/* fs block we are zeroing at front */
> +	sector_t zero_block_rear;	/* fs block we are zeroing at rear */
>  	int pages_in_io;		/* approximate total IO pages */
>  	size_t	size;			/* total request size (doesn't change)*/
>  	sector_t block_in_file;		/* Current offset into the underlying
> @@ -135,6 +138,101 @@ struct dio {
>  	struct page *pages[DIO_PAGES];	/* page buffer */
>  };
>  
> +
> +/*
> + * record fs blocks we are doing zeroing on in a zero block list.
> + * unaligned IO is not very performant and so is relatively uncommon,
> + * so a global list should be sufficent to track them.
> + */
> +struct dio_zero_block {
> +	struct list_head dio_list;	/* list of io in progress */
> +	sector_t	zero_block;	/* block being zeroed */
> +	struct dio	*dio;		/* owner dio */
> +	wait_queue_head_t wq;		/* New IO block here */
> +	atomic_t	ref;		/* reference count */
> +};
> +
> +DEFINE_SPINLOCK(dio_zero_block_lock);
> +LIST_HEAD(dio_zero_block_list);
> +
> +/*
> + * Add a filesystem block to the list of blocks we are tracking.
> + */
> +static void
> +dio_start_zero_block(struct dio *dio, sector_t zero_block)
> +{
> +	struct dio_zero_block *zb;
> +
> +	zb = kmalloc(sizeof(*zb), GFP_NOIO);
> +	if (!zb)
> +		return;
> +	INIT_LIST_HEAD(&zb->dio_list);
> +	init_waitqueue_head(&zb->wq);
> +	zb->zero_block = zero_block;
> +	zb->dio = dio;
> +	atomic_set(&zb->ref, 1);
> +
> +	spin_lock(&dio_zero_block_lock);
> +	list_add(&zb->dio_list, &dio_zero_block_list);
> +	spin_unlock(&dio_zero_block_lock);
> +}
> +
> +static void
> +dio_drop_zero_block(struct dio_zero_block *zb)
> +{
> +	if (atomic_dec_and_test(&zb->ref))
> +		kfree(zb);
> +}
> +
> +/*
> + * Check whether a filesystem block is currently being zeroed, and if it is
> + * wait for it to complete before returning. If we waited for a block being
> + * zeroed, return 1 to indicate that the block is already initialised,
> + * otherwise return 0 to indicate that it needs zeroing.
> + */
> +static int
> +dio_wait_zero_block(struct dio *dio, sector_t zero_block)
> +{
> +	struct dio_zero_block *zb;
> +
> +	spin_lock(&dio_zero_block_lock);
> +	list_for_each_entry(zb, &dio_zero_block_list, dio_list) {
> +		if (zb->dio->inode != dio->inode)
> +			continue;
> +		if (zb->zero_block != zero_block)
> +			continue;
> +		atomic_inc(&zb->ref);
> +		spin_unlock(&dio_zero_block_lock);
> +		wait_event(zb->wq, (list_empty(&zb->dio_list)));
> +		dio_drop_zero_block(zb);
> +		return 1;
> +	}
> +	spin_unlock(&dio_zero_block_lock);
> +	return 0;
> +}
> +
> +/*
> + * Complete a block zeroing and wake up anyone waiting for it.
> + */
> +static void dio_end_zero_block(struct dio *dio, sector_t zero_block)
> +{
> +	struct dio_zero_block *zb;
> +
> +	spin_lock(&dio_zero_block_lock);
> +	list_for_each_entry(zb, &dio_zero_block_list, dio_list) {
> +		if (zb->dio->inode != dio->inode)
> +			continue;
> +		if (zb->zero_block != zero_block)
> +			continue;
> +		list_del_init(&zb->dio_list);
> +		spin_unlock(&dio_zero_block_lock);
> +		wake_up(&zb->wq);
> +		dio_drop_zero_block(zb);
> +		return;
> +	}
> +	spin_unlock(&dio_zero_block_lock);
> +}
> +
>  /*
>   * How many pages are in the queue?
>   */
> @@ -253,6 +351,11 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret, bool is_async)
>  		aio_complete(dio->iocb, ret, 0);
>  	}
>  
> +	if (dio->zero_block_front != LAST_SECTOR)
> +		dio_end_zero_block(dio, dio->zero_block_front);
> +	if (dio->zero_block_rear != LAST_SECTOR)
> +		dio_end_zero_block(dio, dio->zero_block_rear);
> +
>  	if (dio->flags & DIO_LOCKING)
>  		/* lockdep: non-owner release */
>  		up_read_non_owner(&dio->inode->i_alloc_sem);
> @@ -777,6 +880,12 @@ static void clean_blockdev_aliases(struct dio *dio)
>   * block with zeros. This happens only if user-buffer, fileoffset or
>   * io length is not filesystem block-size multiple.
>   *
> + * We need to track the blocks we are zeroing. If we have concurrent IOs that hit
> + * the same start or end block, we do not want all the IOs to zero the portion
> + * they are not writing data to as that will overwrite data from the other IOs.
> + * Hence we need to block until the first unaligned IO completes before we can
> + * continue (without executing any zeroing).
> + *
>   * `end' is zero if we're doing the start of the IO, 1 at the end of the
>   * IO.
>   */
> @@ -784,8 +893,8 @@ static void dio_zero_block(struct dio *dio, int end)
>  {
>  	unsigned dio_blocks_per_fs_block;
>  	unsigned this_chunk_blocks;	/* In dio_blocks */
> -	unsigned this_chunk_bytes;
>  	struct page *page;
> +	sector_t fsblock;
>  
>  	dio->start_zero_done = 1;
>  	if (!dio->blkfactor || !buffer_new(&dio->map_bh))
> @@ -797,17 +906,41 @@ static void dio_zero_block(struct dio *dio, int end)
>  	if (!this_chunk_blocks)
>  		return;
>  
> +	if (end)
> +		this_chunk_blocks = dio_blocks_per_fs_block - this_chunk_blocks;
> +
>  	/*
>  	 * We need to zero out part of an fs block.  It is either at the
> -	 * beginning or the end of the fs block.
> +	 * beginning or the end of the fs block, but first we need to check if
> +	 * there is already a zeroing being run on this block.
> +	 *
> +	 * If we are doing a sub-block IO (i.e. zeroing both front and rear of
> +	 * the same block) we don't need to wait or set a gaurd for the rear of
> +	 * the block as we already have one set.
>  	 */
> -	if (end) 
> -		this_chunk_blocks = dio_blocks_per_fs_block - this_chunk_blocks;
> +	fsblock = dio->block_in_file >> dio->blkfactor;
> +	if (!end || dio->zero_block_front != fsblock) {
>  
> -	this_chunk_bytes = this_chunk_blocks << dio->blkbits;
> +		/* wait for any zeroing already in progress */
> +		if (dio_wait_zero_block(dio, fsblock)) {
> +			/* skip the range we would have zeroed. */
> +			dio->next_block_for_io += this_chunk_blocks;
> +			return;
> +		}
> +
> +		/*
> +		 * we are going to zero stuff now, so set a guard to catch
> +		 * others that might want to zero the same block.
> +		 */
> +		dio_start_zero_block(dio, fsblock);
> +		if (end)
> +			dio->zero_block_rear = fsblock;
> +		else
> +			dio->zero_block_front = fsblock;
> +	}
>  
>  	page = ZERO_PAGE(0);
> -	if (submit_page_section(dio, page, 0, this_chunk_bytes, 
> +	if (submit_page_section(dio, page, 0, this_chunk_blocks << dio->blkbits,
>  				dio->next_block_for_io))
>  		return;
>  
> @@ -1191,6 +1324,13 @@ __blockdev_direct_IO_newtrunc(int rw, struct kiocb *iocb, struct inode *inode,
>  	 */
>  	memset(dio, 0, offsetof(struct dio, pages));
>  
> +	/*
> +	 * zero_blocks need to be initialised to the largest value to avoid
> +	 * matching the zero block accidentally.
> +	 */
> +	dio->zero_block_front = LAST_SECTOR;
> +	dio->zero_block_rear = LAST_SECTOR;
> +
>  	dio->flags = flags;
>  	if (dio->flags & DIO_LOCKING) {
>  		/* watch out for a 0 len io from a tricksy fs */
> -- 
> 1.7.1
> 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] dio: track and serialise unaligned direct IO
  2010-07-30 17:43   ` Badari Pulavarty
@ 2010-07-30 23:13     ` Dave Chinner
  -1 siblings, 0 replies; 28+ messages in thread
From: Dave Chinner @ 2010-07-30 23:13 UTC (permalink / raw)
  To: Badari Pulavarty; +Cc: linux-fsdevel, xfs, sandeen

On Fri, Jul 30, 2010 at 10:43:09AM -0700, Badari Pulavarty wrote:
> On Fri, 2010-07-30 at 08:45 +1000, Dave Chinner wrote:
> > From: Dave Chinner <dchinner@redhat.com>
> > 
> > If we get two unaligned direct IO's to the same filesystem block
> > that is marked as a new allocation (i.e. buffer_new), then both IOs will
> > zero the portion of the block they are not writing data to. As a
> > result, when the IOs complete there will be a portion of the block
> > that contains zeros from the last IO to complete rather than the
> > data that should be there.
> > 
> > This is easily manifested by qemu using aio+dio with an unaligned
> > guest filesystem - every IO is unaligned and fileystem corruption is
> > encountered in the guest filesystem. xfstest 240 (from Eric Sandeen)
> > is also a simple reproducer.
> > 
> > To avoid this problem, track unaligned IO that triggers sub-block zeroing and
> > check new incoming unaligned IO that require sub-block zeroing against that
> > list. If we get an overlap where the start and end of unaligned IOs hit the
> > same filesystem block, then we need to block the incoming IOs until the IO that
> > is zeroing the block completes. The blocked IO can then continue without
> > needing to do any zeroing and hence won't overwrite valid data with zeros.
> > 
> > Signed-off-by: Dave Chinner <dchinner@redhat.com>
> 
> I can confirm that, it  fixes corruption  of my VM images when using AIO
> +DIO. (cache=none,aio=native). I haven't reviewed the patch closely, but
> 
> 1) can we do this only for AIO+DIO combination ? For regular DIO, we
> should have all the IOs serialized by i_mutex anyway..

Not for filesystems that do their own locking. In most cases XFS
does not take the i_mutex during DIO writes, and when it does it
drops it long before we call into the generic direct IO code that
does the sub-block zeroing. So the i_mutex does not guarantee any
form of serialisation in direct IO writes at all.

> 2) Having a single global list (for all devices) might cause scaling
> issues.

Unaligned direct IO is undesirable in the first place. While we
should behave correctly in this corner case, I don't see any need
for it to be particularly efficient as the real fix for performance
problems with unaligned DIO is to not issue it in the first place.

> 3) Are you dropping i_mutex when you are waiting for the zero-out to
> finish ?

For XFS we're not holding the i_mutex - and we can't take the
i_mutex either as that will cause lock inversion issues. We don't
know what locks are held, we don't know whether it is safe to drop
and take locks, we don't even have the context to operate on
filesystem specific locks to avoid ordering problems. If we can't
sleep with the locks we already have held at this point, the DIO is
already broken for that filesystem.

Besides, if the i_mutex is already held for some filesystem when we
zero blocks, then we can't very well have concurrent block zeroing
in progress, and therefore can't hit this bug, right?

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] dio: track and serialise unaligned direct IO
  2010-07-30  4:53     ` Dave Chinner
@ 2010-08-03 17:34       ` Mingming Cao
  -1 siblings, 0 replies; 28+ messages in thread
From: Mingming Cao @ 2010-08-03 17:34 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Matthew Wilcox, linux-fsdevel, xfs, sandeen

On Fri, 2010-07-30 at 14:53 +1000, Dave Chinner wrote:
> On Thu, Jul 29, 2010 at 08:53:24PM -0600, Matthew Wilcox wrote:
> > On Fri, Jul 30, 2010 at 08:45:16AM +1000, Dave Chinner wrote:
> > > If we get two unaligned direct IO's to the same filesystem block
> > > that is marked as a new allocation (i.e. buffer_new), then both IOs will
> > > zero the portion of the block they are not writing data to. As a
> > > result, when the IOs complete there will be a portion of the block
> > > that contains zeros from the last IO to complete rather than the
> > > data that should be there.
> > 
> > Urgh.  Yuck.
> > 
> > > This is easily manifested by qemu using aio+dio with an unaligned
> > > guest filesystem - every IO is unaligned and fileystem corruption is
> > > encountered in the guest filesystem. xfstest 240 (from Eric Sandeen)
> > > is also a simple reproducer.
> > > 
> > > To avoid this problem, track unaligned IO that triggers sub-block zeroing and
> > > check new incoming unaligned IO that require sub-block zeroing against that
> > > list. If we get an overlap where the start and end of unaligned IOs hit the
> > > same filesystem block, then we need to block the incoming IOs until the IO that
> > > is zeroing the block completes. The blocked IO can then continue without
> > > needing to do any zeroing and hence won't overwrite valid data with zeros.
> > 
> > Urgh.  Yuck.
> 
> It's better than silent data corruption.
> 
> > Could we perhaps handle this by making an IO instantiate a page cache
> > page for partial writes, and forcing that portion of the IO through the
> > page cache?  The second IO would hit the same page and use the existing
> > O_DIRECT vs page cache paths.
> 
> I don't want any direct IO for XFS to go through the page cache -
> unaligned or not. using the page cache for the unaligned blocks
> would also be much worse for performance that this method because it
> turns unaligned direct IO into 3 IOs - the unaligned head block, the
> aligned body and the unaligned tail block. It would also be a
> performance hit you take on every single dio, whereas this way the
> hit is only taken when an overlap is detected.
> 

Is this problem also possible for the DIO, non-AIO case? (For ext4 it
only happens with AIO+DIO+unaligned.) If not, could we simply force
unaligned AIO+DIO to be synchronous? That would still be direct IO...


> And besides, such decisions on whether to use buffered IO have to be
> made high up in the filesystem when we are deciding how to lock the
> inode for the dio - buffered IO requires exclusive locking, not the
> shared locking we do for dio writes. That further reduces the
> performance of unaligned direct IO even when there are no overlaps,
> and it's a solution that every filesystem needs to implement
> themselves in some way.
> 
> I've looked at a couple of different ways to fix this (e.g. passing
> the unaligned state to get_blocks() to allow the filesystem to
> serialise there) but they've all died a horrible death of dodgy
> locking and hacky IO completion detection. not to mention
> requiring a different solution in every filesystem.
> 

I have also been thinking about other ways to fix this; these are only
initial thoughts and I am concerned about the scalability.

I was looking at ways to use buffer head state to mark any unaligned
AIO+DIO write to buffer_new buffers (do_direct_IO would set the state),
so that any further unaligned IO has to wait for the AIO+DIO on that
buffer to complete. But the buffer head passed from do_direct_IO to
get_blocks() is a dummy buffer, so we would have to get the real buffer
head from the device... This would only impact IOs that actually
conflict with pending AIO DIOs... it should work for the ext4 case...
would this be something usable for XFS? I started on the approach last
week but have not got very far.


Thanks,
Mingming

> This way may be a bit ugly, but it works, is isolated and seems to
> me to be the least-worst way of solving the problem. It could be
> made to scale better by making the tracking per-inode, but I don't
> think that growing the struct inode for this corner case is a good
> tradeoff, either...
> 
> Cheers,
> 
> Dave.



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] dio: track and serialise unaligned direct IO
  2010-08-03 17:34       ` Mingming Cao
@ 2010-08-03 22:56         ` Dave Chinner
  -1 siblings, 0 replies; 28+ messages in thread
From: Dave Chinner @ 2010-08-03 22:56 UTC (permalink / raw)
  To: Mingming Cao; +Cc: Matthew Wilcox, linux-fsdevel, xfs, sandeen

On Tue, Aug 03, 2010 at 10:34:25AM -0700, Mingming Cao wrote:
> On Fri, 2010-07-30 at 14:53 +1000, Dave Chinner wrote:
> > On Thu, Jul 29, 2010 at 08:53:24PM -0600, Matthew Wilcox wrote:
> > > On Fri, Jul 30, 2010 at 08:45:16AM +1000, Dave Chinner wrote:
> > > > If we get two unaligned direct IO's to the same filesystem block
> > > > that is marked as a new allocation (i.e. buffer_new), then both IOs will
> > > > zero the portion of the block they are not writing data to. As a
> > > > result, when the IOs complete there will be a portion of the block
> > > > that contains zeros from the last IO to complete rather than the
> > > > data that should be there.
....
> > I don't want any direct IO for XFS to go through the page cache -
> > unaligned or not. using the page cache for the unaligned blocks
> > would also be much worse for performance that this method because it
> > turns unaligned direct IO into 3 IOs - the unaligned head block, the
> > aligned body and the unaligned tail block. It would also be a
> > performance hit you take on every single dio, whereas this way the
> > hit is only taken when an overlap is detected.
> 
> Does this problem also possible for DIO and non AIO case? (Ext4 case
> this only happy with AIO+DIO+unaligned).  If not, could we simply force
> unaligned AIO+DIO to be synchronous? Still direct IO...

There is nothing specific to AIO about this bug. XFS (at least)
allows concurrent DIO writes to the same inode regardless of whether
they are dispatched via AIO or multiple separate threads and so the
race condition exists outside just the AIO context...
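
For example, an untested sketch like the one below (plain pthreads, no
AIO; the file name and sizes are made up, it assumes 4096 byte
filesystem blocks with 512 byte sectors, and xfstest 240 is the real
reproducer) can trigger exactly the same zeroing race:

#define _GNU_SOURCE
#include <fcntl.h>
#include <pthread.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/*
 * Untested sketch: two threads issue non-overlapping, sub-block sized
 * O_DIRECT writes into the same (newly allocated) filesystem block.
 * Each write triggers zeroing of the rest of the block, so without
 * serialisation the later zeroing can overwrite the other thread's data.
 */
struct job {
	int	fd;
	off_t	offset;
};

static void *writer(void *arg)
{
	struct job *job = arg;
	void *buf;

	if (posix_memalign(&buf, 512, 512))
		return NULL;
	memset(buf, 0x5a, 512);
	/* sub-block sized write at a sub-block offset within the fs block */
	pwrite(job->fd, buf, 512, job->offset);
	free(buf);
	return NULL;
}

int main(void)
{
	int fd = open("testfile", O_RDWR | O_CREAT | O_TRUNC | O_DIRECT, 0644);
	struct job a = { fd, 512 }, b = { fd, 2048 };
	pthread_t t1, t2;

	if (fd < 0)
		return 1;
	pthread_create(&t1, NULL, writer, &a);
	pthread_create(&t2, NULL, writer, &b);
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);
	close(fd);
	return 0;
}

The two data ranges never overlap, but both land in the same newly
allocated filesystem block, so each IO's sub-block zeroing covers the
range the other thread is writing.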

> > And besides, such decisions on whether to use buffered IO have to be
> > made high up in the filesystem when we are deciding how to lock the
> > inode for the dio - buffered IO requires exclusive locking, not the
> > shared locking we do for dio writes. That further reduces the
> > performance of unaligned direct IO even when there are no overlaps,
> > and it's a solution that every filesystem needs to implement
> > themselves in some way.
> > 
> > I've looked at a couple of different ways to fix this (e.g. passing
> > the unaligned state to get_blocks() to allow the filesystem to
> > serialise there) but they've all died a horrible death of dodgy
> > locking and hacky IO completion detection. not to mention
> > requiring a different solution in every filesystem.
> 
> I also have been thinking other ways to fix this, initial thoughts about
> this but concerned about the scalability. 
> 
> I was looking at ways to use buffer head state to indicate any unaligned
> AIO DIO write to buffer_new buffers(do_direct_IO set the state), then
> for any futhur unaligned IO need to wait for the AIO+DIO complete on
> that buffer.

That's effectively what my fix does, except it does not overload
bufferheads to do it.

> but the buffer head passed from do_direct_IO to
> get_blocks() is a dummy buffer, have to get the buffer head from
> device...

Which, IMO, seems like a dangerous layering violation - it's mixing
device associated bufferheads with filesystem IO (i.e. from
different address spaces) and, as such, the two are never guaranteed
to be coherent.  To make matters more complex, XFS can have at least two
block devices (data and real-time) that direct IO is targeted at, so
doing something that is block device specific seems much more
complex than just tracking the IO in a separate list...

> this would only impact to all IOs that are really conflict with
> pending AIO DIOs... it should work for ext4 case...would this something
> usable for XFS? I have got the approach started last week but not very
> far.

It might, but I don't think it's a viable approach.

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] dio: track and serialise unaligned direct IO
  2010-08-03 22:56         ` Dave Chinner
@ 2010-08-03 23:41           ` Mingming Cao
  -1 siblings, 0 replies; 28+ messages in thread
From: Mingming Cao @ 2010-08-03 23:41 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Matthew Wilcox, linux-fsdevel, xfs, sandeen

On Wed, 2010-08-04 at 08:56 +1000, Dave Chinner wrote:
> On Tue, Aug 03, 2010 at 10:34:25AM -0700, Mingming Cao wrote:
> > On Fri, 2010-07-30 at 14:53 +1000, Dave Chinner wrote:
> > > On Thu, Jul 29, 2010 at 08:53:24PM -0600, Matthew Wilcox wrote:
> > > > On Fri, Jul 30, 2010 at 08:45:16AM +1000, Dave Chinner wrote:
> > > > > If we get two unaligned direct IO's to the same filesystem block
> > > > > that is marked as a new allocation (i.e. buffer_new), then both IOs will
> > > > > zero the portion of the block they are not writing data to. As a
> > > > > result, when the IOs complete there will be a portion of the block
> > > > > that contains zeros from the last IO to complete rather than the
> > > > > data that should be there.
> ....
> > > I don't want any direct IO for XFS to go through the page cache -
> > > unaligned or not. using the page cache for the unaligned blocks
> > > would also be much worse for performance that this method because it
> > > turns unaligned direct IO into 3 IOs - the unaligned head block, the
> > > aligned body and the unaligned tail block. It would also be a
> > > performance hit you take on every single dio, whereas this way the
> > > hit is only taken when an overlap is detected.
> > 
> > Does this problem also possible for DIO and non AIO case? (Ext4 case
> > this only happy with AIO+DIO+unaligned).  If not, could we simply force
> > unaligned AIO+DIO to be synchronous? Still direct IO...
> 
> There is nothing specific to AIO about this bug. XFS (at least)
> allows concurrent DIO writes to the same inode regardless of whether
> they are dispatched via AIO or multiple separate threads and so the
> race condition exists outside just the AIO context...
> 

Okay... yeah, ext4 prevents direct IO writes to the same inode from
multiple threads, so this is not an issue for the non-AIO case.

How does XFS serialize direct IO (aligned) to the same file offset (or
overlapping offsets) from multiple threads?

Mingming


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] dio: track and serialise unaligned direct IO
  2010-07-29 22:45 ` Dave Chinner
@ 2010-08-04  0:11   ` Mingming Cao
  -1 siblings, 0 replies; 28+ messages in thread
From: Mingming Cao @ 2010-08-04  0:11 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-fsdevel, xfs, sandeen

On Fri, 2010-07-30 at 08:45 +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> If we get two unaligned direct IO's to the same filesystem block
> that is marked as a new allocation (i.e. buffer_new), then both IOs will
> zero the portion of the block they are not writing data to. As a
> result, when the IOs complete there will be a portion of the block
> that contains zeros from the last IO to complete rather than the
> data that should be there.
> 
> This is easily manifested by qemu using aio+dio with an unaligned
> guest filesystem - every IO is unaligned and fileystem corruption is
> encountered in the guest filesystem. xfstest 240 (from Eric Sandeen)
> is also a simple reproducer.
> 
> To avoid this problem, track unaligned IO that triggers sub-block zeroing and
> check new incoming unaligned IO that require sub-block zeroing against that
> list. If we get an overlap where the start and end of unaligned IOs hit the
> same filesystem block, then we need to block the incoming IOs until the IO that
> is zeroing the block completes. The blocked IO can then continue without
> needing to do any zeroing and hence won't overwrite valid data with zeros.
> 

This seems to address the case where both IOs are unaligned direct IO.
If the first IO is an aligned direct IO, then it is not tracked?

I am also concerned about the aligned direct IO case...

1) The first thread does an aligned aio+dio write to a hole, so no
zero-out is submitted from the kernel. But the extent remains
uninitialized until all IO completes and converts it from an
uninitialized extent to an initialized one.
2) The second thread does an unaligned aio+dio write to the same hole.
Since the buffer is still new (not converted yet), the new incoming
thread zeroes out part of the data that the first thread has written.

It seems we would need to track any direct IO written to a
hole/unwritten extent/buffer_new block, not only the unaligned DIO...

regards,
Mingming
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---
>  fs/direct-io.c |  152 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
>  1 files changed, 146 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/direct-io.c b/fs/direct-io.c
> index a10cb91..611524e 100644
> --- a/fs/direct-io.c
> +++ b/fs/direct-io.c
> @@ -71,6 +71,9 @@ struct dio {
>  	unsigned start_zero_done;	/* flag: sub-blocksize zeroing has
>  					   been performed at the start of a
>  					   write */
> +#define LAST_SECTOR ((sector_t)-1LL)
> +	sector_t zero_block_front;	/* fs block we are zeroing at front */
> +	sector_t zero_block_rear;	/* fs block we are zeroing at rear */
>  	int pages_in_io;		/* approximate total IO pages */
>  	size_t	size;			/* total request size (doesn't change)*/
>  	sector_t block_in_file;		/* Current offset into the underlying
> @@ -135,6 +138,101 @@ struct dio {
>  	struct page *pages[DIO_PAGES];	/* page buffer */
>  };
> 
> +
> +/*
> + * record fs blocks we are doing zeroing on in a zero block list.
> + * unaligned IO is not very performant and so is relatively uncommon,
> + * so a global list should be sufficent to track them.
> + */
> +struct dio_zero_block {
> +	struct list_head dio_list;	/* list of io in progress */
> +	sector_t	zero_block;	/* block being zeroed */
> +	struct dio	*dio;		/* owner dio */
> +	wait_queue_head_t wq;		/* New IO block here */
> +	atomic_t	ref;		/* reference count */
> +};
> +
> +DEFINE_SPINLOCK(dio_zero_block_lock);
> +LIST_HEAD(dio_zero_block_list);
> +
> +/*
> + * Add a filesystem block to the list of blocks we are tracking.
> + */
> +static void
> +dio_start_zero_block(struct dio *dio, sector_t zero_block)
> +{
> +	struct dio_zero_block *zb;
> +
> +	zb = kmalloc(sizeof(*zb), GFP_NOIO);
> +	if (!zb)
> +		return;
> +	INIT_LIST_HEAD(&zb->dio_list);
> +	init_waitqueue_head(&zb->wq);
> +	zb->zero_block = zero_block;
> +	zb->dio = dio;
> +	atomic_set(&zb->ref, 1);
> +
> +	spin_lock(&dio_zero_block_lock);
> +	list_add(&zb->dio_list, &dio_zero_block_list);
> +	spin_unlock(&dio_zero_block_lock);
> +}
> +
> +static void
> +dio_drop_zero_block(struct dio_zero_block *zb)
> +{
> +	if (atomic_dec_and_test(&zb->ref))
> +		kfree(zb);
> +}
> +
> +/*
> + * Check whether a filesystem block is currently being zeroed, and if it is
> + * wait for it to complete before returning. If we waited for a block being
> + * zeroed, return 1 to indicate that the block is already initialised,
> + * otherwise return 0 to indicate that it needs zeroing.
> + */
> +static int
> +dio_wait_zero_block(struct dio *dio, sector_t zero_block)
> +{
> +	struct dio_zero_block *zb;
> +
> +	spin_lock(&dio_zero_block_lock);
> +	list_for_each_entry(zb, &dio_zero_block_list, dio_list) {
> +		if (zb->dio->inode != dio->inode)
> +			continue;
> +		if (zb->zero_block != zero_block)
> +			continue;
> +		atomic_inc(&zb->ref);
> +		spin_unlock(&dio_zero_block_lock);
> +		wait_event(zb->wq, (list_empty(&zb->dio_list)));
> +		dio_drop_zero_block(zb);
> +		return 1;
> +	}
> +	spin_unlock(&dio_zero_block_lock);
> +	return 0;
> +}
> +
> +/*
> + * Complete a block zeroing and wake up anyone waiting for it.
> + */
> +static void dio_end_zero_block(struct dio *dio, sector_t zero_block)
> +{
> +	struct dio_zero_block *zb;
> +
> +	spin_lock(&dio_zero_block_lock);
> +	list_for_each_entry(zb, &dio_zero_block_list, dio_list) {
> +		if (zb->dio->inode != dio->inode)
> +			continue;
> +		if (zb->zero_block != zero_block)
> +			continue;
> +		list_del_init(&zb->dio_list);
> +		spin_unlock(&dio_zero_block_lock);
> +		wake_up(&zb->wq);
> +		dio_drop_zero_block(zb);
> +		return;
> +	}
> +	spin_unlock(&dio_zero_block_lock);
> +}
> +
>  /*
>   * How many pages are in the queue?
>   */
> @@ -253,6 +351,11 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret, bool is_async)
>  		aio_complete(dio->iocb, ret, 0);
>  	}
> 
> +	if (dio->zero_block_front != LAST_SECTOR)
> +		dio_end_zero_block(dio, dio->zero_block_front);
> +	if (dio->zero_block_rear != LAST_SECTOR)
> +		dio_end_zero_block(dio, dio->zero_block_rear);
> +
>  	if (dio->flags & DIO_LOCKING)
>  		/* lockdep: non-owner release */
>  		up_read_non_owner(&dio->inode->i_alloc_sem);
> @@ -777,6 +880,12 @@ static void clean_blockdev_aliases(struct dio *dio)
>   * block with zeros. This happens only if user-buffer, fileoffset or
>   * io length is not filesystem block-size multiple.
>   *
> + * We need to track the blocks we are zeroing. If we have concurrent IOs that hit
> + * the same start or end block, we do not want all the IOs to zero the portion
> + * they are not writing data to as that will overwrite data from the other IOs.
> + * Hence we need to block until the first unaligned IO completes before we can
> + * continue (without executing any zeroing).
> + *
>   * `end' is zero if we're doing the start of the IO, 1 at the end of the
>   * IO.
>   */
> @@ -784,8 +893,8 @@ static void dio_zero_block(struct dio *dio, int end)
>  {
>  	unsigned dio_blocks_per_fs_block;
>  	unsigned this_chunk_blocks;	/* In dio_blocks */
> -	unsigned this_chunk_bytes;
>  	struct page *page;
> +	sector_t fsblock;
> 
>  	dio->start_zero_done = 1;
>  	if (!dio->blkfactor || !buffer_new(&dio->map_bh))
> @@ -797,17 +906,41 @@ static void dio_zero_block(struct dio *dio, int end)
>  	if (!this_chunk_blocks)
>  		return;
> 
> +	if (end)
> +		this_chunk_blocks = dio_blocks_per_fs_block - this_chunk_blocks;
> +
>  	/*
>  	 * We need to zero out part of an fs block.  It is either at the
> -	 * beginning or the end of the fs block.
> +	 * beginning or the end of the fs block, but first we need to check if
> +	 * there is already a zeroing being run on this block.
> +	 *
> +	 * If we are doing a sub-block IO (i.e. zeroing both front and rear of
> +	 * the same block) we don't need to wait or set a guard for the rear of
> +	 * the block as we already have one set.
>  	 */
> -	if (end) 
> -		this_chunk_blocks = dio_blocks_per_fs_block - this_chunk_blocks;
> +	fsblock = dio->block_in_file >> dio->blkfactor;
> +	if (!end || dio->zero_block_front != fsblock) {
> 
> -	this_chunk_bytes = this_chunk_blocks << dio->blkbits;
> +		/* wait for any zeroing already in progress */
> +		if (dio_wait_zero_block(dio, fsblock)) {
> +			/* skip the range we would have zeroed. */
> +			dio->next_block_for_io += this_chunk_blocks;
> +			return;
> +		}
> +
> +		/*
> +		 * we are going to zero stuff now, so set a guard to catch
> +		 * others that might want to zero the same block.
> +		 */
> +		dio_start_zero_block(dio, fsblock);
> +		if (end)
> +			dio->zero_block_rear = fsblock;
> +		else
> +			dio->zero_block_front = fsblock;
> +	}
> 
>  	page = ZERO_PAGE(0);
> -	if (submit_page_section(dio, page, 0, this_chunk_bytes, 
> +	if (submit_page_section(dio, page, 0, this_chunk_blocks << dio->blkbits,
>  				dio->next_block_for_io))
>  		return;
> 
> @@ -1191,6 +1324,13 @@ __blockdev_direct_IO_newtrunc(int rw, struct kiocb *iocb, struct inode *inode,
>  	 */
>  	memset(dio, 0, offsetof(struct dio, pages));
> 
> +	/*
> +	 * zero_blocks need to be initialised to the largest value to avoid
> +	 * matching the zero block accidentally.
> +	 */
> +	dio->zero_block_front = LAST_SECTOR;
> +	dio->zero_block_rear = LAST_SECTOR;
> +
>  	dio->flags = flags;
>  	if (dio->flags & DIO_LOCKING) {
>  		/* watch out for a 0 len io from a tricksy fs */
> -- 
> 1.7.1
> 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] dio: track and serialise unaligned direct IO
  2010-08-04  0:11   ` Mingming Cao
@ 2010-08-04  3:37     ` Dave Chinner
  -1 siblings, 0 replies; 28+ messages in thread
From: Dave Chinner @ 2010-08-04  3:37 UTC (permalink / raw)
  To: Mingming Cao; +Cc: linux-fsdevel, xfs, sandeen

On Tue, Aug 03, 2010 at 05:11:18PM -0700, Mingming Cao wrote:
> On Fri, 2010-07-30 at 08:45 +1000, Dave Chinner wrote:
> > From: Dave Chinner <dchinner@redhat.com>
> > 
> > If we get two unaligned direct IO's to the same filesystem block
> > that is marked as a new allocation (i.e. buffer_new), then both IOs will
> > zero the portion of the block they are not writing data to. As a
> > result, when the IOs complete there will be a portion of the block
> > that contains zeros from the last IO to complete rather than the
> > data that should be there.
> > 
> > This is easily manifested by qemu using aio+dio with an unaligned
> > guest filesystem - every IO is unaligned and fileystem corruption is
> > encountered in the guest filesystem. xfstest 240 (from Eric Sandeen)
> > is also a simple reproducer.
> > 
> > To avoid this problem, track unaligned IO that triggers sub-block zeroing and
> > check new incoming unaligned IO that require sub-block zeroing against that
> > list. If we get an overlap where the start and end of unaligned IOs hit the
> > same filesystem block, then we need to block the incoming IOs until the IO that
> > is zeroing the block completes. The blocked IO can then continue without
> > needing to do any zeroing and hence won't overwrite valid data with zeros.
> > 
> 
> This seems to address both two IOs are unaligned direct IO. If the first
> IO is aligned direct IO, then it is not tracked?
> 
> I am also concerned about the aligned direct IO case...
> 
> 1) first thread aio+dio+aligned write to a hole, there is no zero-out
> submitted from kernel. But the hole remains initialized before all IO
> complete and convert it from uninitialized extent to initialized.
> 2) second thread aio+dio+unalign write to the same hole, this time it is
> unaligned. since buffer is still new (not converted yet), the new
> incoming thread zero out port of data that first thread has written to

That is clearly and unmistakably an application bug - it should not
be issuing concurrent, overlapping IO to the same block(s)
regardless of whether they are unaligned, aligned or a mixture of
both. By using direct IO, the application has assumed responsibility
for preventing data corruption due to overlapping IOs - they are
inherently racy and nothing in the dio code prevents that from
occurring.

The bug I'm fixing is for *non-overlapping* concurrent unaligned IOs
where the kernel direct IO code causes the data corruption, not the
application. The application is not doing something stupid, and as
such needs to be fixed.
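
To put some purely illustrative numbers on the non-overlapping case:
with 4096 byte filesystem blocks and 512 byte sectors, IO A writes
bytes 512-2047 of a newly allocated block while IO B writes bytes
2048-3583 of the same block. The data ranges don't overlap at all, but
A zeroes bytes 0-511 and 2048-4095 while B zeroes bytes 0-2047 and
3584-4095, so whichever sub-block zeroing lands after the other IO's
data overwrites that data with zeros. That is the corruption this
patch prevents.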

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] dio: track and serialise unaligned direct IO
  2010-08-04  3:37     ` Dave Chinner
@ 2010-08-04  3:58       ` Dave Chinner
  -1 siblings, 0 replies; 28+ messages in thread
From: Dave Chinner @ 2010-08-04  3:58 UTC (permalink / raw)
  To: Mingming Cao; +Cc: linux-fsdevel, xfs, sandeen

On Wed, Aug 04, 2010 at 01:37:18PM +1000, Dave Chinner wrote:
> On Tue, Aug 03, 2010 at 05:11:18PM -0700, Mingming Cao wrote:
> > On Fri, 2010-07-30 at 08:45 +1000, Dave Chinner wrote:
> > > From: Dave Chinner <dchinner@redhat.com>
> > > 
> > > If we get two unaligned direct IO's to the same filesystem block
> > > that is marked as a new allocation (i.e. buffer_new), then both IOs will
> > > zero the portion of the block they are not writing data to. As a
> > > result, when the IOs complete there will be a portion of the block
> > > that contains zeros from the last IO to complete rather than the
> > > data that should be there.
> > > 
> > > This is easily manifested by qemu using aio+dio with an unaligned
> > > guest filesystem - every IO is unaligned and fileystem corruption is
> > > encountered in the guest filesystem. xfstest 240 (from Eric Sandeen)
> > > is also a simple reproducer.
> > > 
> > > To avoid this problem, track unaligned IO that triggers sub-block zeroing and
> > > check new incoming unaligned IO that require sub-block zeroing against that
> > > list. If we get an overlap where the start and end of unaligned IOs hit the
> > > same filesystem block, then we need to block the incoming IOs until the IO that
> > > is zeroing the block completes. The blocked IO can then continue without
> > > needing to do any zeroing and hence won't overwrite valid data with zeros.
> > > 
> > 
> > This seems to address both two IOs are unaligned direct IO. If the first
> > IO is aligned direct IO, then it is not tracked?
> > 
> > I am also concerned about the aligned direct IO case...
> > 
> > 1) first thread aio+dio+aligned write to a hole, there is no zero-out
> > submitted from kernel. But the hole remains initialized before all IO
> > complete and convert it from uninitialized extent to initialized.
> > 2) second thread aio+dio+unalign write to the same hole, this time it is
> > unaligned. since buffer is still new (not converted yet), the new
> > incoming thread zero out port of data that first thread has written to
> 
> That is clearly and unmistakably an application bug - it should not
> be issuing concurrent, overlapping IO to the same block(s)
> regardless of whether they are unaligned, aligned or a mixture of
> both. By using direct IO, the application has assumed responsibility
> for preventing data corruption due to overlapping IOs - they are
> inherently racy and nothing in the dio code prevents that from
> occurring.
> 
> The bug I'm fixing is for *non-overlapping* concurrent unaligned IOs
> where the kernel direct IO code causes the data corruption, not the
> application. The application is not doing something stupid, and as
> such needs to be fixed.
   ^^^^^^
   the kernel bug needs to be fixed.

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] dio: track and serialise unaligned direct IO
  2010-08-03 23:41           ` Mingming Cao
@ 2010-08-04  4:22             ` Dave Chinner
  -1 siblings, 0 replies; 28+ messages in thread
From: Dave Chinner @ 2010-08-04  4:22 UTC (permalink / raw)
  To: Mingming Cao; +Cc: Matthew Wilcox, linux-fsdevel, xfs, sandeen

On Tue, Aug 03, 2010 at 04:41:14PM -0700, Mingming Cao wrote:
> On Wed, 2010-08-04 at 08:56 +1000, Dave Chinner wrote:
> > On Tue, Aug 03, 2010 at 10:34:25AM -0700, Mingming Cao wrote:
> > > On Fri, 2010-07-30 at 14:53 +1000, Dave Chinner wrote:
> > > > On Thu, Jul 29, 2010 at 08:53:24PM -0600, Matthew Wilcox wrote:
> > > > > On Fri, Jul 30, 2010 at 08:45:16AM +1000, Dave Chinner wrote:
> > > > > > If we get two unaligned direct IO's to the same filesystem block
> > > > > > that is marked as a new allocation (i.e. buffer_new), then both IOs will
> > > > > > zero the portion of the block they are not writing data to. As a
> > > > > > result, when the IOs complete there will be a portion of the block
> > > > > > that contains zeros from the last IO to complete rather than the
> > > > > > data that should be there.
> > ....
> > > > I don't want any direct IO for XFS to go through the page cache -
> > > > unaligned or not. using the page cache for the unaligned blocks
> > > > would also be much worse for performance that this method because it
> > > > turns unaligned direct IO into 3 IOs - the unaligned head block, the
> > > > aligned body and the unaligned tail block. It would also be a
> > > > performance hit you take on every single dio, whereas this way the
> > > > hit is only taken when an overlap is detected.
> > > 
> > > Does this problem also possible for DIO and non AIO case? (Ext4 case
> > > this only happy with AIO+DIO+unaligned).  If not, could we simply force
> > > unaligned AIO+DIO to be synchronous? Still direct IO...
> > 
> > There is nothing specific to AIO about this bug. XFS (at least)
> > allows concurrent DIO writes to the same inode regardless of whether
> > they are dispatched via AIO or multiple separate threads and so the
> > race condition exists outside just the AIO context...
> > 
> 
> Okay..yeah ext4 prevent direct IO write to the same inode from multiple
> threads, so this is not a issue for non-aio case.
> 
> How does XFS serialize direct IO (aligned) to the same file offset(or
> overlap) from multiple threads?

It doesn't. The 1996 usenix paper describes it well:

http://oss.sgi.com/projects/xfs/papers/xfs_usenix/index.html

See section 6.2 "Performing File I/O", specifically the sections on
"Using Direct I/O" and "Using Multiple Processes". Quote:

"When using direct I/O, multiple readers and writers can all access
the file simultaneously. Currently, when using direct I/O and
multiple writers, we place the burden of serializing writes to the
same region of the file on the application."

This is still true 15 years later....
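
As an untested sketch of what "the burden is on the application" can
look like (not something XFS mandates - and note that POSIX record
locks only serialise between processes, so threads within a single
process need their own range locking), a writer can take a byte-range
lock over the region before issuing the direct IO:

#define _GNU_SOURCE
#include <fcntl.h>
#include <sys/types.h>
#include <unistd.h>

/*
 * Untested sketch: serialise overlapping direct IO writes in the
 * application by holding a POSIX byte-range write lock over the region
 * for the duration of the write.
 */
static ssize_t locked_dio_pwrite(int fd, const void *buf, size_t len, off_t off)
{
	struct flock fl = {
		.l_type		= F_WRLCK,
		.l_whence	= SEEK_SET,
		.l_start	= off,
		.l_len		= (off_t)len,
	};
	ssize_t ret;

	if (fcntl(fd, F_SETLKW, &fl) < 0)	/* block until we own the range */
		return -1;

	ret = pwrite(fd, buf, len, off);

	fl.l_type = F_UNLCK;
	fcntl(fd, F_SETLK, &fl);		/* drop the range lock */
	return ret;
}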


Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] dio: track and serialise unaligned direct IO
  2010-08-04  3:37     ` Dave Chinner
@ 2010-08-04 14:55       ` Mingming Cao
  -1 siblings, 0 replies; 28+ messages in thread
From: Mingming Cao @ 2010-08-04 14:55 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-fsdevel, xfs, sandeen

On Wed, 2010-08-04 at 13:37 +1000, Dave Chinner wrote:
> On Tue, Aug 03, 2010 at 05:11:18PM -0700, Mingming Cao wrote:
> > On Fri, 2010-07-30 at 08:45 +1000, Dave Chinner wrote:
> > > From: Dave Chinner <dchinner@redhat.com>
> > > 
> > > If we get two unaligned direct IO's to the same filesystem block
> > > that is marked as a new allocation (i.e. buffer_new), then both IOs will
> > > zero the portion of the block they are not writing data to. As a
> > > result, when the IOs complete there will be a portion of the block
> > > that contains zeros from the last IO to complete rather than the
> > > data that should be there.
> > > 
> > > This is easily manifested by qemu using aio+dio with an unaligned
> > > guest filesystem - every IO is unaligned and filesystem corruption is
> > > encountered in the guest filesystem. xfstest 240 (from Eric Sandeen)
> > > is also a simple reproducer.
> > > 
> > > To avoid this problem, track unaligned IO that triggers sub-block zeroing and
> > > check new incoming unaligned IO that require sub-block zeroing against that
> > > list. If we get an overlap where the start and end of unaligned IOs hit the
> > > same filesystem block, then we need to block the incoming IOs until the IO that
> > > is zeroing the block completes. The blocked IO can then continue without
> > > needing to do any zeroing and hence won't overwrite valid data with zeros.
> > > 
> > 
> > This seems to address the case where both IOs are unaligned direct IO.
> > If the first IO is aligned direct IO, then it is not tracked?
> > 
> > I am also concerned about the aligned direct IO case...
> > 
> > 1) The first thread does an aio+dio+aligned write to a hole; there is no
> > zero-out submitted from the kernel. But the extent remains uninitialized
> > until all IO completes and converts it from an uninitialized extent to an
> > initialized one.
> > 2) A second thread does an aio+dio write to the same hole, this time
> > unaligned. Since the buffer is still new (not converted yet), the new
> > incoming thread zeroes out part of the data that the first thread has
> > written.
> 
> That is clearly and unmistakably an application bug - it should not
> be issuing concurrent, overlapping IO to the same block(s)
> regardless of whether they are unaligned, aligned or a mixture of
> both. By using direct IO, the application has assumed responsibility
> for preventing data corruption due to overlapping IOs - they are
> inherently racy and nothing in the dio code prevents that from
> occurring.
> 

While there are multiple applications running on the same filesystem,
they could possibly touch the same files concurrently. How could an
application know that other apps are changing the same file at the
same time?

> The bug I'm fixing is for *non-overlapping* concurrent unaligned IOs
> where the kernel direct IO code causes the data corruption, not the
> application. The application is not doing anything stupid, and as such
> this is a kernel bug that needs to be fixed.
> 

For the case I am referring to here, it's the kernel direct IO code that
zeroes out the block for 2).  The application doing 2) did not do the
zero-out itself, but it will result in losing the data the application
wrote in 1). It's the same as what you are trying to fix.


Mingming



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] dio: track and serialise unaligned direct IO
  2010-08-04 14:55       ` Mingming Cao
@ 2010-08-05  1:02         ` Dave Chinner
  -1 siblings, 0 replies; 28+ messages in thread
From: Dave Chinner @ 2010-08-05  1:02 UTC (permalink / raw)
  To: Mingming Cao; +Cc: linux-fsdevel, xfs, sandeen

On Wed, Aug 04, 2010 at 07:55:52AM -0700, Mingming Cao wrote:
> On Wed, 2010-08-04 at 13:37 +1000, Dave Chinner wrote:
> > On Tue, Aug 03, 2010 at 05:11:18PM -0700, Mingming Cao wrote:
> > > This seems to address the case where both IOs are unaligned direct IO.
> > > If the first IO is aligned direct IO, then it is not tracked?
> > > 
> > > I am also concerned about the aligned direct IO case...
> > > 
> > > 1) The first thread does an aio+dio+aligned write to a hole; there is no
> > > zero-out submitted from the kernel. But the extent remains uninitialized
> > > until all IO completes and converts it from an uninitialized extent to an
> > > initialized one.
> > > 2) A second thread does an aio+dio write to the same hole, this time
> > > unaligned. Since the buffer is still new (not converted yet), the new
> > > incoming thread zeroes out part of the data that the first thread has
> > > written.
> > 
> > That is clearly and unmistakably an application bug - it should not
> > be issuing concurrent, overlapping IO to the same block(s)
> > regardless of whether they are unaligned, aligned or a mixture of
> > both. By using direct IO, the application has assumed responsibility
> > for preventing data corruption due to overlapping IOs - they are
> > inherently racy and nothing in the dio code prevents that from
> > occurring.
> 
> While there are multiple applications running on the same filesystem,
> they could possibly touch the same files concurrently. How could an
> application know that other apps are changing the same file at the
> same time?

That's for the applications to figure out.  A typical way of
achieving serialisation at the application level is for all the
applications doing dio to the same files to use byte-range
locking...
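
For illustration only, a minimal sketch of what that application-level
serialisation could look like using POSIX byte-range locks - the helper
name locked_dio_pwrite() and all of the details below are assumptions
made for the example, not an existing interface:

/*
 * Minimal sketch (assumed helper, not part of this patch): serialise
 * direct IO writes to shared file ranges across processes with POSIX
 * byte-range locks. Error handling is trimmed for brevity.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

static ssize_t locked_dio_pwrite(int fd, const void *buf, size_t len, off_t off)
{
	struct flock fl = {
		.l_type   = F_WRLCK,		/* exclusive lock for the write */
		.l_whence = SEEK_SET,
		.l_start  = off,
		.l_len    = len,		/* lock only the range being written */
	};
	ssize_t ret;

	if (fcntl(fd, F_SETLKW, &fl) == -1)	/* block until the range is ours */
		return -1;

	ret = pwrite(fd, buf, len, off);	/* fd was opened with O_DIRECT */

	fl.l_type = F_UNLCK;			/* let other writers at the range */
	fcntl(fd, F_SETLK, &fl);
	return ret;
}

A caller would open the file with O_DIRECT, allocate buffers with
posix_memalign() to satisfy the usual direct IO alignment constraints,
and route every write through the helper. Note that POSIX record locks
serialise between processes; threads within one process would still
need their own locking.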

Remember, DIO abdicates all responsibility for IO coherency to the
applications. If userspace is issuing concurrent direct IOs to the
same block _no matter how_, then that is an application problem, not
a kernel problem.

If you want the kernel to avoid overlapping DIO, then we have to
track _every single DIO_. We clearly do not do that, have never done
that, and no OS that implements direct IO has ever done that. It is,
quite simply, not the responsibility of the kernel to prevent
userspace from doing stupid stuff with DIO and it never has been.
If you want POSIX IO coherency semantics, then *don't use DIO*.

> > The bug I'm fixing is for *non-overlapping* concurrent unaligned IOs
> > where the kernel direct IO code causes the data corruption, not the
> > application. The application is not doing anything stupid, and as such
> > this is a kernel bug that needs to be fixed.
> 
> For the case I am referring to here, it's the kernel direct IO code that
> zeroes out the block for 2).  The application doing 2) did not do the
> zero-out itself, but it will result in losing the data the application
> wrote in 1). It's the same as what you are trying to fix.

No it isn't. Overlapping vs non-overlapping DIO are two very different
cases.
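
To make the distinction concrete, a rough reproducer sketch of the
*non-overlapping* unaligned case this patch targets - in the spirit of
xfstest 240 but not the actual test; the file path, the 4k filesystem
block size and the 512 byte sector size are assumptions:

/*
 * Two sector-aligned but sub-filesystem-block direct IO writes land in
 * the two halves of the same newly allocated 4k block. Their byte ranges
 * do not overlap, yet without this patch each write zeroes the half of
 * the block it does not own, so the slower IO clobbers the faster one.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <pthread.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define IO_SIZE	2048			/* sector aligned, but sub-block */

struct io { int fd; off_t off; int fill; };

static void *dio_writer(void *arg)
{
	struct io *io = arg;
	void *buf;

	posix_memalign(&buf, 512, IO_SIZE);	/* O_DIRECT needs aligned memory */
	memset(buf, io->fill, IO_SIZE);
	pwrite(io->fd, buf, IO_SIZE, io->off);	/* both land in fs block 0 */
	free(buf);
	return NULL;
}

int main(void)
{
	/* "/mnt/test/dio-race" is a placeholder path on the fs under test */
	int fd = open("/mnt/test/dio-race",
		      O_CREAT | O_TRUNC | O_WRONLY | O_DIRECT, 0644);
	struct io a = { fd, 0,       'A' };	/* bytes 0-2047 of the new block */
	struct io b = { fd, IO_SIZE, 'B' };	/* bytes 2048-4095: no overlap */
	pthread_t ta, tb;

	pthread_create(&ta, NULL, dio_writer, &a);
	pthread_create(&tb, NULL, dio_writer, &b);
	pthread_join(ta, NULL);
	pthread_join(tb, NULL);
	close(fd);
	return 0;	/* on an unpatched kernel one half may read back as zeroes */
}

Built with -pthread and run against a freshly created file, it may take
repeated runs to hit the race. The overlapping case described above, by
contrast, is undefined for the application no matter what the kernel
does about sub-block zeroing.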

Cheers,

Dave.

-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 28+ messages in thread

end of thread, other threads:[~2010-08-05  1:03 UTC | newest]

Thread overview: 28+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-07-29 22:45 [PATCH] dio: track and serialise unaligned direct IO Dave Chinner
2010-07-29 22:45 ` Dave Chinner
2010-07-30  2:53 ` Matthew Wilcox
2010-07-30  2:53   ` Matthew Wilcox
2010-07-30  4:53   ` Dave Chinner
2010-07-30  4:53     ` Dave Chinner
2010-08-03 17:34     ` Mingming Cao
2010-08-03 17:34       ` Mingming Cao
2010-08-03 22:56       ` Dave Chinner
2010-08-03 22:56         ` Dave Chinner
2010-08-03 23:41         ` Mingming Cao
2010-08-03 23:41           ` Mingming Cao
2010-08-04  4:22           ` Dave Chinner
2010-08-04  4:22             ` Dave Chinner
2010-07-30 17:43 ` Badari Pulavarty
2010-07-30 17:43   ` Badari Pulavarty
2010-07-30 23:13   ` Dave Chinner
2010-07-30 23:13     ` Dave Chinner
2010-08-04  0:11 ` Mingming Cao
2010-08-04  0:11   ` Mingming Cao
2010-08-04  3:37   ` Dave Chinner
2010-08-04  3:37     ` Dave Chinner
2010-08-04  3:58     ` Dave Chinner
2010-08-04  3:58       ` Dave Chinner
2010-08-04 14:55     ` Mingming Cao
2010-08-04 14:55       ` Mingming Cao
2010-08-05  1:02       ` Dave Chinner
2010-08-05  1:02         ` Dave Chinner

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.