* [Ocfs2-devel] ocfs2: serialize unaligned aio
@ 2011-06-22 21:23 Mark Fasheh
  2011-06-23  3:44 ` Tao Ma
                   ` (2 more replies)
  0 siblings, 3 replies; 7+ messages in thread
From: Mark Fasheh @ 2011-06-22 21:23 UTC (permalink / raw)
  To: ocfs2-devel

Fix a corruption that can happen when we have (two or more) outstanding
aios to an overlapping unaligned region.  Ext4 (commit
e9e3bcecf44c04b9e6b505fd8e2eb9cea58fb94d) and xfs recently had to fix
similar issues.

In our case, what happens is that we can have an outstanding aio on a
region, and if a write comes in with some bytes overlapping the original
aio, we may decide to read that region into a page before continuing
(typically because of buffered-io fallback).  Since we have no ordering
guarantees with the aio, we can read stale or bad data into the page and
then write it back out.
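
To make the race concrete, here is a minimal userspace reproducer sketch.
It is hypothetical and not part of the patch: the path, sizes, and offsets
are assumptions, chosen so that two concurrently submitted aios are sector
aligned but block unaligned and overlap inside the same 4K filesystem
block.  Build with "gcc repro.c -o repro -laio".

#define _GNU_SOURCE
#include <fcntl.h>
#include <libaio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	io_context_t ctx = 0;
	struct iocb cb[2], *cbs[2] = { &cb[0], &cb[1] };
	struct io_event ev[2];
	void *buf0, *buf1;
	int fd;

	/* O_DIRECT buffers and offsets must be sector (512) aligned */
	fd = open("/mnt/ocfs2/testfile", O_WRONLY | O_CREAT | O_DIRECT, 0644);
	if (fd < 0 || io_setup(2, &ctx) < 0)
		exit(1);
	if (posix_memalign(&buf0, 512, 1024) || posix_memalign(&buf1, 512, 512))
		exit(1);
	memset(buf0, 'A', 1024);
	memset(buf1, 'B', 512);

	/*
	 * Neither write ends on a 4K block boundary, and the two ranges
	 * overlap in [512, 1024).  With no ordering guarantee between
	 * them, a fallback to buffered i/o for one write can read the
	 * shared block while the other aio is still in flight.
	 */
	io_prep_pwrite(&cb[0], fd, buf0, 1024, 0);
	io_prep_pwrite(&cb[1], fd, buf1, 512, 512);

	if (io_submit(ctx, 2, cbs) != 2)	/* both submitted, no waiting */
		exit(1);
	io_getevents(ctx, 2, 2, ev, NULL);	/* reap both completions */
	io_destroy(ctx);
	close(fd);
	return 0;
}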

If the i/o is page and block aligned, then we avoid this issue as there
won't be any need to read data from disk.

I took the same approach as Eric in the ext4 patch and introduced some
serialization of unaligned async direct i/o.  I don't expect this to have
an effect on the most common cases of AIO.  Unaligned aio will be slower,
but that's far more acceptable than data corruption.

Signed-off-by: Mark Fasheh <mfasheh@suse.com>

---
 fs/ocfs2/aops.c  |   10 ++++++++++
 fs/ocfs2/aops.h  |   14 ++++++++++++++
 fs/ocfs2/file.c  |   38 ++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/inode.h |    3 +++
 fs/ocfs2/super.c |   10 ++++++++--
 5 files changed, 73 insertions(+), 2 deletions(-)

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index ac97bca..4c1ec8f 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -564,6 +564,7 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
 {
 	struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
 	int level;
+	wait_queue_head_t *wq = ocfs2_ioend_wq(inode);
 
 	/* this io's submitter should not have unlocked this before we could */
 	BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
@@ -573,6 +574,15 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
 		ocfs2_iocb_clear_sem_locked(iocb);
 	}
 
+	if (ocfs2_iocb_is_unaligned_aio(iocb)) {
+		ocfs2_iocb_clear_unaligned_aio(iocb);
+
+		if (atomic_dec_and_test(&OCFS2_I(inode)->ip_unaligned_aio) &&
+		    waitqueue_active(wq)) {
+			wake_up_all(wq);
+		}
+	}
+
 	ocfs2_iocb_clear_rw_locked(iocb);
 
 	level = ocfs2_iocb_rw_locked_level(iocb);
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 75cf3ad..ffb2da3 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -78,6 +78,7 @@ enum ocfs2_iocb_lock_bits {
 	OCFS2_IOCB_RW_LOCK = 0,
 	OCFS2_IOCB_RW_LOCK_LEVEL,
 	OCFS2_IOCB_SEM,
+	OCFS2_IOCB_UNALIGNED_IO,
 	OCFS2_IOCB_NUM_LOCKS
 };
 
@@ -91,4 +92,17 @@ enum ocfs2_iocb_lock_bits {
 	clear_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
 #define ocfs2_iocb_is_sem_locked(iocb) \
 	test_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
+
+#define ocfs2_iocb_set_unaligned_aio(iocb) \
+	set_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
+#define ocfs2_iocb_clear_unaligned_aio(iocb) \
+	clear_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
+#define ocfs2_iocb_is_unaligned_aio(iocb) \
+	test_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
+
+#define OCFS2_IOEND_WQ_HASH_SZ	37
+#define ocfs2_ioend_wq(v)   (&ocfs2__ioend_wq[((unsigned long)(v)) %\
+					    OCFS2_IOEND_WQ_HASH_SZ])
+extern wait_queue_head_t ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ];
+
 #endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 89659d6..4c909c9 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2038,6 +2038,23 @@ out:
 	return ret;
 }
 
+static void ocfs2_aiodio_wait(struct inode *inode)
+{
+	wait_queue_head_t *wq = ocfs2_ioend_wq(inode);
+
+	wait_event(*wq, (atomic_read(&OCFS2_I(inode)->ip_unaligned_aio) == 0));
+}
+
+static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos)
+{
+	int blockmask = inode->i_sb->s_blocksize - 1;
+	loff_t final_size = pos + count;
+
+	if ((pos & blockmask) || (final_size & blockmask))
+		return 1;
+	return 0;
+}
+
 static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
 					    struct file *file,
 					    loff_t pos, size_t count,
@@ -2216,6 +2233,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	int full_coherency = !(osb->s_mount_opt &
 			       OCFS2_MOUNT_COHERENCY_BUFFERED);
+	int unaligned_dio = 0;
 
 	trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry,
 		(unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -2284,6 +2302,10 @@ relock:
 		goto out;
 	}
 
+	if (direct_io && !is_sync_kiocb(iocb))
+		unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_left,
+						      *ppos);
+
 	/*
 	 * We can't complete the direct I/O as requested, fall back to
 	 * buffered I/O.
@@ -2299,6 +2321,18 @@ relock:
 		goto relock;
 	}
 
+	if (unaligned_dio) {
+		/*
+		 * Wait on previous unaligned aio to complete before
+		 * proceeding.
+		 */
+		ocfs2_aiodio_wait(inode);
+
+		/* Mark the iocb as needing a decrement in ocfs2_dio_end_io */
+		atomic_inc(&OCFS2_I(inode)->ip_unaligned_aio);
+		ocfs2_iocb_set_unaligned_aio(iocb);
+	}
+
 	/*
 	 * To later detect whether a journal commit for sync writes is
 	 * necessary, we sample i_size, and cluster count here.
@@ -2371,8 +2405,12 @@ out_dio:
 	if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
 		rw_level = -1;
 		have_alloc_sem = 0;
+		unaligned_dio = 0;
 	}
 
+	if (unaligned_dio)
+		atomic_dec(&OCFS2_I(inode)->ip_unaligned_aio);
+
 out:
 	if (rw_level != -1)
 		ocfs2_rw_unlock(inode, rw_level);
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 1c508b1..88924a3 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -43,6 +43,9 @@ struct ocfs2_inode_info
 	/* protects extended attribute changes on this inode */
 	struct rw_semaphore		ip_xattr_sem;
 
+	/* Number of outstanding AIO's which are not page aligned */
+	atomic_t			ip_unaligned_aio;
+
 	/* These fields are protected by ip_lock */
 	spinlock_t			ip_lock;
 	u32				ip_open_count;
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 5a521c7..6b7b415 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -53,6 +53,7 @@
 #include "ocfs1_fs_compat.h"
 
 #include "alloc.h"
+#include "aops.h"
 #include "blockcheck.h"
 #include "dlmglue.h"
 #include "export.h"
@@ -1615,12 +1616,17 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 	return 0;
 }
 
+wait_queue_head_t ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ];
+
 static int __init ocfs2_init(void)
 {
-	int status;
+	int status, i;
 
 	ocfs2_print_version();
 
+	for (i = 0; i < OCFS2_IOEND_WQ_HASH_SZ; i++)
+		init_waitqueue_head(&ocfs2__ioend_wq[i]);
+
 	status = init_ocfs2_uptodate_cache();
 	if (status < 0) {
 		mlog_errno(status);
@@ -1759,7 +1765,7 @@ static void ocfs2_inode_init_once(void *data)
 	ocfs2_extent_map_init(&oi->vfs_inode);
 	INIT_LIST_HEAD(&oi->ip_io_markers);
 	oi->ip_dir_start_lookup = 0;
-
+	atomic_set(&oi->ip_unaligned_aio, 0);
 	init_rwsem(&oi->ip_alloc_sem);
 	init_rwsem(&oi->ip_xattr_sem);
 	mutex_init(&oi->ip_io_mutex);
-- 
1.7.5.3

* [Ocfs2-devel] ocfs2: serialize unaligned aio
  2011-06-22 21:23 [Ocfs2-devel] ocfs2: serialize unaligned aio Mark Fasheh
@ 2011-06-23  3:44 ` Tao Ma
  2011-06-26  7:22 ` Joel Becker
  2011-07-28  9:08 ` Joel Becker
  2 siblings, 0 replies; 7+ messages in thread
From: Tao Ma @ 2011-06-23  3:44 UTC (permalink / raw)
  To: ocfs2-devel

Hi Mark,
On 06/23/2011 05:23 AM, Mark Fasheh wrote:
> [snip]
> +	if (ocfs2_iocb_is_unaligned_aio(iocb)) {
> +		ocfs2_iocb_clear_unaligned_aio(iocb);
> +
> +		if (atomic_dec_and_test(&OCFS2_I(inode)->ip_unaligned_aio) &&
> +		    waitqueue_active(wq)) {
> +			wake_up_all(wq);
> +		}
> +	}
> [snip]
> +	if (unaligned_dio) {
> +		/*
> +		 * Wait on previous unaligned aio to complete before
> +		 * proceeding.
> +		 */
> +		ocfs2_aiodio_wait(inode);
> +
> +		/* Mark the iocb as needing a decrement in ocfs2_dio_end_io */
> +		atomic_inc(&OCFS2_I(inode)->ip_unaligned_aio);
> +		ocfs2_iocb_set_unaligned_aio(iocb);
> +	}
> +
I just have one question here. In ocfs2_aiodio_wait, we just wait for the
event, and in ocfs2_dio_end_io, we wake up all the waiters.

So if two aios come in while one aio is in progress, those two will wait
here, and after the first aio completes, both of them will be woken up and
will actually run at the same time. Maybe we have to use an exclusive wait
here and use wake_up so that only one of the two is woken up to proceed?
Or is it designed like this intentionally?

I have gone through the ext4 patches and yes, ext4 does a similar thing,
but I am not sure whether the case is the same, since I am not very
familiar with ext4 yet.
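
For illustration, here is a hypothetical userspace pthread model of the
protocol in question (not kernel code, and not part of the patch).  The
gap between the wait and the increment is exactly the window described
above: several waiters can all observe a zero count before any of them
marks itself in flight, so all of them proceed.  Build with
"gcc model.c -o model -pthread".

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t wq = PTHREAD_COND_INITIALIZER; /* models ocfs2_ioend_wq() */
static int in_flight;                                /* models ip_unaligned_aio */

static void aiodio_wait(void)           /* models ocfs2_aiodio_wait() */
{
	pthread_mutex_lock(&lock);
	while (in_flight != 0)          /* models wait_event(*wq, count == 0) */
		pthread_cond_wait(&wq, &lock);
	pthread_mutex_unlock(&lock);
}

static void mark_unaligned(void)        /* models atomic_inc() */
{
	pthread_mutex_lock(&lock);
	in_flight++;
	pthread_mutex_unlock(&lock);
}

static void dio_end_io(void)            /* models the completion side */
{
	pthread_mutex_lock(&lock);
	if (--in_flight == 0)
		pthread_cond_broadcast(&wq);    /* models wake_up_all() */
	pthread_mutex_unlock(&lock);
}

static void *writer(void *arg)
{
	aiodio_wait();
	mark_unaligned();       /* another waiter may already be past the wait */
	printf("writer %ld running\n", (long)arg);
	usleep(100 * 1000);     /* pretend to do the unaligned dio */
	dio_end_io();
	return NULL;
}

int main(void)
{
	pthread_t t[3];
	long i;

	for (i = 0; i < 3; i++)
		pthread_create(&t[i], NULL, writer, (void *)i);
	for (i = 0; i < 3; i++)
		pthread_join(t[i], NULL);
	return 0;
}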

Regards,
Tao

* [Ocfs2-devel] ocfs2: serialize unaligned aio
  2011-06-22 21:23 [Ocfs2-devel] ocfs2: serialize unaligned aio Mark Fasheh
  2011-06-23  3:44 ` Tao Ma
@ 2011-06-26  7:22 ` Joel Becker
  2011-06-27 16:23   ` Mark Fasheh
  2011-07-28  9:08 ` Joel Becker
  2 siblings, 1 reply; 7+ messages in thread
From: Joel Becker @ 2011-06-26  7:22 UTC (permalink / raw)
  To: ocfs2-devel

On Wed, Jun 22, 2011 at 02:23:38PM -0700, Mark Fasheh wrote:
> Fix a corruption that can happen when we have (two or more) outstanding
> aios to an overlapping unaligned region.  Ext4 (commit
> e9e3bcecf44c04b9e6b505fd8e2eb9cea58fb94d) and xfs recently had to fix
> similar issues.
> 
> In our case, what happens is that we can have an outstanding aio on a
> region, and if a write comes in with some bytes overlapping the original
> aio, we may decide to read that region into a page before continuing
> (typically because of buffered-io fallback).  Since we have no ordering
> guarantees with the aio, we can read stale or bad data into the page and
> then write it back out.
> 
> If the i/o is page and block aligned, then we avoid this issue as there
> won't be any need to read data from disk.
> 
> I took the same approach as Eric in the ext4 patch and introduced some
> serialization of unaligned async direct i/o.  I don't expect this to have
> an effect on the most common cases of AIO.  Unaligned aio will be slower,
> but that's far more acceptable than data corruption.

	The patch looks good, but I'm a little confused.  Why doesn't
this matter for buffered I/O?  Just because that data is going through
the pagecache?  For a second, I couldn't see how unaligned dio was
possible, until I remembered this was block aligned, not sector aligned.
	Don't most of the major DIO users (read: databases) do
sector-aligned I/O?  Won't this affect them?
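
As a worked example (the numbers are illustrative: 512-byte sectors, 4K
filesystem blocks), sector-aligned i/o can still be block unaligned and
would therefore take the new serialized path.  This mirrors the patch's
ocfs2_is_io_unaligned() test:

#include <stdio.h>

int main(void)
{
	long long blockmask = 4096 - 1;         /* s_blocksize - 1 */
	long long pos = 512, count = 512;       /* sector-aligned dio */

	/* same check as ocfs2_is_io_unaligned() in the patch */
	if ((pos & blockmask) || ((pos + count) & blockmask))
		printf("block unaligned: serialized by this patch\n");
	else
		printf("block aligned: unaffected\n");
	return 0;
}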

Joel

-- 

"The whole problem with the world is that fools and fanatics are always
 so certain of themselves, and wiser people so full of doubts."
	- Bertrand Russell

			http://www.jlbec.org/
			jlbec at evilplan.org

* [Ocfs2-devel] ocfs2: serialize unaligned aio
  2011-06-26  7:22 ` Joel Becker
@ 2011-06-27 16:23   ` Mark Fasheh
  2011-06-27 16:43     ` Sunil Mushran
  0 siblings, 1 reply; 7+ messages in thread
From: Mark Fasheh @ 2011-06-27 16:23 UTC (permalink / raw)
  To: ocfs2-devel

On Sun, Jun 26, 2011 at 12:22:48AM -0700, Joel Becker wrote:
> On Wed, Jun 22, 2011 at 02:23:38PM -0700, Mark Fasheh wrote:
> > [snip]
> 
> 	The patch looks good, but I'm a little confused.  Why doesn't
> this matter for buffered I/O?  Just because that data is going through
> the pagecache?  For a second, I couldn't see how unaligned dio was
> possible, until I remembered this was block aligned, not sector aligned.

Buffered I/O is synchronous, so we don't really have any situation in
which two buffered I/Os are in flight at the same time.


> 	Don't most of the major DIO users (read: databases) do
> sector-aligned I/O?  Won't this affect them?

In 2.6? Anyway, Sunil will have to answer that question... I would guess
it's fine, though, since xfs and ext4 have the same patch and there don't
seem to be major reports from Oracle of DB performance tanking. That's
hardly solid evidence, of course.
	--Mark

--
Mark Fasheh

* [Ocfs2-devel] ocfs2: serialize unaligned aio
  2011-06-27 16:23   ` Mark Fasheh
@ 2011-06-27 16:43     ` Sunil Mushran
  2011-06-27 17:26       ` Mark Fasheh
  0 siblings, 1 reply; 7+ messages in thread
From: Sunil Mushran @ 2011-06-27 16:43 UTC (permalink / raw)
  To: ocfs2-devel

On 06/27/2011 09:23 AM, Mark Fasheh wrote:
> On Sun, Jun 26, 2011 at 12:22:48AM -0700, Joel Becker wrote:
>> On Wed, Jun 22, 2011 at 02:23:38PM -0700, Mark Fasheh wrote:
>>> [snip]
>> 	The patch looks good, but I'm a little confused.  Why doesn't
>> this matter for buffered I/O?  Just because that data is going through
>> the pagecache?  For a second, I couldn't see how unaligned dio was
>> possible, until I remembered this was block aligned, not sector aligned.
> Buffered I/O is synchronous, so we don't really have any situation in
> which two buffered I/Os are in flight at the same time.
>
>
>> 	Don't most of the major DIO users (read: databases) do
>> sector-aligned I/O?  Won't this affect them?
> In 2.6? Anyway, Sunil will have to answer that question... I would guess
> it's fine, though, since xfs and ext4 have the same patch and there don't
> seem to be major reports from Oracle of DB performance tanking. That's
> hardly solid evidence, of course.

The Oracle db is not a concern as it fully allocates (and inits) the blocks.
The exception is the RMAN backup that does extending (aio) direct writes.

If I remember correctly, this issue was reported by KVM users, at least
on ext4/xfs.

* [Ocfs2-devel] ocfs2: serialize unaligned aio
  2011-06-27 16:43     ` Sunil Mushran
@ 2011-06-27 17:26       ` Mark Fasheh
  0 siblings, 0 replies; 7+ messages in thread
From: Mark Fasheh @ 2011-06-27 17:26 UTC (permalink / raw)
  To: ocfs2-devel

On Mon, Jun 27, 2011 at 09:43:34AM -0700, Sunil Mushran wrote:
> [snip]
> 
> The Oracle db is not a concern as it fully allocates (and inits) the blocks.
> The exception is the RMAN backup that does extending (aio) direct writes.

In this case (RMAN), wouldn't we just always be falling back to buffered
anyway, since it's always extending?
	--Mark

--
Mark Fasheh

* [Ocfs2-devel] ocfs2: serialize unaligned aio
  2011-06-22 21:23 [Ocfs2-devel] ocfs2: serialize unaligned aio Mark Fasheh
  2011-06-23  3:44 ` Tao Ma
  2011-06-26  7:22 ` Joel Becker
@ 2011-07-28  9:08 ` Joel Becker
  2 siblings, 0 replies; 7+ messages in thread
From: Joel Becker @ 2011-07-28  9:08 UTC (permalink / raw)
  To: ocfs2-devel

On Wed, Jun 22, 2011 at 02:23:38PM -0700, Mark Fasheh wrote:
> [snip]

This patch is now in the fixes branch of ocfs2.git.

Joel


-- 

"The one important thing i have learned over the years is the
 difference between taking one's work seriously and taking one's self
 seriously.  The first is imperative and the second is disastrous."
	-Margot Fonteyn

			http://www.jlbec.org/
			jlbec at evilplan.org
