All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] aio: propogate post-EIOCBQUEUED errors to completion event
@ 2007-02-19 20:35 Zach Brown
  2007-02-19 20:47 ` Benjamin LaHaise
                   ` (2 more replies)
  0 siblings, 3 replies; 18+ messages in thread
From: Zach Brown @ 2007-02-19 20:35 UTC (permalink / raw)
  To: linux-aio, linux-kernel
  Cc: Benjamin LaHaise, Suparna bhattacharya, Andrew Morton, Leonid Ananiev

aio: propogate post-EIOCBQUEUED errors to completion event

This addresses an oops reported by Leonid Ananiev <leonid.i.ananiev@intel.com>
as archived at http://lkml.org/lkml/2007/2/8/337.

O_DIRECT kicks off bios and returns -EIOCBQUEUED to indicate its intention to
call aio_complete() once the bios complete.   As we return from submission we
must preserve the -EIOCBQUEUED return code so that fs/aio.c knows to let the
bio completion call aio_complete().  This stops us from returning errors after
O_DIRECT submission.

But we have a few places that are very interested in generating errors after
bio submission.

The most critical of these is invalidating the page cache after a write.  This
avoids exposing stale data to buffered operations that are performed after the
O_DIRECT write succeeds.  We must do this after submission because the user
buffer might have been an mmap()ed buffer of the region being written to.  The
get_user_pages() in the O_DIRECT completion path could have faulted in stale
data.

So this patch introduces a helper, aio_propogate_error(), which queues
post-submission errors in the iocb so that they are given to the user
completion event when aio_complete() is finally called.

To get this working we change the aio_complete() path so that the ring
insertion is performed as the final iocb reference is dropped.  This gives the
submission path time to queue its pending error before it drops its reference.
This increases the space in the iocb as it has to record the two result codes
from aio_complete() and the pending error from the submission path.

This was tested by running O_DIRECT aio-stress concurrently with buffered reads
while triggering EIO in invalidate_inode_pages2_range() with the help of a
debugfs bool hack.  Previously the kernel would oops as fs/aio.c and bio
completion both called aio_complete().  With this patch aio-stress sees -EIO.

Signed-off-by: Zach Brown <zach.brown@oracle.com>
---

 fs/aio.c            |  109 ++++++++++++++++++++++--------------------
 include/linux/aio.h |   30 +++++++++++
 mm/filemap.c        |    4 -
 3 files changed, 91 insertions(+), 52 deletions(-)

--- a/fs/aio.c	Mon Feb 19 11:52:27 2007 -0800
+++ b/fs/aio.c	Mon Feb 19 12:32:52 2007 -0800
@@ -192,6 +192,46 @@ static int aio_setup_ring(struct kioctx 
 	(void)__event;				\
 	kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK), km); \
 } while(0)
+
+static void aio_ring_insert_entry(struct kioctx *ctx, struct kiocb *iocb)
+{
+	struct aio_ring_info	*info;
+	struct aio_ring		*ring;
+	struct io_event		*event;
+	unsigned long		tail;	
+
+	assert_spin_locked(&ctx->ctx_lock);
+	
+	info = &ctx->ring_info;
+	ring = kmap_atomic(info->ring_pages[0], KM_IRQ1);
+
+	tail = info->tail;
+	event = aio_ring_event(info, tail, KM_IRQ0);
+	if (++tail >= info->nr)
+		tail = 0;
+
+	event->obj = (u64)(unsigned long)iocb->ki_obj.user;
+	event->data = iocb->ki_user_data;
+	event->res = iocb->ki_pending_err ? iocb->ki_pending_err : iocb->ki_res;
+	event->res2 = iocb->ki_res2;
+
+	dprintk("aio_complete: %p[%lu]: %p: %p %Lx %d %lx %lx\n",
+		ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data,
+		iocb->ki_pending_err, iocb->ki_res, iocb->ki_res2);
+
+	/* after flagging the request as done, we
+	 * must never even look at it again
+	 */
+	smp_wmb();	/* make event visible before updating tail */
+
+	info->tail = tail;
+	ring->tail = tail;
+
+	put_aio_ring_event(event, KM_IRQ0);
+	kunmap_atomic(ring, KM_IRQ1);
+
+	pr_debug("added to ring %p at [%lu]\n", iocb, tail);
+}
 
 /* ioctx_alloc
  *	Allocates and initializes an ioctx.  Returns an ERR_PTR if it failed.
@@ -418,6 +458,7 @@ static struct kiocb fastcall *__aio_get_
 	req->ki_cancel = NULL;
 	req->ki_retry = NULL;
 	req->ki_dtor = NULL;
+	req->ki_pending_err = 0;
 	req->private = NULL;
 	req->ki_iovec = NULL;
 	INIT_LIST_HEAD(&req->ki_run_list);
@@ -507,10 +548,14 @@ static int __aio_put_req(struct kioctx *
 
 	assert_spin_locked(&ctx->ctx_lock);
 
-	req->ki_users --;
+	req->ki_users--;
 	BUG_ON(req->ki_users < 0);
 	if (likely(req->ki_users))
 		return 0;
+
+	if (kiocbIsInserted(req))
+		aio_ring_insert_entry(ctx, req);
+
 	list_del(&req->ki_list);		/* remove from active_reqs */
 	req->ki_cancel = NULL;
 	req->ki_retry = NULL;
@@ -924,12 +969,8 @@ int fastcall aio_complete(struct kiocb *
 int fastcall aio_complete(struct kiocb *iocb, long res, long res2)
 {
 	struct kioctx	*ctx = iocb->ki_ctx;
-	struct aio_ring_info	*info;
-	struct aio_ring	*ring;
-	struct io_event	*event;
+	int		ret;
 	unsigned long	flags;
-	unsigned long	tail;
-	int		ret;
 
 	/*
 	 * Special case handling for sync iocbs:
@@ -946,56 +987,24 @@ int fastcall aio_complete(struct kiocb *
 		return 1;
 	}
 
-	info = &ctx->ring_info;
-
-	/* add a completion event to the ring buffer.
-	 * must be done holding ctx->ctx_lock to prevent
-	 * other code from messing with the tail
-	 * pointer since we might be called from irq
-	 * context.
+	/*
+	 * We queue up the completion codes into the iocb.  They are combined
+	 * with a potential error from the submission path and inserted into
+	 * the ring once the last reference to the iocb is dropped.  Cancelled
+	 * iocbs don't insert events on completion because userland was given
+	 * an event directly as part of the cancelation interface.
 	 */
 	spin_lock_irqsave(&ctx->ctx_lock, flags);
 
 	if (iocb->ki_run_list.prev && !list_empty(&iocb->ki_run_list))
 		list_del_init(&iocb->ki_run_list);
 
-	/*
-	 * cancelled requests don't get events, userland was given one
-	 * when the event got cancelled.
-	 */
-	if (kiocbIsCancelled(iocb))
-		goto put_rq;
-
-	ring = kmap_atomic(info->ring_pages[0], KM_IRQ1);
-
-	tail = info->tail;
-	event = aio_ring_event(info, tail, KM_IRQ0);
-	if (++tail >= info->nr)
-		tail = 0;
-
-	event->obj = (u64)(unsigned long)iocb->ki_obj.user;
-	event->data = iocb->ki_user_data;
-	event->res = res;
-	event->res2 = res2;
-
-	dprintk("aio_complete: %p[%lu]: %p: %p %Lx %lx %lx\n",
-		ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data,
-		res, res2);
-
-	/* after flagging the request as done, we
-	 * must never even look at it again
-	 */
-	smp_wmb();	/* make event visible before updating tail */
-
-	info->tail = tail;
-	ring->tail = tail;
-
-	put_aio_ring_event(event, KM_IRQ0);
-	kunmap_atomic(ring, KM_IRQ1);
-
-	pr_debug("added to ring %p at [%lu]\n", iocb, tail);
-put_rq:
-	/* everything turned out well, dispose of the aiocb. */
+	if (!kiocbIsCancelled(iocb)) {
+		iocb->ki_res = res;
+		iocb->ki_res2 = res2;
+		kiocbSetInserted(iocb);
+	}
+
 	ret = __aio_put_req(ctx, iocb);
 
 	if (waitqueue_active(&ctx->wait))
diff -r 8c8c075ae1c3 include/linux/aio.h
--- a/include/linux/aio.h	Mon Feb 19 11:52:27 2007 -0800
+++ b/include/linux/aio.h	Mon Feb 19 12:32:36 2007 -0800
@@ -34,6 +34,7 @@ struct kioctx;
 /* #define KIF_LOCKED		0 */
 #define KIF_KICKED		1
 #define KIF_CANCELLED		2
+#define KIF_INSERTED		4
 
 #define kiocbTryLock(iocb)	test_and_set_bit(KIF_LOCKED, &(iocb)->ki_flags)
 #define kiocbTryKick(iocb)	test_and_set_bit(KIF_KICKED, &(iocb)->ki_flags)
@@ -41,6 +42,7 @@ struct kioctx;
 #define kiocbSetLocked(iocb)	set_bit(KIF_LOCKED, &(iocb)->ki_flags)
 #define kiocbSetKicked(iocb)	set_bit(KIF_KICKED, &(iocb)->ki_flags)
 #define kiocbSetCancelled(iocb)	set_bit(KIF_CANCELLED, &(iocb)->ki_flags)
+#define kiocbSetInserted(iocb)	set_bit(KIF_INSERTED, &(iocb)->ki_flags)
 
 #define kiocbClearLocked(iocb)	clear_bit(KIF_LOCKED, &(iocb)->ki_flags)
 #define kiocbClearKicked(iocb)	clear_bit(KIF_KICKED, &(iocb)->ki_flags)
@@ -49,6 +51,7 @@ struct kioctx;
 #define kiocbIsLocked(iocb)	test_bit(KIF_LOCKED, &(iocb)->ki_flags)
 #define kiocbIsKicked(iocb)	test_bit(KIF_KICKED, &(iocb)->ki_flags)
 #define kiocbIsCancelled(iocb)	test_bit(KIF_CANCELLED, &(iocb)->ki_flags)
+#define kiocbIsInserted(iocb)	test_bit(KIF_INSERTED, &(iocb)->ki_flags)
 
 /* is there a better place to document function pointer methods? */
 /**
@@ -119,6 +122,10 @@ struct kiocb {
 
 	struct list_head	ki_list;	/* the aio core uses this
 						 * for cancellation */
+	/* we store and combine return codes from submission and completion */ 
+	int			ki_pending_err;
+	long			ki_res;
+	long			ki_res2;
 };
 
 #define is_sync_kiocb(iocb)	((iocb)->ki_key == KIOCB_SYNC_KEY)
@@ -246,6 +253,29 @@ static inline struct kiocb *list_kiocb(s
 	return list_entry(h, struct kiocb, ki_list);
 }
 
+/*
+ * This function is used to make sure that an error is communicated to
+ * userspace on iocb completion without stopping -EIOCBQUEUED from bubbling up
+ * to fs/aio.c from the place where it originated.
+ *
+ * If we have an existing -EIOCBQUEUED it must be returned all the way to
+ * fs/aio.c so that it doesn't double-complete the iocb along with whoever
+ * returned -EIOCBQUEUED.  In that case we put the new error in the iocb.  It
+ * will be returned to userspace *instead of* the first result code given to
+ * aio_complete().  Use this only for errors which must overwrite whatever the
+ * return code might have been.  The first non-zero new_err given to this
+ * function for a given iocb will be returned to userspace.
+ */
+static inline int aio_propogate_error(struct kiocb *iocb, int existing_err,
+				      int new_err)
+{
+	if (existing_err != -EIOCBQUEUED)
+		return new_err;
+	if (!iocb->ki_pending_err)
+		iocb->ki_pending_err = new_err;
+	return -EIOCBQUEUED;
+}
+
 /* for sysctl: */
 extern unsigned long aio_nr;
 extern unsigned long aio_max_nr;
diff -r 8c8c075ae1c3 mm/filemap.c
--- a/mm/filemap.c	Mon Feb 19 11:52:27 2007 -0800
+++ b/mm/filemap.c	Mon Feb 19 12:32:36 2007 -0800
@@ -2031,7 +2031,7 @@ generic_file_direct_write(struct kiocb *
 	    ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
 		int err = generic_osync_inode(inode, mapping, OSYNC_METADATA);
 		if (err < 0)
-			written = err;
+			written = aio_propogate_error(iocb, written, err);
 	}
 	return written;
 }
@@ -2396,7 +2396,7 @@ generic_file_direct_IO(int rw, struct ki
 			int err = invalidate_inode_pages2_range(mapping,
 					offset >> PAGE_CACHE_SHIFT, end);
 			if (err)
-				retval = err;
+				retval = aio_propogate_error(iocb, retval, err);
 		}
 	}
 	return retval;

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] aio: propogate post-EIOCBQUEUED errors to completion event
  2007-02-19 20:35 [PATCH] aio: propogate post-EIOCBQUEUED errors to completion event Zach Brown
@ 2007-02-19 20:47 ` Benjamin LaHaise
  2007-02-19 21:07   ` Zach Brown
  2007-02-19 20:58 ` Ananiev, Leonid I
  2007-02-20 14:08 ` Ananiev, Leonid I
  2 siblings, 1 reply; 18+ messages in thread
From: Benjamin LaHaise @ 2007-02-19 20:47 UTC (permalink / raw)
  To: Zach Brown
  Cc: linux-aio, linux-kernel, Suparna bhattacharya, Andrew Morton,
	Leonid Ananiev

On Mon, Feb 19, 2007 at 12:35:27PM -0800, Zach Brown wrote:
> aio: propogate post-EIOCBQUEUED errors to completion event

This patch has to be split in 2 to make it easier to review -- the change 
moving the event insertion code for the completion queue should stand 
separately, as it is completely unrelated to the rest of the patch.  I 
*think* the patch is right, but picking the changes to the code and watching 
its movement at the same time is making my head spin.

		-ben
-- 
"Time is of no importance, Mr. President, only life is important."
Don't Email: <dont@kvack.org>.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* RE: [PATCH] aio: propogate post-EIOCBQUEUED errors to completion event
  2007-02-19 20:35 [PATCH] aio: propogate post-EIOCBQUEUED errors to completion event Zach Brown
  2007-02-19 20:47 ` Benjamin LaHaise
@ 2007-02-19 20:58 ` Ananiev, Leonid I
  2007-02-19 21:50   ` Chris Mason
  2007-02-20 14:08 ` Ananiev, Leonid I
  2 siblings, 1 reply; 18+ messages in thread
From: Ananiev, Leonid I @ 2007-02-19 20:58 UTC (permalink / raw)
  To: Zach Brown, linux-aio, linux-kernel
  Cc: Benjamin LaHaise, Suparna bhattacharya, Andrew Morton

> while triggering EIO in invalidate_inode_pages2_range() 
...
> With this patch aio-stress sees -EIO.

Actually, if invalidate_inode_pages2_range() returns EIO it means
that an internal kernel synchronization conflict has happened.
It is reported to the user as a hardware IO error.
An iteration in the synchronization process could be performed instead.

Leonid
-----Original Message-----
From: Zach Brown [mailto:zach.brown@oracle.com] 
Sent: Monday, February 19, 2007 11:35 PM
To: linux-aio@kvack.org; linux-kernel@vger.kernel.org
Cc: Benjamin LaHaise; Suparna bhattacharya; Andrew Morton; Ananiev,
Leonid I
Subject: [PATCH] aio: propogate post-EIOCBQUEUED errors to completion
event

aio: propogate post-EIOCBQUEUED errors to completion event

This addresses an oops reported by Leonid Ananiev
<leonid.i.ananiev@intel.com>
as archived at http://lkml.org/lkml/2007/2/8/337.

O_DIRECT kicks off bios and returns -EIOCBQUEUED to indicate its
intention to
call aio_complete() once the bios complete.   As we return from
submission we
must preserve the -EIOCBQUEUED return code so that fs/aio.c knows to let
the
bio completion call aio_complete().  This stops us from returning errors
after
O_DIRECT submission.

But we have a few places that are very interested in generating errors
after
bio submission.

The most critical of these is invalidating the page cache after a write.
This
avoids exposing stale data to buffered operations that are performed
after the
O_DIRECT write succeeds.  We must do this after submission because the
user
buffer might have been an mmap()ed buffer of the region being written
to.  The
get_user_pages() in the O_DIRECT completion path could have faulted in
stale
data.

So this patch introduces a helper, aio_propogate_error(), which queues
post-submission errors in the iocb so that they are given to the user
completion event when aio_complete() is finally called.

To get this working we change the aio_complete() path so that the ring
insertion is performed as the final iocb reference is dropped.  This
gives the
submission path time to queue its pending error before it drops its
reference.
This increases the space in the iocb as it has to record the two result
codes
from aio_complete() and the pending error from the submission path.

This was tested by running O_DIRECT aio-stress concurrently with
buffered reads
while triggering EIO in invalidate_inode_pages2_range() with the help of
a
debugfs bool hack.  Previously the kernel would oops as fs/aio.c and bio
completion both called aio_complete().  With this patch aio-stress sees
-EIO.

Signed-off-by: Zach Brown <zach.brown@oracle.com>
---

 fs/aio.c            |  109 ++++++++++++++++++++++--------------------
 include/linux/aio.h |   30 +++++++++++
 mm/filemap.c        |    4 -
 3 files changed, 91 insertions(+), 52 deletions(-)

--- a/fs/aio.c	Mon Feb 19 11:52:27 2007 -0800
+++ b/fs/aio.c	Mon Feb 19 12:32:52 2007 -0800
@@ -192,6 +192,46 @@ static int aio_setup_ring(struct kioctx 
 	(void)__event;				\
 	kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK), km);
\
 } while(0)
+
+static void aio_ring_insert_entry(struct kioctx *ctx, struct kiocb
*iocb)
+{
+	struct aio_ring_info	*info;
+	struct aio_ring		*ring;
+	struct io_event		*event;
+	unsigned long		tail;	
+
+	assert_spin_locked(&ctx->ctx_lock);
+	
+	info = &ctx->ring_info;
+	ring = kmap_atomic(info->ring_pages[0], KM_IRQ1);
+
+	tail = info->tail;
+	event = aio_ring_event(info, tail, KM_IRQ0);
+	if (++tail >= info->nr)
+		tail = 0;
+
+	event->obj = (u64)(unsigned long)iocb->ki_obj.user;
+	event->data = iocb->ki_user_data;
+	event->res = iocb->ki_pending_err ? iocb->ki_pending_err :
iocb->ki_res;
+	event->res2 = iocb->ki_res2;
+
+	dprintk("aio_complete: %p[%lu]: %p: %p %Lx %d %lx %lx\n",
+		ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data,
+		iocb->ki_pending_err, iocb->ki_res, iocb->ki_res2);
+
+	/* after flagging the request as done, we
+	 * must never even look at it again
+	 */
+	smp_wmb();	/* make event visible before updating tail */
+
+	info->tail = tail;
+	ring->tail = tail;
+
+	put_aio_ring_event(event, KM_IRQ0);
+	kunmap_atomic(ring, KM_IRQ1);
+
+	pr_debug("added to ring %p at [%lu]\n", iocb, tail);
+}
 
 /* ioctx_alloc
  *	Allocates and initializes an ioctx.  Returns an ERR_PTR if it
failed.
@@ -418,6 +458,7 @@ static struct kiocb fastcall *__aio_get_
 	req->ki_cancel = NULL;
 	req->ki_retry = NULL;
 	req->ki_dtor = NULL;
+	req->ki_pending_err = 0;
 	req->private = NULL;
 	req->ki_iovec = NULL;
 	INIT_LIST_HEAD(&req->ki_run_list);
@@ -507,10 +548,14 @@ static int __aio_put_req(struct kioctx *
 
 	assert_spin_locked(&ctx->ctx_lock);
 
-	req->ki_users --;
+	req->ki_users--;
 	BUG_ON(req->ki_users < 0);
 	if (likely(req->ki_users))
 		return 0;
+
+	if (kiocbIsInserted(req))
+		aio_ring_insert_entry(ctx, req);
+
 	list_del(&req->ki_list);		/* remove from
active_reqs */
 	req->ki_cancel = NULL;
 	req->ki_retry = NULL;
@@ -924,12 +969,8 @@ int fastcall aio_complete(struct kiocb *
 int fastcall aio_complete(struct kiocb *iocb, long res, long res2)
 {
 	struct kioctx	*ctx = iocb->ki_ctx;
-	struct aio_ring_info	*info;
-	struct aio_ring	*ring;
-	struct io_event	*event;
+	int		ret;
 	unsigned long	flags;
-	unsigned long	tail;
-	int		ret;
 
 	/*
 	 * Special case handling for sync iocbs:
@@ -946,56 +987,24 @@ int fastcall aio_complete(struct kiocb *
 		return 1;
 	}
 
-	info = &ctx->ring_info;
-
-	/* add a completion event to the ring buffer.
-	 * must be done holding ctx->ctx_lock to prevent
-	 * other code from messing with the tail
-	 * pointer since we might be called from irq
-	 * context.
+	/*
+	 * We queue up the completion codes into the iocb.  They are
combined
+	 * with a potential error from the submission path and inserted
into
+	 * the ring once the last reference to the iocb is dropped.
Cancelled
+	 * iocbs don't insert events on completion because userland was
given
+	 * an event directly as part of the cancelation interface.
 	 */
 	spin_lock_irqsave(&ctx->ctx_lock, flags);
 
 	if (iocb->ki_run_list.prev && !list_empty(&iocb->ki_run_list))
 		list_del_init(&iocb->ki_run_list);
 
-	/*
-	 * cancelled requests don't get events, userland was given one
-	 * when the event got cancelled.
-	 */
-	if (kiocbIsCancelled(iocb))
-		goto put_rq;
-
-	ring = kmap_atomic(info->ring_pages[0], KM_IRQ1);
-
-	tail = info->tail;
-	event = aio_ring_event(info, tail, KM_IRQ0);
-	if (++tail >= info->nr)
-		tail = 0;
-
-	event->obj = (u64)(unsigned long)iocb->ki_obj.user;
-	event->data = iocb->ki_user_data;
-	event->res = res;
-	event->res2 = res2;
-
-	dprintk("aio_complete: %p[%lu]: %p: %p %Lx %lx %lx\n",
-		ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data,
-		res, res2);
-
-	/* after flagging the request as done, we
-	 * must never even look at it again
-	 */
-	smp_wmb();	/* make event visible before updating tail */
-
-	info->tail = tail;
-	ring->tail = tail;
-
-	put_aio_ring_event(event, KM_IRQ0);
-	kunmap_atomic(ring, KM_IRQ1);
-
-	pr_debug("added to ring %p at [%lu]\n", iocb, tail);
-put_rq:
-	/* everything turned out well, dispose of the aiocb. */
+	if (!kiocbIsCancelled(iocb)) {
+		iocb->ki_res = res;
+		iocb->ki_res2 = res2;
+		kiocbSetInserted(iocb);
+	}
+
 	ret = __aio_put_req(ctx, iocb);
 
 	if (waitqueue_active(&ctx->wait))
diff -r 8c8c075ae1c3 include/linux/aio.h
--- a/include/linux/aio.h	Mon Feb 19 11:52:27 2007 -0800
+++ b/include/linux/aio.h	Mon Feb 19 12:32:36 2007 -0800
@@ -34,6 +34,7 @@ struct kioctx;
 /* #define KIF_LOCKED		0 */
 #define KIF_KICKED		1
 #define KIF_CANCELLED		2
+#define KIF_INSERTED		4
 
 #define kiocbTryLock(iocb)	test_and_set_bit(KIF_LOCKED,
&(iocb)->ki_flags)
 #define kiocbTryKick(iocb)	test_and_set_bit(KIF_KICKED,
&(iocb)->ki_flags)
@@ -41,6 +42,7 @@ struct kioctx;
 #define kiocbSetLocked(iocb)	set_bit(KIF_LOCKED, &(iocb)->ki_flags)
 #define kiocbSetKicked(iocb)	set_bit(KIF_KICKED, &(iocb)->ki_flags)
 #define kiocbSetCancelled(iocb)	set_bit(KIF_CANCELLED,
&(iocb)->ki_flags)
+#define kiocbSetInserted(iocb)	set_bit(KIF_INSERTED, &(iocb)->ki_flags)
 
 #define kiocbClearLocked(iocb)	clear_bit(KIF_LOCKED, &(iocb)->ki_flags)
 #define kiocbClearKicked(iocb)	clear_bit(KIF_KICKED, &(iocb)->ki_flags)
@@ -49,6 +51,7 @@ struct kioctx;
 #define kiocbIsLocked(iocb)	test_bit(KIF_LOCKED, &(iocb)->ki_flags)
 #define kiocbIsKicked(iocb)	test_bit(KIF_KICKED, &(iocb)->ki_flags)
 #define kiocbIsCancelled(iocb)	test_bit(KIF_CANCELLED,
&(iocb)->ki_flags)
+#define kiocbIsInserted(iocb)	test_bit(KIF_INSERTED,
&(iocb)->ki_flags)
 
 /* is there a better place to document function pointer methods? */
 /**
@@ -119,6 +122,10 @@ struct kiocb {
 
 	struct list_head	ki_list;	/* the aio core uses
this
 						 * for cancellation */
+	/* we store and combine return codes from submission and
completion */ 
+	int			ki_pending_err;
+	long			ki_res;
+	long			ki_res2;
 };
 
 #define is_sync_kiocb(iocb)	((iocb)->ki_key == KIOCB_SYNC_KEY)
@@ -246,6 +253,29 @@ static inline struct kiocb *list_kiocb(s
 	return list_entry(h, struct kiocb, ki_list);
 }
 
+/*
+ * This function is used to make sure that an error is communicated to
+ * userspace on iocb completion without stopping -EIOCBQUEUED from
bubbling up
+ * to fs/aio.c from the place where it originated.
+ *
+ * If we have an existing -EIOCBQUEUED it must be returned all the way
to
+ * fs/aio.c so that it doesn't double-complete the iocb along with
whoever
+ * returned -EIOCBQUEUED.  In that case we put the new error in the
iocb.  It
+ * will be returned to userspace *instead of* the first result code
given to
+ * aio_complete().  Use this only for errors which must overwrite
whatever the
+ * return code might have been.  The first non-zero new_err given to
this
+ * function for a given iocb will be returned to userspace.
+ */
+static inline int aio_propogate_error(struct kiocb *iocb, int
existing_err,
+				      int new_err)
+{
+	if (existing_err != -EIOCBQUEUED)
+		return new_err;
+	if (!iocb->ki_pending_err)
+		iocb->ki_pending_err = new_err;
+	return -EIOCBQUEUED;
+}
+
 /* for sysctl: */
 extern unsigned long aio_nr;
 extern unsigned long aio_max_nr;
diff -r 8c8c075ae1c3 mm/filemap.c
--- a/mm/filemap.c	Mon Feb 19 11:52:27 2007 -0800
+++ b/mm/filemap.c	Mon Feb 19 12:32:36 2007 -0800
@@ -2031,7 +2031,7 @@ generic_file_direct_write(struct kiocb *
 	    ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
 		int err = generic_osync_inode(inode, mapping,
OSYNC_METADATA);
 		if (err < 0)
-			written = err;
+			written = aio_propogate_error(iocb, written,
err);
 	}
 	return written;
 }
@@ -2396,7 +2396,7 @@ generic_file_direct_IO(int rw, struct ki
 			int err = invalidate_inode_pages2_range(mapping,
 					offset >> PAGE_CACHE_SHIFT,
end);
 			if (err)
-				retval = err;
+				retval = aio_propogate_error(iocb,
retval, err);
 		}
 	}
 	return retval;

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] aio: propogate post-EIOCBQUEUED errors to completion event
  2007-02-19 20:47 ` Benjamin LaHaise
@ 2007-02-19 21:07   ` Zach Brown
  0 siblings, 0 replies; 18+ messages in thread
From: Zach Brown @ 2007-02-19 21:07 UTC (permalink / raw)
  To: Benjamin LaHaise
  Cc: linux-aio, linux-kernel, Suparna bhattacharya, Andrew Morton,
	Leonid Ananiev


>   I
> *think* the patch is right, but picking the changes to the code and  
> watching
> its movement at the same time is making my head spin.

Really?  The only things that changed are the assignment from
iocb->res (after testing pending_err) instead of the 'res' argument and
the dprintk.

But, uh, sure.  If it's a burden I can respin it as two patches.   
Here they come!

- z

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] aio: propogate post-EIOCBQUEUED errors to completion event
  2007-02-19 20:58 ` Ananiev, Leonid I
@ 2007-02-19 21:50   ` Chris Mason
  2007-02-20  0:21     ` Benjamin LaHaise
  0 siblings, 1 reply; 18+ messages in thread
From: Chris Mason @ 2007-02-19 21:50 UTC (permalink / raw)
  To: Ananiev, Leonid I
  Cc: Zach Brown, linux-aio, linux-kernel, Benjamin LaHaise,
	Suparna bhattacharya, Andrew Morton

On Mon, Feb 19, 2007 at 11:58:16PM +0300, Ananiev, Leonid I wrote:
> > while triggering EIO in invalidate_inode_pages2_range() 
> ...
> > With this patch aio-stress sees -EIO.
> 
> Actually if invalidate_inode_pages2_range() returns EIO it means
> that internal kernel synchronization conflict was happen.
> It is reported to user as hardware IO error.
> Iteration in synchronization process could be performed instead.
> 

aio is not responsible for this particular synchronization.  Those fixes
(if we make them) should come from other places.  The patch is important
to get aio error handling right.

I would argue that one common cause of the EIO is userland
error (mmap concurrent with O_DIRECT), and EIO is the correct answer.

-chris

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] aio: propogate post-EIOCBQUEUED errors to completion event
  2007-02-19 21:50   ` Chris Mason
@ 2007-02-20  0:21     ` Benjamin LaHaise
  2007-02-20  0:26       ` Zach Brown
                         ` (2 more replies)
  0 siblings, 3 replies; 18+ messages in thread
From: Benjamin LaHaise @ 2007-02-20  0:21 UTC (permalink / raw)
  To: Chris Mason
  Cc: Ananiev, Leonid I, Zach Brown, linux-aio, linux-kernel,
	Suparna bhattacharya, Andrew Morton

On Mon, Feb 19, 2007 at 04:50:48PM -0500, Chris Mason wrote:
> aio is not responsible for this particular synchronization.  Those fixes
> (if we make them) should come from other places.  The patch is important
> to get aio error handling right.
> 
> I would argue that one common cause of the EIO is userland
> error (mmap concurrent with O_DIRECT), and EIO is the correct answer.

I disagree.  That means that using the pagecache to synchronize things like 
the proposed online defragmentation will occasionally make O_DIRECT users 
fail.  O_DIRECT doesn't prevent the sysadmin from copying files or other 
page cache uses, which implies that generating an error in these cases is 
horrifically broken.  If only root could do it, I wouldn't complain, but 
this would seem to imply that user vs root holes still exist.

		-ben
-- 
"Time is of no importance, Mr. President, only life is important."
Don't Email: <dont@kvack.org>.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] aio: propogate post-EIOCBQUEUED errors to completion event
  2007-02-20  0:21     ` Benjamin LaHaise
@ 2007-02-20  0:26       ` Zach Brown
  2007-02-20  0:28       ` Chris Mason
  2007-02-20 16:01       ` Trond Myklebust
  2 siblings, 0 replies; 18+ messages in thread
From: Zach Brown @ 2007-02-20  0:26 UTC (permalink / raw)
  To: Benjamin LaHaise
  Cc: Chris Mason, Ananiev, Leonid I, linux-aio, linux-kernel,
	Suparna bhattacharya, Andrew Morton

>> I would argue that one common cause of the EIO is userland
>> error (mmap concurrent with O_DIRECT), and EIO is the correct answer.
>
> I disagree.  That means that using the pagecache to synchronize  
> things like
> the proposed online defragmentation will occasionally make O_DIRECT  
> users
> fail.

Well, only if the pages couldn't be invalidated.  Arguably ext3  
should be trying a lot harder to invalidate before giving up and  
returning failure from invalidate_page().

Hopefully this all goes away in the long term with Chris'  
"placeholder" work that stops insertions into the radix tree while  
dio ops are in flight.

- z

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] aio: propogate post-EIOCBQUEUED errors to completion event
  2007-02-20  0:21     ` Benjamin LaHaise
  2007-02-20  0:26       ` Zach Brown
@ 2007-02-20  0:28       ` Chris Mason
  2007-02-20 16:01       ` Trond Myklebust
  2 siblings, 0 replies; 18+ messages in thread
From: Chris Mason @ 2007-02-20  0:28 UTC (permalink / raw)
  To: Benjamin LaHaise
  Cc: Ananiev, Leonid I, Zach Brown, linux-aio, linux-kernel,
	Suparna bhattacharya, Andrew Morton

On Mon, Feb 19, 2007 at 07:21:09PM -0500, Benjamin LaHaise wrote:
> On Mon, Feb 19, 2007 at 04:50:48PM -0500, Chris Mason wrote:
> > aio is not responsible for this particular synchronization.  Those fixes
> > (if we make them) should come from other places.  The patch is important
> > to get aio error handling right.
> > 
> > I would argue that one common cause of the EIO is userland
> > error (mmap concurrent with O_DIRECT), and EIO is the correct answer.
> 
> I disagree.  That means that using the pagecache to synchronize things like 
> the proposed online defragmentation will occasionally make O_DIRECT users 
> fail.  O_DIRECT doesn't prevent the sysadmin from copying files or other 
> page cache uses, which implies that generating an error in these cases is 
> horrifically broken.  If only root could do it, I wouldn't complain, but 
> this would seem to imply that user vs root holes still exist.

I'm working on this part in the placeholder patches.  It's just a separate
problem from the aio code.

-chris


^ permalink raw reply	[flat|nested] 18+ messages in thread

* RE: [PATCH] aio: propogate post-EIOCBQUEUED errors to completion event
  2007-02-19 20:35 [PATCH] aio: propogate post-EIOCBQUEUED errors to completion event Zach Brown
  2007-02-19 20:47 ` Benjamin LaHaise
  2007-02-19 20:58 ` Ananiev, Leonid I
@ 2007-02-20 14:08 ` Ananiev, Leonid I
  2 siblings, 0 replies; 18+ messages in thread
From: Ananiev, Leonid I @ 2007-02-20 14:08 UTC (permalink / raw)
  To: Zach Brown, linux-aio, linux-kernel
  Cc: Benjamin LaHaise, Suparna bhattacharya, Andrew Morton

I have applied the Brown's patch
and run aio-stress O_DIRECT random write 100 times
There is 4% throughput degradation.

47 of 100 aio-stress runs reported an EIO.
1 aio-stress error output example:

adding stage random write
starting with random write
file size 1200MB, record size 16KB, depth 64, ios per iteration 1
max io_submit 1, buffer alignment set to 4KB
threads 1 files 1 contexts 1 context offset 2MB verification on
adding file /mnt/stp/test/testfile thread 0
io err 18446744073709551611 (Input/output error) op 1, size 16384
io err 18446744073709551611 (Input/output error) op 1, size 16384
io err 18446744073709551611 (Input/output error) op 1, size 16384
latency min 0.01 avg 0.16 max 6923.40
         76784 < 100 13 < 250 2 < 500 0 < 1000 0 < 5000 1 < 10000
completion latency min 0.21 avg 153.07 max 1103.15
         34330 < 100 26669 < 250 13653 < 500 2082 < 1000 2 < 5000 0 <
10000
random write on /mnt/stp/test/testfile (6.06 MB/s) 1200.00 MB in 198.03s
3 errors on oper, last 4294967291
thread 0 random write totals (6.06 MB/s) 1200.00 MB in 198.04s
3 errors on oper, last 4294967291

Leonid
-----Original Message-----
From: Zach Brown [mailto:zach.brown@oracle.com] 
Sent: Monday, February 19, 2007 11:35 PM
To: linux-aio@kvack.org; linux-kernel@vger.kernel.org
Cc: Benjamin LaHaise; Suparna bhattacharya; Andrew Morton; Ananiev,
Leonid I
Subject: [PATCH] aio: propogate post-EIOCBQUEUED errors to completion
event

aio: propogate post-EIOCBQUEUED errors to completion event

This addresses an oops reported by Leonid Ananiev
<leonid.i.ananiev@intel.com>
as archived at http://lkml.org/lkml/2007/2/8/337.

O_DIRECT kicks off bios and returns -EIOCBQUEUED to indicate its
intention to
call aio_complete() once the bios complete.   As we return from
submission we
must preserve the -EIOCBQUEUED return code so that fs/aio.c knows to let
the
bio completion call aio_complete().  This stops us from returning errors
after
O_DIRECT submission.

But we have a few places that are very interested in generating errors
after
bio submission.

The most critical of these is invalidating the page cache after a write.
This
avoids exposing stale data to buffered operations that are performed
after the
O_DIRECT write succeeds.  We must do this after submission because the
user
buffer might have been an mmap()ed buffer of the region being written
to.  The
get_user_pages() in the O_DIRECT completion path could have faulted in
stale
data.

So this patch introduces a helper, aio_propogate_error(), which queues
post-submission errors in the iocb so that they are given to the user
completion event when aio_complete() is finally called.

To get this working we change the aio_complete() path so that the ring
insertion is performed as the final iocb reference is dropped.  This
gives the
submission path time to queue its pending error before it drops its
reference.
This increases the space in the iocb as it has to record the two result
codes
from aio_complete() and the pending error from the submission path.

This was tested by running O_DIRECT aio-stress concurrently with
buffered reads
while triggering EIO in invalidate_inode_pages2_range() with the help of
a
debugfs bool hack.  Previously the kernel would oops as fs/aio.c and bio
completion both called aio_complete().  With this patch aio-stress sees
-EIO.

Signed-off-by: Zach Brown <zach.brown@oracle.com>
---

 fs/aio.c            |  109 ++++++++++++++++++++++--------------------
 include/linux/aio.h |   30 +++++++++++
 mm/filemap.c        |    4 -
 3 files changed, 91 insertions(+), 52 deletions(-)

--- a/fs/aio.c	Mon Feb 19 11:52:27 2007 -0800
+++ b/fs/aio.c	Mon Feb 19 12:32:52 2007 -0800
@@ -192,6 +192,46 @@ static int aio_setup_ring(struct kioctx 
 	(void)__event;				\
 	kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK), km);
\
 } while(0)
+
+static void aio_ring_insert_entry(struct kioctx *ctx, struct kiocb
*iocb)
+{
+	struct aio_ring_info	*info;
+	struct aio_ring		*ring;
+	struct io_event		*event;
+	unsigned long		tail;	
+
+	assert_spin_locked(&ctx->ctx_lock);
+	
+	info = &ctx->ring_info;
+	ring = kmap_atomic(info->ring_pages[0], KM_IRQ1);
+
+	tail = info->tail;
+	event = aio_ring_event(info, tail, KM_IRQ0);
+	if (++tail >= info->nr)
+		tail = 0;
+
+	event->obj = (u64)(unsigned long)iocb->ki_obj.user;
+	event->data = iocb->ki_user_data;
+	event->res = iocb->ki_pending_err ? iocb->ki_pending_err :
iocb->ki_res;
+	event->res2 = iocb->ki_res2;
+
+	dprintk("aio_complete: %p[%lu]: %p: %p %Lx %d %lx %lx\n",
+		ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data,
+		iocb->ki_pending_err, iocb->ki_res, iocb->ki_res2);
+
+	/* after flagging the request as done, we
+	 * must never even look at it again
+	 */
+	smp_wmb();	/* make event visible before updating tail */
+
+	info->tail = tail;
+	ring->tail = tail;
+
+	put_aio_ring_event(event, KM_IRQ0);
+	kunmap_atomic(ring, KM_IRQ1);
+
+	pr_debug("added to ring %p at [%lu]\n", iocb, tail);
+}
 
 /* ioctx_alloc
  *	Allocates and initializes an ioctx.  Returns an ERR_PTR if it
failed.
@@ -418,6 +458,7 @@ static struct kiocb fastcall *__aio_get_
 	req->ki_cancel = NULL;
 	req->ki_retry = NULL;
 	req->ki_dtor = NULL;
+	req->ki_pending_err = 0;
 	req->private = NULL;
 	req->ki_iovec = NULL;
 	INIT_LIST_HEAD(&req->ki_run_list);
@@ -507,10 +548,14 @@ static int __aio_put_req(struct kioctx *
 
 	assert_spin_locked(&ctx->ctx_lock);
 
-	req->ki_users --;
+	req->ki_users--;
 	BUG_ON(req->ki_users < 0);
 	if (likely(req->ki_users))
 		return 0;
+
+	if (kiocbIsInserted(req))
+		aio_ring_insert_entry(ctx, req);
+
 	list_del(&req->ki_list);		/* remove from
active_reqs */
 	req->ki_cancel = NULL;
 	req->ki_retry = NULL;
@@ -924,12 +969,8 @@ int fastcall aio_complete(struct kiocb *
 int fastcall aio_complete(struct kiocb *iocb, long res, long res2)
 {
 	struct kioctx	*ctx = iocb->ki_ctx;
-	struct aio_ring_info	*info;
-	struct aio_ring	*ring;
-	struct io_event	*event;
+	int		ret;
 	unsigned long	flags;
-	unsigned long	tail;
-	int		ret;
 
 	/*
 	 * Special case handling for sync iocbs:
@@ -946,56 +987,24 @@ int fastcall aio_complete(struct kiocb *
 		return 1;
 	}
 
-	info = &ctx->ring_info;
-
-	/* add a completion event to the ring buffer.
-	 * must be done holding ctx->ctx_lock to prevent
-	 * other code from messing with the tail
-	 * pointer since we might be called from irq
-	 * context.
+	/*
+	 * We queue up the completion codes into the iocb.  They are
combined
+	 * with a potential error from the submission path and inserted
into
+	 * the ring once the last reference to the iocb is dropped.
Cancelled
+	 * iocbs don't insert events on completion because userland was
given
+	 * an event directly as part of the cancelation interface.
 	 */
 	spin_lock_irqsave(&ctx->ctx_lock, flags);
 
 	if (iocb->ki_run_list.prev && !list_empty(&iocb->ki_run_list))
 		list_del_init(&iocb->ki_run_list);
 
-	/*
-	 * cancelled requests don't get events, userland was given one
-	 * when the event got cancelled.
-	 */
-	if (kiocbIsCancelled(iocb))
-		goto put_rq;
-
-	ring = kmap_atomic(info->ring_pages[0], KM_IRQ1);
-
-	tail = info->tail;
-	event = aio_ring_event(info, tail, KM_IRQ0);
-	if (++tail >= info->nr)
-		tail = 0;
-
-	event->obj = (u64)(unsigned long)iocb->ki_obj.user;
-	event->data = iocb->ki_user_data;
-	event->res = res;
-	event->res2 = res2;
-
-	dprintk("aio_complete: %p[%lu]: %p: %p %Lx %lx %lx\n",
-		ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data,
-		res, res2);
-
-	/* after flagging the request as done, we
-	 * must never even look at it again
-	 */
-	smp_wmb();	/* make event visible before updating tail */
-
-	info->tail = tail;
-	ring->tail = tail;
-
-	put_aio_ring_event(event, KM_IRQ0);
-	kunmap_atomic(ring, KM_IRQ1);
-
-	pr_debug("added to ring %p at [%lu]\n", iocb, tail);
-put_rq:
-	/* everything turned out well, dispose of the aiocb. */
+	if (!kiocbIsCancelled(iocb)) {
+		iocb->ki_res = res;
+		iocb->ki_res2 = res2;
+		kiocbSetInserted(iocb);
+	}
+
 	ret = __aio_put_req(ctx, iocb);
 
 	if (waitqueue_active(&ctx->wait))
diff -r 8c8c075ae1c3 include/linux/aio.h
--- a/include/linux/aio.h	Mon Feb 19 11:52:27 2007 -0800
+++ b/include/linux/aio.h	Mon Feb 19 12:32:36 2007 -0800
@@ -34,6 +34,7 @@ struct kioctx;
 /* #define KIF_LOCKED		0 */
 #define KIF_KICKED		1
 #define KIF_CANCELLED		2
+#define KIF_INSERTED		4
 
 #define kiocbTryLock(iocb)	test_and_set_bit(KIF_LOCKED,
&(iocb)->ki_flags)
 #define kiocbTryKick(iocb)	test_and_set_bit(KIF_KICKED,
&(iocb)->ki_flags)
@@ -41,6 +42,7 @@ struct kioctx;
 #define kiocbSetLocked(iocb)	set_bit(KIF_LOCKED, &(iocb)->ki_flags)
 #define kiocbSetKicked(iocb)	set_bit(KIF_KICKED, &(iocb)->ki_flags)
 #define kiocbSetCancelled(iocb)	set_bit(KIF_CANCELLED,
&(iocb)->ki_flags)
+#define kiocbSetInserted(iocb)	set_bit(KIF_INSERTED, &(iocb)->ki_flags)
 
 #define kiocbClearLocked(iocb)	clear_bit(KIF_LOCKED, &(iocb)->ki_flags)
 #define kiocbClearKicked(iocb)	clear_bit(KIF_KICKED, &(iocb)->ki_flags)
@@ -49,6 +51,7 @@ struct kioctx;
 #define kiocbIsLocked(iocb)	test_bit(KIF_LOCKED, &(iocb)->ki_flags)
 #define kiocbIsKicked(iocb)	test_bit(KIF_KICKED, &(iocb)->ki_flags)
 #define kiocbIsCancelled(iocb)	test_bit(KIF_CANCELLED,
&(iocb)->ki_flags)
+#define kiocbIsInserted(iocb)	test_bit(KIF_INSERTED,
&(iocb)->ki_flags)
 
 /* is there a better place to document function pointer methods? */
 /**
@@ -119,6 +122,10 @@ struct kiocb {
 
 	struct list_head	ki_list;	/* the aio core uses
this
 						 * for cancellation */
+	/* we store and combine return codes from submission and
completion */ 
+	int			ki_pending_err;
+	long			ki_res;
+	long			ki_res2;
 };
 
 #define is_sync_kiocb(iocb)	((iocb)->ki_key == KIOCB_SYNC_KEY)
@@ -246,6 +253,29 @@ static inline struct kiocb *list_kiocb(s
 	return list_entry(h, struct kiocb, ki_list);
 }
 
+/*
+ * This function is used to make sure that an error is communicated to
+ * userspace on iocb completion without stopping -EIOCBQUEUED from
bubbling up
+ * to fs/aio.c from the place where it originated.
+ *
+ * If we have an existing -EIOCBQUEUED it must be returned all the way
to
+ * fs/aio.c so that it doesn't double-complete the iocb along with
whoever
+ * returned -EIOCBQUEUED.  In that case we put the new error in the
iocb.  It
+ * will be returned to userspace *instead of* the first result code
given to
+ * aio_complete().  Use this only for errors which must overwrite
whatever the
+ * return code might have been.  The first non-zero new_err given to
this
+ * function for a given iocb will be returned to userspace.
+ */
+static inline int aio_propogate_error(struct kiocb *iocb, int
existing_err,
+				      int new_err)
+{
+	if (existing_err != -EIOCBQUEUED)
+		return new_err;
+	if (!iocb->ki_pending_err)
+		iocb->ki_pending_err = new_err;
+	return -EIOCBQUEUED;
+}
+
 /* for sysctl: */
 extern unsigned long aio_nr;
 extern unsigned long aio_max_nr;
diff -r 8c8c075ae1c3 mm/filemap.c
--- a/mm/filemap.c	Mon Feb 19 11:52:27 2007 -0800
+++ b/mm/filemap.c	Mon Feb 19 12:32:36 2007 -0800
@@ -2031,7 +2031,7 @@ generic_file_direct_write(struct kiocb *
 	    ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
 		int err = generic_osync_inode(inode, mapping,
OSYNC_METADATA);
 		if (err < 0)
-			written = err;
+			written = aio_propogate_error(iocb, written,
err);
 	}
 	return written;
 }
@@ -2396,7 +2396,7 @@ generic_file_direct_IO(int rw, struct ki
 			int err = invalidate_inode_pages2_range(mapping,
 					offset >> PAGE_CACHE_SHIFT,
end);
 			if (err)
-				retval = err;
+				retval = aio_propogate_error(iocb,
retval, err);
 		}
 	}
 	return retval;

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] aio: propogate post-EIOCBQUEUED errors to completion event
  2007-02-20  0:21     ` Benjamin LaHaise
  2007-02-20  0:26       ` Zach Brown
  2007-02-20  0:28       ` Chris Mason
@ 2007-02-20 16:01       ` Trond Myklebust
  2007-02-20 16:06         ` Benjamin LaHaise
                           ` (2 more replies)
  2 siblings, 3 replies; 18+ messages in thread
From: Trond Myklebust @ 2007-02-20 16:01 UTC (permalink / raw)
  To: Benjamin LaHaise
  Cc: Chris Mason, Ananiev, Leonid I, Zach Brown, linux-aio,
	linux-kernel, Suparna bhattacharya, Andrew Morton

On Mon, 2007-02-19 at 19:21 -0500, Benjamin LaHaise wrote:
> On Mon, Feb 19, 2007 at 04:50:48PM -0500, Chris Mason wrote:
> > aio is not responsible for this particular synchronization.  Those fixes
> > (if we make them) should come from other places.  The patch is important
> > to get aio error handling right.
> > 
> > I would argue that one common cause of the EIO is userland
> > error (mmap concurrent with O_DIRECT), and EIO is the correct answer.
> 
> I disagree.  That means that using the pagecache to synchronize things like 
> the proposed online defragmentation will occasionally make O_DIRECT users 
> fail.  O_DIRECT doesn't prevent the sysadmin from copying files or other 
> page cache uses, which implies that generating an error in these cases is 
> horrifically broken.  If only root could do it, I wouldn't complain, but 
> this would seem to imply that user vs root holes still exist.

We don't try to resolve "conflicting" writes between ordinary mmap() and
write(), so why should we be doing it for mmap and O_DIRECT?

mmap() is designed to violate the ordinary mutex locks for write(), so
if a conflict arises, whether it be with O_DIRECT or ordinary writes
then it is a case of "last writer wins".

Trond


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] aio: propogate post-EIOCBQUEUED errors to completion event
  2007-02-20 16:01       ` Trond Myklebust
@ 2007-02-20 16:06         ` Benjamin LaHaise
  2007-02-20 16:06         ` Arjan van de Ven
  2007-02-20 16:08         ` Chris Mason
  2 siblings, 0 replies; 18+ messages in thread
From: Benjamin LaHaise @ 2007-02-20 16:06 UTC (permalink / raw)
  To: Trond Myklebust
  Cc: Chris Mason, Ananiev, Leonid I, Zach Brown, linux-aio,
	linux-kernel, Suparna bhattacharya, Andrew Morton

On Tue, Feb 20, 2007 at 11:01:50AM -0500, Trond Myklebust wrote:
> We don't try to resolve "conflicting" writes between ordinary mmap() and
> write(), so why should we be doing it for mmap and O_DIRECT?

Yes we do -- both writes will succeed.  More importantly, if one modifies 
the first 512 bytes of a page and the other modifies the last 512 bytes of 
the page, both writes will show up in the completed result.  This is not the 
case with the -EIO bailout.

> mmap() is designed to violate the ordinary mutex locks for write(), so
> if a conflict arises, whether it be with O_DIRECT or ordinary writes
> then it is a case of "last writer wins".

Except that we're not handling cases like the one mentioned above, which 
is quite broken.

		-ben
-- 
"Time is of no importance, Mr. President, only life is important."
Don't Email: <dont@kvack.org>.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] aio: propogate post-EIOCBQUEUED errors to completion event
  2007-02-20 16:01       ` Trond Myklebust
  2007-02-20 16:06         ` Benjamin LaHaise
@ 2007-02-20 16:06         ` Arjan van de Ven
  2007-02-20 16:19           ` Chris Mason
  2007-02-20 16:08         ` Chris Mason
  2 siblings, 1 reply; 18+ messages in thread
From: Arjan van de Ven @ 2007-02-20 16:06 UTC (permalink / raw)
  To: Trond Myklebust
  Cc: Benjamin LaHaise, Chris Mason, Ananiev, Leonid I, Zach Brown,
	linux-aio, linux-kernel, Suparna bhattacharya, Andrew Morton


> We don't try to resolve "conflicting" writes between ordinary mmap() and
> write(), so why should we be doing it for mmap and O_DIRECT?
> 
> mmap() is designed to violate the ordinary mutex locks for write(), so
> if a conflict arises, whether it be with O_DIRECT or ordinary writes
> then it is a case of "last writer wins".

but.. wouldn't an O_DIRECT *read* even cause this?



^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] aio: propogate post-EIOCBQUEUED errors to completion event
  2007-02-20 16:01       ` Trond Myklebust
  2007-02-20 16:06         ` Benjamin LaHaise
  2007-02-20 16:06         ` Arjan van de Ven
@ 2007-02-20 16:08         ` Chris Mason
  2007-02-20 16:29           ` Trond Myklebust
  2 siblings, 1 reply; 18+ messages in thread
From: Chris Mason @ 2007-02-20 16:08 UTC (permalink / raw)
  To: Trond Myklebust
  Cc: Benjamin LaHaise, Ananiev, Leonid I, Zach Brown, linux-aio,
	linux-kernel, Suparna bhattacharya, Andrew Morton

On Tue, Feb 20, 2007 at 11:01:50AM -0500, Trond Myklebust wrote:
> On Mon, 2007-02-19 at 19:21 -0500, Benjamin LaHaise wrote:
> > On Mon, Feb 19, 2007 at 04:50:48PM -0500, Chris Mason wrote:
> > > aio is not responsible for this particular synchronization.  Those fixes
> > > (if we make them) should come from other places.  The patch is important
> > > to get aio error handling right.
> > > 
> > > I would argue that one common cause of the EIO is userland
> > > error (mmap concurrent with O_DIRECT), and EIO is the correct answer.
> > 
> > I disagree.  That means that using the pagecache to synchronize things like 
> > the proposed online defragmentation will occasionally make O_DIRECT users 
> > fail.  O_DIRECT doesn't prevent the sysadmin from copying files or other 
> > page cache uses, which implies that generating an error in these cases is 
> > horrifically broken.  If only root could do it, I wouldn't complain, but 
> > this would seem to imply that user vs root holes still exist.
> 
> We don't try to resolve "conflicting" writes between ordinary mmap() and
> write(), so why should we be doing it for mmap and O_DIRECT?
> 
> mmap() is designed to violate the ordinary mutex locks for write(), so
> if a conflict arises, whether it be with O_DIRECT or ordinary writes
> then it is a case of "last writer wins".

There are some strange O_DIRECT corner cases in here such that the 'last
writer' may actually be a 'last reader' and winning can mean have a copy
of the page in page cache older than the copy on disk.

One option is to have invalidate_inode_pages2_range continue if it can't
toss a page but still return something that O_DIRECT ignores (living
with the race), but it looks like I can make a launder_page op that does
the right thing.  I'll give it a shot.

-chris


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] aio: propogate post-EIOCBQUEUED errors to completion event
  2007-02-20 16:06         ` Arjan van de Ven
@ 2007-02-20 16:19           ` Chris Mason
  0 siblings, 0 replies; 18+ messages in thread
From: Chris Mason @ 2007-02-20 16:19 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Trond Myklebust, Benjamin LaHaise, Ananiev, Leonid I, Zach Brown,
	linux-aio, linux-kernel, Suparna bhattacharya, Andrew Morton

On Tue, Feb 20, 2007 at 05:06:47PM +0100, Arjan van de Ven wrote:
> > We don't try to resolve "conflicting" writes between ordinary mmap() and
> > write(), so why should we be doing it for mmap and O_DIRECT?
> > 
> > mmap() is designed to violate the ordinary mutex locks for write(), so
> > if a conflict arises, whether it be with O_DIRECT or ordinary writes
> > then it is a case of "last writer wins".
> 
> but.. wouldn't an O_DIRECT *read* even cause this?

The O_DIRECT read is fine because it doesn't leave bad data in the page
cache.  The point of doing invalidate_inode_pages2_range is to purge
page cache data that has the old contents of the file before the
O_DIRECT write.

-chris


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] aio: propogate post-EIOCBQUEUED errors to completion event
  2007-02-20 16:08         ` Chris Mason
@ 2007-02-20 16:29           ` Trond Myklebust
  2007-02-20 16:38             ` Trond Myklebust
  2007-02-20 18:40             ` Zach Brown
  0 siblings, 2 replies; 18+ messages in thread
From: Trond Myklebust @ 2007-02-20 16:29 UTC (permalink / raw)
  To: Chris Mason
  Cc: Benjamin LaHaise, Ananiev, Leonid I, Zach Brown, linux-aio,
	linux-kernel, Suparna bhattacharya, Andrew Morton

On Tue, 2007-02-20 at 11:08 -0500, Chris Mason wrote:
> On Tue, Feb 20, 2007 at 11:01:50AM -0500, Trond Myklebust wrote:
> > On Mon, 2007-02-19 at 19:21 -0500, Benjamin LaHaise wrote:
> > > On Mon, Feb 19, 2007 at 04:50:48PM -0500, Chris Mason wrote:
> > > > aio is not responsible for this particular synchronization.  Those fixes
> > > > (if we make them) should come from other places.  The patch is important
> > > > to get aio error handling right.
> > > > 
> > > > I would argue that one common cause of the EIO is userland
> > > > error (mmap concurrent with O_DIRECT), and EIO is the correct answer.
> > > 
> > > I disagree.  That means that using the pagecache to synchronize things like 
> > > the proposed online defragmentation will occasionally make O_DIRECT users 
> > > fail.  O_DIRECT doesn't prevent the sysadmin from copying files or other 
> > > page cache uses, which implies that generating an error in these cases is 
> > > horrifically broken.  If only root could do it, I wouldn't complain, but 
> > > this would seem to imply that user vs root holes still exist.
> > 
> > We don't try to resolve "conflicting" writes between ordinary mmap() and
> > write(), so why should we be doing it for mmap and O_DIRECT?
> > 
> > mmap() is designed to violate the ordinary mutex locks for write(), so
> > if a conflict arises, whether it be with O_DIRECT or ordinary writes
> > then it is a case of "last writer wins".
> 
> There are some strange O_DIRECT corner cases in here such that the 'last
> writer' may actually be a 'last reader' and winning can mean have a copy
> of the page in page cache older than the copy on disk.

As long as it is marked dirty so that it eventually gets synced to disk,
it shouldn't matter.

> One option is to have invalidate_inode_pages2_range continue if it can't
> toss a page but still return something that O_DIRECT ignores (living
> with the race), but it looks like I can make a launder_page op that does
> the right thing.  I'll give it a shot.

I already sent in a patch to do that last week.

Cheers
  Trond


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] aio: propogate post-EIOCBQUEUED errors to completion event
  2007-02-20 16:29           ` Trond Myklebust
@ 2007-02-20 16:38             ` Trond Myklebust
  2007-02-20 18:40             ` Zach Brown
  1 sibling, 0 replies; 18+ messages in thread
From: Trond Myklebust @ 2007-02-20 16:38 UTC (permalink / raw)
  To: Chris Mason
  Cc: Benjamin LaHaise, Ananiev, Leonid I, Zach Brown, linux-aio,
	linux-kernel, Suparna bhattacharya, Andrew Morton

[-- Attachment #1: Type: text/plain, Size: 623 bytes --]

On Tue, 2007-02-20 at 11:30 -0500, Trond Myklebust wrote:
> > One option is to have invalidate_inode_pages2_range continue if it can't
> > toss a page but still return something that O_DIRECT ignores (living
> > with the race), but it looks like I can make a launder_page op that does
> > the right thing.  I'll give it a shot.
> 
> I already sent in a patch to do that last week.

To be more precise, here are the 2 patches that I sent to lkml last
week. One ensures that we don't stop the invalidation just because of
the existence of an unsynced dirty page. The other gets rid of the EIO
for that case.

Cheers
  Trond


[-- Attachment #2: linux-2.6.20-001-fix_invalidate_inode_pages2_range_return.dif --]
[-- Type: message/rfc822, Size: 1156 bytes --]

From: Trond Myklebust <Trond.Myklebust@netapp.com>
Subject: No Subject
Date: Sat, 13 Jan 2007 02:28:13 -0500
Message-ID: <1171989417.6271.29.camel@heimdal.trondhjem.org>

Fix invalidate_inode_pages2_range() so that it does not immediately exit
just because a single page in the specified range could not be removed.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---

 mm/truncate.c |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/truncate.c b/mm/truncate.c
index ebf3fcb..0f4b6d1 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -375,10 +375,10 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
 
 	pagevec_init(&pvec, 0);
 	next = start;
-	while (next <= end && !ret && !wrapped &&
+	while (next <= end && !wrapped &&
 		pagevec_lookup(&pvec, mapping, next,
 			min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
-		for (i = 0; !ret && i < pagevec_count(&pvec); i++) {
+		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 			pgoff_t page_index;
 

[-- Attachment #3: linux-2.6.20-002-invalidate_inode_pages2_dont_fail_on_dirty.dif --]
[-- Type: message/rfc822, Size: 1856 bytes --]

From: Trond Myklebust <Trond.Myklebust@netapp.com>
Subject: No Subject
Date: Sat, 13 Jan 2007 02:28:13 -0500
Message-ID: <1171989417.6271.30.camel@heimdal.trondhjem.org>

invalidate_inode_pages2() should not try to fix races between direct_IO and
mmap(). It should only be trying to clear out pages that were dirty before
the direct_IO write (see generic_file_direct_IO()).
Skipping dirty pages should therefore not result in an error.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---

 mm/truncate.c |    8 +++++---
 1 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/mm/truncate.c b/mm/truncate.c
index 0f4b6d1..c3ff820 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -318,6 +318,8 @@ EXPORT_SYMBOL(invalidate_mapping_pages);
  * invalidation guarantees, and cannot afford to leave pages behind because
  * shrink_list() has a temp ref on them, or because they're transiently sitting
  * in the lru_cache_add() pagevecs.
+ * Note: this function just skips pages that are dirty without flagging
+ * an error.
  */
 static int
 invalidate_complete_page2(struct address_space *mapping, struct page *page)
@@ -330,7 +332,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
 
 	write_lock_irq(&mapping->tree_lock);
 	if (PageDirty(page))
-		goto failed;
+		goto dirty;
 
 	BUG_ON(PagePrivate(page));
 	__remove_from_page_cache(page);
@@ -338,9 +340,9 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
 	ClearPageUptodate(page);
 	page_cache_release(page);	/* pagecache ref */
 	return 1;
-failed:
+dirty:
 	write_unlock_irq(&mapping->tree_lock);
-	return 0;
+	return 1;
 }
 
 static int do_launder_page(struct address_space *mapping, struct page *page)

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [PATCH] aio: propogate post-EIOCBQUEUED errors to completion event
  2007-02-20 16:29           ` Trond Myklebust
  2007-02-20 16:38             ` Trond Myklebust
@ 2007-02-20 18:40             ` Zach Brown
  2007-02-21  0:05               ` Trond Myklebust
  1 sibling, 1 reply; 18+ messages in thread
From: Zach Brown @ 2007-02-20 18:40 UTC (permalink / raw)
  To: Trond Myklebust
  Cc: Chris Mason, Benjamin LaHaise, Ananiev, Leonid I, linux-aio,
	linux-kernel, Suparna bhattacharya, Andrew Morton

>> There are some strange O_DIRECT corner cases in here such that the  
>> 'last
>> writer' may actually be a 'last reader' and winning can mean have  
>> a copy
>> of the page in page cache older than the copy on disk.
>
> As long as it is marked dirty so that it eventually gets synced to  
> disk,
> it shouldn't matter.

No, Chris is pointing out that an O_DIRECT write can leave clean  
read pages in the page cache.

All it takes is giving a source buffer for the write which is an mmap 
()ed aperture of the region that is being written to.  If you get the  
offsets right you can make the get_user_pages() down in fs/direct-io.c  
populate the page cache before the actual O_DIRECT write gets to  
it.

- z

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] aio: propogate post-EIOCBQUEUED errors to completion event
  2007-02-20 18:40             ` Zach Brown
@ 2007-02-21  0:05               ` Trond Myklebust
  0 siblings, 0 replies; 18+ messages in thread
From: Trond Myklebust @ 2007-02-21  0:05 UTC (permalink / raw)
  To: Zach Brown
  Cc: Chris Mason, Benjamin LaHaise, Ananiev, Leonid I, linux-aio,
	linux-kernel, Suparna bhattacharya, Andrew Morton

On Tue, 2007-02-20 at 10:40 -0800, Zach Brown wrote:
> >> There are some strange O_DIRECT corner cases in here such that the  
> >> 'last
> >> writer' may actually be a 'last reader' and winning can mean have  
> >> a copy
> >> of the page in page cache older than the copy on disk.
> >
> > As long as it is marked dirty so that it eventually gets synced to  
> > disk,
> > it shouldn't matter.
> 
> No, Chris is pointing out that an an O_DIRECT write can leave clean  
> read pages in the page cache.
>
> All it takes is giving a source buffer for the write which is an mmap 
> ()ed apeture of the region that is being written to.  If you get the  
> offsets right you can get the get_user_pages() down in fs/direct-io.c  
> will populate the page cache before the actual O_DIRECT write gets to  
> it.


With invalidate_inode_pages2()? That is supposed to wait until the page
lock is taken -> read is done. It then calls unmap_mapping_range() which
will remove the offending page from any existing mappings. Sure an
application could get stale data, but if it is reading while an O_DIRECT
write is proceeding, then it gets what it deserves.

Trond


^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2007-02-21  0:05 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-02-19 20:35 [PATCH] aio: propogate post-EIOCBQUEUED errors to completion event Zach Brown
2007-02-19 20:47 ` Benjamin LaHaise
2007-02-19 21:07   ` Zach Brown
2007-02-19 20:58 ` Ananiev, Leonid I
2007-02-19 21:50   ` Chris Mason
2007-02-20  0:21     ` Benjamin LaHaise
2007-02-20  0:26       ` Zach Brown
2007-02-20  0:28       ` Chris Mason
2007-02-20 16:01       ` Trond Myklebust
2007-02-20 16:06         ` Benjamin LaHaise
2007-02-20 16:06         ` Arjan van de Ven
2007-02-20 16:19           ` Chris Mason
2007-02-20 16:08         ` Chris Mason
2007-02-20 16:29           ` Trond Myklebust
2007-02-20 16:38             ` Trond Myklebust
2007-02-20 18:40             ` Zach Brown
2007-02-21  0:05               ` Trond Myklebust
2007-02-20 14:08 ` Ananiev, Leonid I

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.