* [PATCH] dio: Fast-path for page-aligned IOs
@ 2011-06-20 23:17 Dan Ehrenberg
  2011-06-21 20:41 ` Andi Kleen
  2011-07-27 21:08 ` Christoph Hellwig
  0 siblings, 2 replies; 15+ messages in thread
From: Dan Ehrenberg @ 2011-06-20 23:17 UTC (permalink / raw)
  To: Alexander Viro; +Cc: linux-fsdevel, linux-kernel, Andrew Morton, Dan Ehrenberg

This code introduces a fast-path variant of __blockdev_direct_IO
for the special case where the request size is a multiple of the page
size, the inode block size is a page, the user memory is page-aligned,
the underlying storage is contiguous on disk and the file location is
already initialized. The special case decreases the amount of
bookkeeping required, which saves a significant amount of CPU time on
a fast device such as a ramdisk or an SSD.  The patch is inspired by
earlier code by Ken Chen.

In my testing, this patch saves around a quarter of system time
compared to the old version when executed on a workload of 4k
random read operations on a high performance SSD. It does this
by nearly eliminating the CPU usage from fs/direct-io.c.  It only
works for this particular special case which we find to be common
in our workloads.

With hard drives, this optimization is less relevant because
(a) the long latency of a seek will mask most of this CPU time, and
(b) applications issue fewer, larger requests to minimize seeks,
thereby minimizing the CPU overhead from this code path. But an SSD
services requests as small as a page quickly. It is common
to use direct I/O here because the page cache overhead is easily
visible. And it is common to use operations which are multiples of
page size rather than smaller ones because most SSDs use 4k or bigger
blocks at the lowest level, even if the firmware may expose smaller
blocks.

The fast path does not apply for operations of the wrong size
or alignment, or for operations on raw drives with 512-byte sectors.
It might be possible to make this special case a little more general
while maintaining its performance benefits, but I do not believe that
the full performance benefits can be achieved without resorting to
special handling of simple cases, as is done in this patch.
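
For illustration, a request that satisfies all of these preconditions
looks roughly like this from user space (a sketch; error handling is
omitted and the 4096 constants assume a 4k page size):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdlib.h>
	#include <unistd.h>

	int main(void)
	{
		void *buf;
		int fd = open("datafile", O_RDONLY | O_DIRECT);

		/* page-aligned user memory, page-multiple length */
		posix_memalign(&buf, 4096, 4096);
		/* page-aligned file offset: eligible for the fast path */
		pread(fd, buf, 4096, 0);
		/* a 512-byte or misaligned request would take the slow path */
		free(buf);
		close(fd);
		return 0;
	}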

Signed-off-by: Dan Ehrenberg <dehrenberg@google.com>
---
Changelog since v1:
  The original patch I sent was accidentally written for
  2.6.34. This version forward-ports the patch to what's
  currently upstream.
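
For context, filesystems reach this library function through the
blockdev_direct_IO() wrapper (which passes DIO_LOCKING |
DIO_SKIP_HOLES); a from-memory sketch of an ext2-style caller:

	static ssize_t
	ext2_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
			loff_t offset, unsigned long nr_segs)
	{
		struct inode *inode = iocb->ki_filp->f_mapping->host;

		return blockdev_direct_IO(rw, iocb, inode,
					  inode->i_sb->s_bdev, iov, offset,
					  nr_segs, ext2_get_block, NULL);
	}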

 fs/direct-io.c |  294 +++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 files changed, 269 insertions(+), 25 deletions(-)

diff --git a/fs/direct-io.c b/fs/direct-io.c
index ac5f164..aa2c369 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -38,6 +38,10 @@
 #include <asm/atomic.h>
 
 /*
+ * General case implementation of __blockdev_direct_IO
+ */
+
+/*
  * How many user pages to map in one call to get_user_pages().  This determines
  * the size of a structure on the stack.
  */
@@ -428,22 +432,22 @@ static struct bio *dio_await_one(struct dio *dio)
 /*
  * Process one completed BIO.  No locks are held.
  */
-static int dio_bio_complete(struct dio *dio, struct bio *bio)
+static int __dio_bio_complete(struct bio *bio, int is_async, int rw, int *error)
 {
 	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	struct bio_vec *bvec = bio->bi_io_vec;
 	int page_no;
 
 	if (!uptodate)
-		dio->io_error = -EIO;
+		*error = -EIO;
 
-	if (dio->is_async && dio->rw == READ) {
+	if (is_async && rw == READ) {
 		bio_check_pages_dirty(bio);	/* transfers ownership */
 	} else {
 		for (page_no = 0; page_no < bio->bi_vcnt; page_no++) {
 			struct page *page = bvec[page_no].bv_page;
 
-			if (dio->rw == READ && !PageCompound(page))
+			if (rw == READ && !PageCompound(page))
 				set_page_dirty_lock(page);
 			page_cache_release(page);
 		}
@@ -452,6 +456,11 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio)
 	return uptodate ? 0 : -EIO;
 }
 
+static int dio_bio_complete(struct dio *dio, struct bio *bio)
+{
+	return __dio_bio_complete(bio, dio->is_async, dio->rw, &dio->io_error);
+}
+
 /*
  * Wait on and process all in-flight BIOs.  This must only be called once
  * all bios have been issued so that the refcount can only decrease.
@@ -1137,27 +1146,8 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 	return ret;
 }
 
-/*
- * This is a library function for use by filesystem drivers.
- *
- * The locking rules are governed by the flags parameter:
- *  - if the flags value contains DIO_LOCKING we use a fancy locking
- *    scheme for dumb filesystems.
- *    For writes this function is called under i_mutex and returns with
- *    i_mutex held, for reads, i_mutex is not held on entry, but it is
- *    taken and dropped again before returning.
- *    For reads and writes i_alloc_sem is taken in shared mode and released
- *    on I/O completion (which may happen asynchronously after returning to
- *    the caller).
- *
- *  - if the flags value does NOT contain DIO_LOCKING we don't use any
- *    internal locking but rather rely on the filesystem to synchronize
- *    direct I/O reads/writes versus each other and truncate.
- *    For reads and writes both i_mutex and i_alloc_sem are not held on
- *    entry and are never taken.
- */
-ssize_t
-__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
+static ssize_t
+dio_generic(int rw, struct kiocb *iocb, struct inode *inode,
 	struct block_device *bdev, const struct iovec *iov, loff_t offset, 
 	unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
 	dio_submit_t submit_io,	int flags)
@@ -1253,4 +1243,258 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 out:
 	return retval;
 }
+
+/*
+ * Special case for __blockdev_direct_IO:
+ * page-aligned page-sized IOs when the inode block size is page-sized.
+ *
+ * The following code does the exact same thing as the above
+ * general-case code, but the smaller amount of bookkeeping
+ * has a big impact on CPU usage, which is visible on fast
+ * devices like SSDs.
+ */
+
+/*
+ * struct mini_dio has a subset of the fields in struct dio.
+ * For comments on fields, see that struct definition.
+ */
+struct mini_dio {
+	/* The following struct items are needed in the completion. */
+	dio_iodone_t *end_io;
+	struct kiocb *iocb;
+	loff_t offset;
+	int rw;
+	int flags;
+	struct inode *inode;
+	size_t size;
+
+	/*
+	 * The following struct items are allocated just because
+	 * they're big, and they are only used during submission.
+	 */
+	struct buffer_head map_bh;
+	struct page *pages[DIO_PAGES];
+};
+
+/*
+ * In this function we treat everything as if it's AIO to simplify the logic.
+ * The AIO code paths also handle the synchronous case appropriately.
+ */
+static void dio_fast_end_aio(struct bio *bio, int error)
+{
+	struct mini_dio *mdio = bio->bi_private;
+	struct kiocb *iocb = mdio->iocb;
+	int res;
+
+	__dio_bio_complete(bio, 1, mdio->rw, &error);
+
+	/* Could it be some other value? */
+	res = (error < 0) ? error : mdio->size;
+
+	if (mdio->end_io)
+		mdio->end_io(iocb, mdio->offset, res,
+			     mdio->map_bh.b_private, 0, 1);
+	else
+		aio_complete(iocb, res, 0);
+
+	if (mdio->flags & DIO_LOCKING)
+		up_read_non_owner(&mdio->inode->i_alloc_sem);
+
+	kfree(mdio);
+}
+
+static ssize_t dio_fast(int rw, struct kiocb *iocb, struct inode *inode,
+	struct block_device *bdev, const struct iovec *iov, loff_t offset,
+	get_block_t get_block, dio_iodone_t end_io, dio_submit_t submit_io,
+	int flags)
+{
+	struct bio *bio;
+	struct mini_dio *mdio;
+	int res, i;
+	size_t size = iov[0].iov_len;
+	int num_pages = size >> PAGE_SHIFT;
+
+	mdio = kmalloc(sizeof(*mdio), GFP_KERNEL);
+	if (mdio == NULL)
+		goto fallback;
+	mdio->end_io = end_io;
+	mdio->iocb = iocb;
+	mdio->offset = offset;
+	mdio->rw = rw;
+	mdio->flags = flags;
+	mdio->inode = inode;
+	mdio->size = size;
+	/* get_block expects a partially initialized bh */
+	mdio->map_bh.b_size = size;
+	mdio->map_bh.b_state = 0;
+
+	res = get_user_pages_fast(
+		(unsigned long)iov[0].iov_base,	/* Where from */
+		num_pages,			/* How many pages */
+		rw == READ,			/* Write to memory? */
+		mdio->pages);			/* Put results here */
+	if (res < 0)
+		goto fallback_free;
+	if (res != num_pages) {
+		num_pages = res;	/* Used for how many to free */
+		goto fallback_pages;
+	}
+
+	if (flags & DIO_LOCKING) {
+		if (rw == READ) {
+			struct address_space *mapping =
+					iocb->ki_filp->f_mapping;
+
+			mutex_lock(&inode->i_mutex);
+
+			res = filemap_write_and_wait_range(
+				mapping, offset,
+				offset + size - 1);
+			if (res != 0)
+				goto fallback_unlock;
+		}
+
+		/*
+		 * The i_alloc_sem will be released at I/O completion,
+		 * possibly in a different thread.
+		 */
+		down_read_non_owner(&inode->i_alloc_sem);
+	}
+
+
+	if ((*get_block)(inode, offset >> PAGE_SHIFT, &mdio->map_bh, 0) != 0)
+		goto fallback_sem_up;
+	if (!buffer_mapped(&mdio->map_bh) || mdio->map_bh.b_size != size)
+		/*
+		 * Bail out to the general case.
+		 *
+		 * If the first condition is false, we've either
+		 * encountered a file hole, or we are appending to
+		 * the end of the file.
+		 *
+		 * The second condition will only be false if the
+		 * file system returns a mapping shorter than the
+		 * total size. Here, we only handle physically
+		 * contiguous mappings, which can be implemented
+		 * with a single bio.
+		 */
+		goto fallback_sem_up;
+	bio = bio_alloc(GFP_KERNEL, num_pages);
+	bio->bi_bdev = mdio->map_bh.b_bdev;
+	bio->bi_end_io = dio_fast_end_aio;
+	bio->bi_private = mdio;
+	/*
+	 * On this fast path, inode->i_blkbits is guaranteed
+	 * to equal PAGE_SHIFT.
+	 */
+	bio->bi_sector = mdio->map_bh.b_blocknr << (PAGE_SHIFT - 9);
+
+	for (i = 0; i < num_pages; i++) {
+		if (bio_add_page(bio, mdio->pages[i], PAGE_SIZE, 0)
+				!= PAGE_SIZE) {
+			bio_put(bio);
+			goto fallback_sem_up;
+		}
+	}
+	/* bio is ready, submit it */
+	if (rw == READ)
+		bio_set_pages_dirty(bio);
+	if (submit_io == NULL)
+		submit_bio(rw, bio);
+	else
+		submit_io(rw, bio, inode, offset);
+	if ((flags & DIO_LOCKING) && rw == READ)
+		mutex_unlock(&inode->i_mutex);
+	return -EIOCBQUEUED;
+
+fallback_sem_up:
+	if (flags & DIO_LOCKING)
+		up_read_non_owner(&inode->i_alloc_sem);
+fallback_unlock:
+	if ((flags & DIO_LOCKING) && rw == READ)
+		mutex_unlock(&inode->i_mutex);
+fallback_pages:
+	for (i = 0; i < num_pages; i++)
+		page_cache_release(mdio->pages[i]);
+fallback_free:
+	kfree(mdio);
+
+fallback:	/* Bail out to the general case */
+	return dio_generic(rw, iocb, inode,
+			bdev, iov, offset, 1,
+			get_block, end_io, submit_io, flags);
+}
+
+/*
+ * __blockdev_direct_IO - issue a direct I/O request
+ *
+ * This is a library function for use by filesystem drivers in their
+ * direct I/O routine. It translates an I/O request into bios and
+ * sends those down to the block layer.
+ *
+ * @rw: either READ or WRITE
+ * @iocb: iocb of the request
+ * @inode: inode of the request
+ * @bdev: backing block device
+ * @iov: vector of ranges of user-space memory
+ * @offset: logical byte offset for the start of the IO
+ * @nr_segs: number of elements in iov
+ * @get_block: a callback to map blocks into a buffer_head
+ *    signature: int (*get_block)(struct inode *inode,
+ *		sector_t iblock, struct buffer_head *bh, int create)
+ *    On entry, bh will have its b_size field initialized with the
+ *    length of the request
+ *    On exit, get_block should have initialized bh. get_block returns
+ *    0 on success, or an error code if the block didn't exist. On success,
+ *    the bh will be mapped if possible, or marked as unmapped otherwise.
+ *    The boolean flag create indicates whether an allocation (or
+ *    initialization) should be attempted if no block is already present for
+ *    the logical block offset iblock.
+ * @end_io: a callback on IO completion, or NULL (defaulting to aio_complete)
+ *    signature: void (*end_io)(struct kiocb *iocb, loff_t offset,
+ *			    ssize_t size, void *private, int ret,
+ *			    bool is_async)
+ *    The private argument is extracted from bh.b_private, allowing get_block to
+ *    pass information to the completion.
+ * @submit_io: a callback used to submit struct bios
+ * @flags: two flags allowed, DIO_LOCKING and DIO_SKIP_HOLES.
+ *    DIO_SKIP_HOLES indicates that this DIO request cannot fill
+ *    in a hole in the middle of a file.
+ *
+ * The locking rules are governed by the flags parameter:
+ *  - if the flags value contains DIO_LOCKING we use a fancy locking
+ *    scheme for dumb filesystems.
+ *    For writes this function is called under i_mutex and returns with
+ *    i_mutex held, for reads, i_mutex is not held on entry, but it is
+ *    taken and dropped again before returning.
+ *    For reads and writes i_alloc_sem is taken in shared mode and released
+ *    on I/O completion (which may happen asynchronously after returning to
+ *    the caller).
+ *
+ *  - if the flags value does NOT contain DIO_LOCKING we don't use any
+ *    internal locking but rather rely on the filesystem to synchronize
+ *    direct I/O reads/writes versus each other and truncate.
+ *    For reads and writes both i_mutex and i_alloc_sem are not held on
+ *    entry and are never taken.
+ */
+ssize_t
+__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
+	struct block_device *bdev, const struct iovec *iov, loff_t offset,
+	unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
+	dio_submit_t submit_io,	int flags)
+{
+	BUG_ON(rw != READ && rw != WRITE);
+	if (inode->i_blkbits == PAGE_SHIFT
+	    && nr_segs == 1
+	    && IS_ALIGNED(offset, PAGE_SIZE)
+	    && IS_ALIGNED((unsigned long)iov[0].iov_base, PAGE_SIZE)
+	    && IS_ALIGNED(iov[0].iov_len, PAGE_SIZE)
+	    && iov[0].iov_len >> PAGE_SHIFT <= DIO_PAGES)
+		return dio_fast(rw, iocb, inode,
+				bdev, iov, offset,
+				get_block, end_io, submit_io, flags);
+
+	return dio_generic(rw, iocb, inode,
+			bdev, iov, offset,
+			nr_segs, get_block, end_io, submit_io, flags);
+}
 EXPORT_SYMBOL(__blockdev_direct_IO);
-- 
1.7.3.1


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [PATCH] dio: Fast-path for page-aligned IOs
  2011-06-20 23:17 [PATCH] dio: Fast-path for page-aligned IOs Dan Ehrenberg
@ 2011-06-21 20:41 ` Andi Kleen
  2011-06-21 21:21     ` Daniel Ehrenberg
  2011-07-27 21:08 ` Christoph Hellwig
  1 sibling, 1 reply; 15+ messages in thread
From: Andi Kleen @ 2011-06-21 20:41 UTC (permalink / raw)
  To: Dan Ehrenberg; +Cc: Alexander Viro, linux-fsdevel, linux-kernel, Andrew Morton

Dan Ehrenberg <dehrenberg@google.com> writes:

> This code introduces a fast-path variant of __blockdev_direct_IO
> for the special case where the request size is a multiple of the page
> size, the inode block size is a page, the user memory is page-aligned,
> the underlying storage is contiguous on disk and the file location is
> already initialized. The special case decreases the amount of
> bookkeeping required, which saves a significant amount of CPU time on
> a fast device such as a ramdisk or an SSD.  The patch is inspired by
> earlier code by Ken Chen.

Is it understood why your fast path is that much faster? 
i.e. what's the slow part in the normal path that it avoids?

I am wondering if some of the improvements could be gotten even for less
rigid pre conditions.

> +		/*
> +		 * The i_alloc_sem will be released at I/O completion,
> +		 * possibly in a different thread.
> +		 */
> +		down_read_non_owner(&inode->i_alloc_sem);

There's just a patch kit posted from hch which removes that semaphore.

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] dio: Fast-path for page-aligned IOs
  2011-06-21 20:41 ` Andi Kleen
@ 2011-06-21 21:21     ` Daniel Ehrenberg
  0 siblings, 0 replies; 15+ messages in thread
From: Daniel Ehrenberg @ 2011-06-21 21:21 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Alexander Viro, linux-fsdevel, linux-kernel, Andrew Morton

On Tue, Jun 21, 2011 at 1:41 PM, Andi Kleen <andi@firstfloor.org> wrote:
> Dan Ehrenberg <dehrenberg@google.com> writes:
>
>> This code introduces a fast-path variant of __blockdev_direct_IO
>> for the special case where the request size is a multiple of the page
>> size, the inode block size is a page, the user memory is page-aligned,
>> the underlying storage is contiguous on disk and the file location is
>> already initialized. The special case decreases the amount of
>> bookkeeping required, which saves a significant amount of CPU time on
>> a fast device such as a ramdisk or an SSD.  The patch is inspired by
>> earlier code by Ken Chen.
>
> Is it understood why your fast path is that much faster?
> i.e. what's the slow part in the normal path that it avoids?
>
> I am wondering if some of the improvements could be gotten even for less
> rigid pre conditions.

I should start by saying that I really should've submitted this with
an [RFC] tag. I'm eager for feedback on my first Linux kernel patch,
and I'm really glad you responded.

The slowness in the dio code that I have observed is not in any
particular place, but rather death by a thousand cuts. Lines like
        memset(dio, 0, offsetof(struct dio, pages));
show up as significant in the CPU profile, but so do other random
lines that manipulate the struct dio.
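
One way to gather this kind of per-line attribution (a sketch, not
necessarily the setup used here; the fio job approximates the 4k
random-read workload described above and /dev/sdX is a placeholder):

	# sample the whole system while the load runs
	perf record -a -g sleep 30 &
	fio --name=randread --filename=/dev/sdX --rw=randread \
	    --bs=4k --direct=1 --runtime=30
	perf report	# per-function costs
	perf annotate	# per-line costs within hot functions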

In an earlier version of the patch, I restricted the change to only
page-sized operations. This was criticized for being insufficiently
general. In generalizing to page-multiple operations, I noticed a
minor regression, which seems to be from the IS_ALIGNED calls.
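
(For reference, IS_ALIGNED is only a mask test; include/linux/kernel.h
defines it as

	#define IS_ALIGNED(x, a)	(((x) & ((typeof(x))(a) - 1)) == 0)

so any cost is presumably in the extra branches rather than the
arithmetic itself.)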

You're right that these preconditions are rather rigid, though. If you
have a suggestion for a more general precondition, I can try it out
and see if it maintains the performance properties I want.
>
>> +             /*
>> +              * The i_alloc_sem will be released at I/O completion,
>> +              * possibly in a different thread.
>> +              */
>> +             down_read_non_owner(&inode->i_alloc_sem);
>
> There's just a patch kit posted from hch which removes that semaphore.
>
> -Andi

Once this patch is finalized and merged, I can make a new version of
the patch based on the new synchronization mechanism.

Dan

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] dio: Fast-path for page-aligned IOs
  2011-06-21 21:21     ` Daniel Ehrenberg
@ 2011-06-21 21:44       ` Andi Kleen
  0 siblings, 0 replies; 15+ messages in thread
From: Andi Kleen @ 2011-06-21 21:44 UTC (permalink / raw)
  To: Daniel Ehrenberg
  Cc: Andi Kleen, Alexander Viro, linux-fsdevel, linux-kernel, Andrew Morton

On Tue, Jun 21, 2011 at 02:21:48PM -0700, Daniel Ehrenberg wrote:
> On Tue, Jun 21, 2011 at 1:41 PM, Andi Kleen <andi@firstfloor.org> wrote:
> > Dan Ehrenberg <dehrenberg@google.com> writes:
> >
> >> This code introduces a fast-path variant of __blockdev_direct_IO
> >> for the special case where the request size is a multiple of the page
> >> size, the inode block size is a page, the user memory is page-aligned,
> >> the underlying storage is contiguous on disk and the file location is
> >> already initialized. The special case decreases the amount of
> >> bookkeeping required, which saves a significant amount of CPU time on
> >> a fast device such as a ramdisk or an SSD.  The patch is inspired by
> >> earlier code by Ken Chen.
> >
> > Is it understood why your fast path is that much faster?
> > i.e. what's the slow part in the normal path that it avoids?
> >
> > I am wondering if some of the improvements could be gotten even for less
> > rigid pre conditions.
> 
> I should start by saying that I really should've submitted this with
> an [RFC] tag. I'm eager for feedback on my first Linux kernel patch,
> and I'm really glad you responded.
> 
> The slowness in the dio code that I have observed is not in any
> particular place, but rather a death of a thousand cuts. Lines like
>         memset(dio, 0, offsetof(struct dio, pages));

Hmm, is it cache miss stalls or just core cycles? 

If the latter, I assume gcc generated a slow out-of-line call
for memset.  I guess that would be fixable.

If the former, maybe we need a strategic prefetch?

Possibly a slab constructor would also help and avoid some of the 
reinitialization costs (this would require a fixed size
limit for the fast path, but I guess that's reasonable).
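
A minimal sketch of that idea (names hypothetical; it also assumes
the free path returns objects with the cleared fields restored, since
the constructor only runs when pages are added to the cache, not on
every kmem_cache_alloc()):

	static struct kmem_cache *dio_cachep;

	static void dio_ctor(void *obj)
	{
		memset(obj, 0, offsetof(struct dio, pages));
	}

	static int __init dio_cache_init(void)
	{
		dio_cachep = kmem_cache_create("dio", sizeof(struct dio),
					       0, SLAB_HWCACHE_ALIGN,
					       dio_ctor);
		return dio_cachep ? 0 : -ENOMEM;
	}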

> show up as significant in the CPU profile, but so do other random
> lines that manipulate the struct dio.

That would suggest cache misses?

So why does your version avoid those?

> You're right that these preconditions are rather rigid, though. If you
> have a suggestion for a more general precondition, I can try it out
> and see if it maintains the performance properties I want.

Not fully sure, but I would be interested in support for 512 byte sectors
at least.

-Andi

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] dio: Fast-path for page-aligned IOs
  2011-06-21 21:44       ` Andi Kleen
  (?)
@ 2011-06-29 21:03       ` Theodore Ts'o
  -1 siblings, 0 replies; 15+ messages in thread
From: Theodore Ts'o @ 2011-06-29 21:03 UTC (permalink / raw)
  To: linux-fsdevel

Andi Kleen <andi@firstfloor.org> writes:

> > You're right that these preconditions are rather rigid, though. If you
> > have a suggestion for a more general precondition, I can try it out
> > and see if it maintains the performance properties I want.
> 
> Not fully sure, but I would be interested in support for 512 byte sectors
> at least.

Looking at the code while I've been reviewing it, it seems to me that a goodly 
amount of the speedup is due to the fact that we can add pages to the bio one 
full page at a time, using bio_add_page() in a simple for loop.  The original
code has to go through the whole direct_io_worker(), do_direct_IO(),
submit_page_section(), dio_send_cur_page(), and dio_bio_add_page().

That's 8 lines of code in the fast path, and close to 400 lines of code in the 
generic section.   (Is it at all surprising the proposed fast path code is 
faster?)   Most of this is due to the need to deal with file systems where the 
block sizes don't line up with the page sizes.  So supporting file systems with 
sub-page-size block sizes is what introduces all of the hair.

-- Ted



^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] dio: Fast-path for page-aligned IOs
  2011-06-20 23:17 [PATCH] dio: Fast-path for page-aligned IOs Dan Ehrenberg
  2011-06-21 20:41 ` Andi Kleen
@ 2011-07-27 21:08 ` Christoph Hellwig
  2011-07-27 21:14   ` Andrew Morton
                     ` (1 more reply)
  1 sibling, 2 replies; 15+ messages in thread
From: Christoph Hellwig @ 2011-07-27 21:08 UTC (permalink / raw)
  To: Dan Ehrenberg; +Cc: Alexander Viro, linux-fsdevel, linux-kernel, Andrew Morton

On Mon, Jun 20, 2011 at 04:17:35PM -0700, Dan Ehrenberg wrote:
> The fast path does not apply for operations of the wrong size
> or alignmnent, or for operations on raw drives with 512-byte sectors.
> It might be possible to make this special case a little more general
> while maintaining its performance benefits, but I do not believe that
> the full performance benefits can be achieved without resorting to
> special handling of simple cases, as is done in this patch.

Did you check how this compares to Andi's small optimizations?

Also operations on raw disks are something people with fast devices
care about a lot.  We often hear about benchmark regressions due to
stupid little things in the direct I/O code.

If we want to special case something that would be a very easy target,
with a 1:1 mapping of logical to physical blocks and thus no need
to call the allocator first, and no need for any kind of locking
or alignment handling.
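
For a raw block device the mapping really is 1:1; the get_block
callback is essentially the following (a from-memory sketch of
blkdev_get_block() in fs/block_dev.c, eliding the end-of-device
check):

	static int blkdev_get_block(struct inode *inode, sector_t iblock,
			struct buffer_head *bh, int create)
	{
		bh->b_bdev = I_BDEV(inode);
		bh->b_blocknr = iblock;
		set_buffer_mapped(bh);
		return 0;
	}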


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] dio: Fast-path for page-aligned IOs
  2011-07-27 21:08 ` Christoph Hellwig
@ 2011-07-27 21:14   ` Andrew Morton
  2011-07-27 22:15     ` Jeff Moyer
  2011-07-28 22:09     ` Daniel Ehrenberg
  1 sibling, 1 reply; 15+ messages in thread
From: Andrew Morton @ 2011-07-27 21:14 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Dan Ehrenberg, Alexander Viro, linux-fsdevel, linux-kernel

On Wed, 27 Jul 2011 17:08:28 -0400
Christoph Hellwig <hch@infradead.org> wrote:

> On Mon, Jun 20, 2011 at 04:17:35PM -0700, Dan Ehrenberg wrote:
> > The fast path does not apply for operations of the wrong size
> > or alignment, or for operations on raw drives with 512-byte sectors.
> > It might be possible to make this special case a little more general
> > while maintaining its performance benefits, but I do not believe that
> > the full performance benefits can be achieved without resorting to
> > special handling of simple cases, as is done in this patch.
> 
> Did you check how this compares to Andi's small optimizations?
> 
> Also operations on raw disks are something people with fast devices
> care about a lot.  We often hear about benchmark regressions due to
> stupid little things in the direct I/O code.
> 
> If we want to special case something that would be a very easy target,
> with a 1:1 mapping of logical to physical blocks and thus no need
> to call the allocator first, and no need for any kind of locking
> or alignment handling.

Ken did this back in 2006
(http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff;h=e61c90188b9956edae1105eef361d8981a352fcd)
but we reverted that shortly afterwards for some reason.


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] dio: Fast-path for page-aligned IOs
  2011-07-27 21:14   ` Andrew Morton
@ 2011-07-27 22:15     ` Jeff Moyer
  0 siblings, 0 replies; 15+ messages in thread
From: Jeff Moyer @ 2011-07-27 22:15 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Christoph Hellwig, Dan Ehrenberg, Alexander Viro, linux-fsdevel,
	linux-kernel

Andrew Morton <akpm@linux-foundation.org> writes:

> On Wed, 27 Jul 2011 17:08:28 -0400
> Christoph Hellwig <hch@infradead.org> wrote:
>
>> On Mon, Jun 20, 2011 at 04:17:35PM -0700, Dan Ehrenberg wrote:
>> > The fast path does not apply for operations of the wrong size
>> > or alignment, or for operations on raw drives with 512-byte sectors.
>> > It might be possible to make this special case a little more general
>> > while maintaining its performance benefits, but I do not believe that
>> > the full performance benefits can be achieved without resorting to
>> > special handling of simple cases, as is done in this patch.
>> 
>> Did you check how this compares to Andi's small optimizations?
>> 
>> Also operations on raw disks are something people with fast devices
>> care about a lot.  We often hear about benchmark regressions due to
>> stupid little things in the direct I/O code.
>> 
>> If we want to special case something that would be a very easy target,
>> with a 1:1 mapping of logical to physical blocks and thus no need
>> to call the allocator first, and no need for any kind of locking
>> or alignment handling.
>
> Ken did this back in 2006
> (http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff;h=e61c90188b9956edae1105eef361d8981a352fcd)
> but we reverted that shortly afterwards for some reason.

For this reason:

commit b2e895dbd80c420bfc0937c3729b4afe073b3848
Author: Andrew Morton <akpm@osdl.org>
Date:   Sat Feb 3 01:14:01 2007 -0800

    [PATCH] revert blockdev direct io back to 2.6.19 version
    
    Andrew Vasquez is reporting as-iosched oopses and a 65% throughput
    slowdown due to the recent special-casing of direct-io against
    blockdevs.  We don't know why either of these things are occurring.
    
    The patch minimally reverts us back to the 2.6.19 code for a 2.6.20
    release.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] dio: Fast-path for page-aligned IOs
  2011-07-27 21:08 ` Christoph Hellwig
@ 2011-07-28 22:09     ` Daniel Ehrenberg
  1 sibling, 0 replies; 15+ messages in thread
From: Daniel Ehrenberg @ 2011-07-28 22:09 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Alexander Viro, linux-fsdevel, linux-kernel, Andrew Morton

On Wed, Jul 27, 2011 at 2:08 PM, Christoph Hellwig <hch@infradead.org> wrote:
> On Mon, Jun 20, 2011 at 04:17:35PM -0700, Dan Ehrenberg wrote:
>> The fast path does not apply for operations of the wrong size
>> or alignment, or for operations on raw drives with 512-byte sectors.
>> It might be possible to make this special case a little more general
>> while maintaining its performance benefits, but I do not believe that
>> the full performance benefits can be achieved without resorting to
>> special handling of simple cases, as is done in this patch.
>
> Did you check how this compares to Andi's small optimizations?

I'm having a little trouble getting his patch working. I hope to have
this data soon, but I've been distracted by some other things.
>
> Also operations on raw disks are something people with fast devices
> care about a lot.  We often hear about benchmark regressions due to
> stupid little things in the direct I/O code.
>
> If we want to special case something that would be a very easy target,
> with a 1:1 mapping of logical to physical blocks and thus no need
> to call the allocator first, and no need for any kind of locking
> or alignment handling.

Are you talking about special-casing a raw block device? I'd like the
optimization to also work with a file system to support a particular
workload I've been looking at.

Thanks,
Dan

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH] dio: Fast-path for page-aligned IOs
@ 2011-06-20 22:44 Dan Ehrenberg
  0 siblings, 0 replies; 15+ messages in thread
From: Dan Ehrenberg @ 2011-06-20 22:44 UTC (permalink / raw)
  To: Alexander Viro; +Cc: linux-fsdevel, linux-kernel, Andrew Morton, Dan Ehrenberg

This code introduces a fast-path variant of __blockdev_direct_IO
for the special case where the request size is a multiple of the page
size, the inode block size is a page, the user memory is page-aligned,
the underlying storage is contiguous on disk and the file location is
already initialized. The special case decreases the amount of
bookkeeping required, which saves a significant amount of CPU time on
a fast device such as a ramdisk or an SSD.  The patch is inspired by
earlier code by Ken Chen.

In my testing, this patch saves about a quarter of system time
compared to the old version when executed on a workload of 4k
random read operations on a high performance SSD. It does this
by nearly eliminating the CPU usage from fs/direct-io.c.  It only
works for this particular special case which we find to be common
in our workloads.

With hard drives, this optimization is less relevant because
(a) the long latency of a seek will mask most of this CPU time, and
(b) applications issue fewer, larger requests to minimize seeks,
thereby minimizing the CPU overhead from this code path. But an SSD
services requests as small as a page quickly. It is common
to use direct I/O here because the page cache overhead is easily
visible. And it is common to use operations which are multiples of
page size rather than smaller ones because most SSDs use 4k or bigger
blocks at the lowest level, even if the firmware may expose smaller
blocks.

The fast path does not apply for operations of the wrong size
or alignment, or for operations on raw drives with 512-byte sectors.
It might be possible to make this special case a little more general
while maintaining its performance benefits, but I do not believe that
the full performance benefits can be achieved without resorting to
special handling of simple cases, as is done in this patch.

Signed-off-by: Dan Ehrenberg <dehrenberg@google.com>
---
 fs/direct-io.c |  289 +++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 files changed, 264 insertions(+), 25 deletions(-)

diff --git a/fs/direct-io.c b/fs/direct-io.c
index ac5f164..8af2abf 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -38,6 +38,10 @@
 #include <asm/atomic.h>
 
 /*
+ * General case implementation of __blockdev_direct_IO
+ */
+
+/*
  * How many user pages to map in one call to get_user_pages().  This determines
  * the size of a structure on the stack.
  */
@@ -428,22 +432,22 @@ static struct bio *dio_await_one(struct dio *dio)
 /*
  * Process one completed BIO.  No locks are held.
  */
-static int dio_bio_complete(struct dio *dio, struct bio *bio)
+static int __dio_bio_complete(struct bio *bio, int is_async, int rw, int *error)
 {
 	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	struct bio_vec *bvec = bio->bi_io_vec;
 	int page_no;
 
 	if (!uptodate)
-		dio->io_error = -EIO;
+		*error = -EIO;
 
-	if (dio->is_async && dio->rw == READ) {
+	if (is_async && rw == READ) {
 		bio_check_pages_dirty(bio);	/* transfers ownership */
 	} else {
 		for (page_no = 0; page_no < bio->bi_vcnt; page_no++) {
 			struct page *page = bvec[page_no].bv_page;
 
-			if (dio->rw == READ && !PageCompound(page))
+			if (rw == READ && !PageCompound(page))
 				set_page_dirty_lock(page);
 			page_cache_release(page);
 		}
@@ -452,6 +456,11 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio)
 	return uptodate ? 0 : -EIO;
 }
 
+static int dio_bio_complete(struct dio *dio, struct bio *bio)
+{
+	return __dio_bio_complete(bio, dio->is_async, dio->rw, &dio->io_error);
+}
+
 /*
  * Wait on and process all in-flight BIOs.  This must only be called once
  * all bios have been issued so that the refcount can only decrease.
@@ -1137,27 +1146,8 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 	return ret;
 }
 
-/*
- * This is a library function for use by filesystem drivers.
- *
- * The locking rules are governed by the flags parameter:
- *  - if the flags value contains DIO_LOCKING we use a fancy locking
- *    scheme for dumb filesystems.
- *    For writes this function is called under i_mutex and returns with
- *    i_mutex held, for reads, i_mutex is not held on entry, but it is
- *    taken and dropped again before returning.
- *    For reads and writes i_alloc_sem is taken in shared mode and released
- *    on I/O completion (which may happen asynchronously after returning to
- *    the caller).
- *
- *  - if the flags value does NOT contain DIO_LOCKING we don't use any
- *    internal locking but rather rely on the filesystem to synchronize
- *    direct I/O reads/writes versus each other and truncate.
- *    For reads and writes both i_mutex and i_alloc_sem are not held on
- *    entry and are never taken.
- */
-ssize_t
-__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
+static ssize_t
+dio_generic(int rw, struct kiocb *iocb, struct inode *inode,
 	struct block_device *bdev, const struct iovec *iov, loff_t offset, 
 	unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
 	dio_submit_t submit_io,	int flags)
@@ -1253,4 +1243,253 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 out:
 	return retval;
 }
+
+/*
+ * Special case for __blockdev_direct_IO:
+ * page-aligned page-sized IOs when the inode block size is page-sized.
+ *
+ * The following code does the exact same thing as the above
+ * general-case code, but the smaller amount of bookkeeping
+ * has a big impact on CPU usage, which is visible on fast
+ * devices like SSDs.
+ */
+
+/*
+ * struct mini_dio has a subset of the fields in struct dio.
+ * For comments on fields, see that struct definition.
+ */
+struct mini_dio {
+	/* The following struct items are needed in the completion. */
+	dio_iodone_t *end_io;
+	struct kiocb *iocb;
+	loff_t offset;
+	int rw;
+	int flags;
+	struct inode *inode;
+	size_t size;
+
+	/*
+	 * The following struct items are allocated just because
+	 * they're big, and they are only used during submission.
+	 */
+	struct buffer_head map_bh;
+	struct page *pages[DIO_PAGES];
+};
+
+/*
+ * In this function we treat everything as if it's AIO to simplify the logic.
+ * The AIO code paths also handle the synchronous case appropriately.
+ */
+static void dio_fast_end_aio(struct bio *bio, int error)
+{
+	struct mini_dio *mdio = bio->bi_private;
+	struct kiocb *iocb = mdio->iocb;
+	int res;
+
+	__dio_bio_complete(bio, 1, mdio->rw, &error);
+
+	/* Could it be some other value? */
+	res = (error < 0) ? error : mdio->size;
+
+	if (mdio->end_io)
+		mdio->end_io(iocb, mdio->offset, res,
+			     mdio->map_bh.b_private, 0, 1);
+	else
+		aio_complete(iocb, res, 0);
+
+	if (mdio->flags & DIO_LOCKING)
+		up_read_non_owner(&mdio->inode->i_alloc_sem);
+
+	kfree(mdio);
+}
+
+static ssize_t dio_fast(int rw, struct kiocb *iocb, struct inode *inode,
+	struct block_device *bdev, const struct iovec *iov, loff_t offset,
+	get_block_t get_block, dio_iodone_t end_io, int flags)
+{
+	struct bio *bio;
+	struct mini_dio *mdio;
+	int res, i;
+	size_t size = iov[0].iov_len;
+	int num_pages = size >> PAGE_SHIFT;
+
+	mdio = kmalloc(sizeof(*mdio), GFP_KERNEL);
+	if (mdio == NULL)
+		goto fallback;
+	mdio->end_io = end_io;
+	mdio->iocb = iocb;
+	mdio->offset = offset;
+	mdio->rw = rw;
+	mdio->flags = flags;
+	mdio->inode = inode;
+	mdio->size = size;
+	/* get_block expects a partially initialized bh */
+	mdio->map_bh.b_size = size;
+	mdio->map_bh.b_state = 0;
+
+	res = get_user_pages_fast(
+		(unsigned long)iov[0].iov_base,	/* Where from */
+		num_pages,			/* How many pages */
+		rw == READ,			/* Write to memory? */
+		mdio->pages);			/* Put results here */
+	if (res < 0)
+		goto fallback_free;
+	if (res != num_pages) {
+		num_pages = res;	/* Used for how many to free */
+		goto fallback_pages;
+	}
+
+	if (flags & DIO_LOCKING) {
+		if (rw == READ) {
+			struct address_space *mapping =
+					iocb->ki_filp->f_mapping;
+
+			mutex_lock(&inode->i_mutex);
+
+			res = filemap_write_and_wait_range(
+				mapping, offset,
+				offset + size - 1);
+			if (res != 0)
+				goto fallback_unlock;
+		}
+
+		/*
+		 * The i_alloc_sem will be released at I/O completion,
+		 * possibly in a different thread.
+		 */
+		down_read_non_owner(&inode->i_alloc_sem);
+	}
+
+
+	if ((*get_block)(inode, offset >> PAGE_SHIFT, &mdio->map_bh, 0) != 0)
+		goto fallback_sem_up;
+	if (!buffer_mapped(&mdio->map_bh) || mdio->map_bh.b_size != size)
+		/*
+		 * Bail out to the general case.
+		 *
+		 * If the first condition is false, we've either
+		 * encountered a file hole, or we are appending to
+		 * the end of the file.
+		 *
+		 * The second condition will only be false if the
+		 * file system returns a mapping shorter than the
+		 * total size. Here, we only handle physically
+		 * contiguous mappings, which can be implemented
+		 * with a single bio.
+		 */
+		goto fallback_sem_up;
+	bio = bio_alloc(GFP_KERNEL, num_pages);
+	bio->bi_bdev = mdio->map_bh.b_bdev;
+	bio->bi_end_io = dio_fast_end_aio;
+	bio->bi_private = mdio;
+	/*
+	 * On this fast path, inode->i_blkbits is guaranteed
+	 * to equal PAGE_SHIFT.
+	 */
+	bio->bi_sector = mdio->map_bh.b_blocknr << (PAGE_SHIFT - 9);
+
+	for (i = 0; i < num_pages; i++) {
+		if (bio_add_page(bio, mdio->pages[i], PAGE_SIZE, 0)
+				!= PAGE_SIZE) {
+			bio_put(bio);
+			goto fallback_sem_up;
+		}
+	}
+	/* bio is ready, submit it */
+	if (rw == READ)
+		bio_set_pages_dirty(bio);
+	submit_bio(rw, bio);
+	if ((flags & DIO_LOCKING) && rw == READ)
+		mutex_unlock(&inode->i_mutex);
+	return -EIOCBQUEUED;
+
+fallback_sem_up:
+	if (flags & DIO_LOCKING)
+		up_read_non_owner(&inode->i_alloc_sem);
+fallback_unlock:
+	if ((flags & DIO_LOCKING) && rw == READ)
+		mutex_unlock(&inode->i_mutex);
+fallback_pages:
+	for (i = 0; i < num_pages; i++)
+		page_cache_release(mdio->pages[i]);
+fallback_free:
+	kfree(mdio);
+
+fallback:	/* Bail out to the general case */
+	return dio_generic(rw, iocb, inode,
+			bdev, iov, offset, 1,
+			get_block, end_io, flags);
+}
+
+/*
+ * __blockdev_direct_IO - issue a direct I/O request
+ *
+ * This is a library function for use by filesystem drivers in their
+ * direct I/O routine. It translates an I/O request into bios and
+ * sends those down to the block layer.
+ *
+ * @rw: either READ or WRITE
+ * @iocb: iocb of the request
+ * @inode: inode of the request
+ * @bdev: backing block device
+ * @iov: vector of ranges of user-space memory
+ * @offset: logical byte offset for the start of the IO
+ * @nr_segs: number of elements in iov
+ * @get_block: a callback to map blocks into a buffer_head
+ *    signature: int (*get_block)(struct inode *inode,
+ *		sector_t iblock, struct buffer_head *bh, int create)
+ *    On entry, bh will have its b_size field initialized with the
+ *    length of the request
+ *    On exit, get_block should have initialized bh. get_block returns
+ *    0 on success, or an error code if the block didn't exist. On success,
+ *    the bh will be mapped if possible, or marked as unmapped otherwise.
+ *    The boolean flag create indicates whether an allocation (or
+ *    initialization) should be attempted if no block is already present for
+ *    the logical block offset iblock.
+ * @end_io: a callback on IO completion, or NULL (defaulting to aio_complete)
+ *    signature: void (*end_io)(struct kiocb *iocb, loff_t offset,
+ *			    ssize_t size, void *private, int ret,
+ *			    bool is_async)
+ *    The private argument is extracted from bh.b_private, allowing get_block to
+ *    pass information to the completion.
+ * @flags: two flags allowed, DIO_LOCKING and DIO_SKIP_HOLES.
+ *    DIO_SKIP_HOLES indicates that this DIO request cannot fill
+ *    in a hole in the middle of a file.
+ *
+ * The locking rules are governed by the flags parameter:
+ *  - if the flags value contains DIO_LOCKING we use a fancy locking
+ *    scheme for dumb filesystems.
+ *    For writes this function is called under i_mutex and returns with
+ *    i_mutex held, for reads, i_mutex is not held on entry, but it is
+ *    taken and dropped again before returning.
+ *    For reads and writes i_alloc_sem is taken in shared mode and released
+ *    on I/O completion (which may happen asynchronously after returning to
+ *    the caller).
+ *
+ *  - if the flags value does NOT contain DIO_LOCKING we don't use any
+ *    internal locking but rather rely on the filesystem to synchronize
+ *    direct I/O reads/writes versus each other and truncate.
+ *    For reads and writes both i_mutex and i_alloc_sem are not held on
+ *    entry and are never taken.
+ */
+ssize_t
+__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
+	struct block_device *bdev, const struct iovec *iov, loff_t offset,
+	unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
+	int flags)
+{
+	BUG_ON(rw != READ && rw != WRITE);
+	if (inode->i_blkbits == PAGE_SHIFT
+	    && nr_segs == 1
+	    && IS_ALIGNED(offset, PAGE_SIZE)
+	    && IS_ALIGNED((unsigned long)iov[0].iov_base, PAGE_SIZE)
+	    && IS_ALIGNED(iov[0].iov_len, PAGE_SIZE)
+	    && iov[0].iov_len >> PAGE_SHIFT <= DIO_PAGES)
+		return dio_fast(rw, iocb, inode,
+				bdev, iov, offset,
+				get_block, end_io, flags);
+
+	return dio_generic(rw, iocb, inode,
+			bdev, iov, offset,
+			nr_segs, get_block, end_io, flags);
+}
 EXPORT_SYMBOL(__blockdev_direct_IO);
-- 
1.7.3.1


^ permalink raw reply related	[flat|nested] 15+ messages in thread


Thread overview: 10 messages
-- links below jump to the message on this page --
2011-06-20 23:17 [PATCH] dio: Fast-path for page-aligned IOs Dan Ehrenberg
2011-06-21 20:41 ` Andi Kleen
2011-06-21 21:21   ` Daniel Ehrenberg
2011-06-21 21:44     ` Andi Kleen
2011-06-29 21:03       ` Theodore Ts'o
2011-07-27 21:08 ` Christoph Hellwig
2011-07-27 21:14   ` Andrew Morton
2011-07-27 22:15     ` Jeff Moyer
2011-07-28 22:09   ` Daniel Ehrenberg
  -- strict thread matches above, loose matches on Subject: below --
2011-06-20 22:44 Dan Ehrenberg
