* Updated direct IO optimization patchkit v2
From: Andi Kleen @ 2011-08-02  4:38 UTC (permalink / raw)
  To: linux-kernel; +Cc: linux-fsdevel, hch

Various micro optimizations of the block direct IO path.

This addresses all review comments and adds an additional prefetch
optimization that is worth roughly another 0.3% in a large macro benchmark.
Overall the patchkit is worth about 2.2% in the same benchmark.

I also ported it to current mainline, resolving some conflicts with
hch's recent changes to direct-io.

-Andi



* [PATCH 01/11] DIO: Separate fields only used in the submission path from struct dio
From: Andi Kleen @ 2011-08-02  4:38 UTC (permalink / raw)
  To: linux-kernel; +Cc: linux-fsdevel, hch, Andi Kleen

From: Andi Kleen <ak@linux.intel.com>

This large, but largely mechanical, patch moves all fields in struct dio
that are only used in the submission path into a separate on-stack
data structure. This has the advantage that the memory is very likely
cache hot, which is not guaranteed for memory fresh out of kmalloc.

This also gives gcc more optimization potential because it can more
easily determine that there are no external aliases for these variables.

The sdio structure is now set up with an initializer instead of a memset.
This allows gcc to break sdio into individual fields and optimize
away unnecessary zeroing (once all the functions are inlined).
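
To illustrate the idea (a toy sketch with made-up names, not the actual
kernel structures): once the submission state is a local initialized with
a struct initializer and every user of it is inlined, gcc can treat the
fields like ordinary local variables.

	struct sub_state {
		unsigned int head;
		unsigned int tail;
		int pages_in_io;
	};

	static inline int do_io(struct sub_state *sdio)
	{
		return ++sdio->pages_in_io;	/* inlined below */
	}

	int do_request(void)
	{
		struct sub_state sdio = { 0 };	/* initializer, not memset() */

		/* do_io() is inlined, so &sdio never escapes; gcc can
		 * split sdio into registers and drop dead zero stores */
		return do_io(&sdio);
	}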

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 fs/direct-io.c |  395 +++++++++++++++++++++++++++++---------------------------
 1 files changed, 203 insertions(+), 192 deletions(-)

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 44a360c..02943e3 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -55,13 +55,10 @@
  * blocksize.
  */
 
-struct dio {
-	/* BIO submission state */
+/* dio_state only used in the submission path */
+
+struct dio_submit {
 	struct bio *bio;		/* bio under assembly */
-	struct inode *inode;
-	int rw;
-	loff_t i_size;			/* i_size when submitted */
-	int flags;			/* doesn't change */
 	unsigned blkbits;		/* doesn't change */
 	unsigned blkfactor;		/* When we're using an alignment which
 					   is finer than the filesystem's soft
@@ -71,7 +68,7 @@ struct dio {
 	unsigned start_zero_done;	/* flag: sub-blocksize zeroing has
 					   been performed at the start of a
 					   write */
-	int pages_in_io;		/* approximate total IO pages */
+	int pages_in_io;		/* approximate total IO pages */	
 	size_t	size;			/* total request size (doesn't change)*/
 	sector_t block_in_file;		/* Current offset into the underlying
 					   file in dio_block units. */
@@ -81,14 +78,13 @@ struct dio {
 	int boundary;			/* prev block is at a boundary */
 	int reap_counter;		/* rate limit reaping */
 	get_block_t *get_block;		/* block mapping function */
-	dio_iodone_t *end_io;		/* IO completion function */
-	dio_submit_t *submit_io;	/* IO submition function */
+	dio_submit_t *submit_io;	/* IO submition function */	
+
 	loff_t logical_offset_in_bio;	/* current first logical block in bio */
 	sector_t final_block_in_bio;	/* current final block in bio + 1 */
 	sector_t next_block_for_io;	/* next block to be put under IO,
 					   in dio_blocks units */
-	struct buffer_head map_bh;	/* last get_block() result */
-
+	
 	/*
 	 * Deferred addition of a page to the dio.  These variables are
 	 * private to dio_send_cur_page(), submit_page_section() and
@@ -100,18 +96,6 @@ struct dio {
 	sector_t cur_page_block;	/* Where it starts */
 	loff_t cur_page_fs_offset;	/* Offset in file */
 
-	/* BIO completion state */
-	spinlock_t bio_lock;		/* protects BIO fields below */
-	unsigned long refcount;		/* direct_io_worker() and bios */
-	struct bio *bio_list;		/* singly linked via bi_private */
-	struct task_struct *waiter;	/* waiting task (NULL if none) */
-
-	/* AIO related stuff */
-	struct kiocb *iocb;		/* kiocb */
-	int is_async;			/* is IO async ? */
-	int io_error;			/* IO error in completion path */
-	ssize_t result;                 /* IO result */
-
 	/*
 	 * Page fetching state. These variables belong to dio_refill_pages().
 	 */
@@ -125,6 +109,30 @@ struct dio {
 	 */
 	unsigned head;			/* next page to process */
 	unsigned tail;			/* last valid page + 1 */
+};
+
+/* dio_state communicated between submission path and end_io */
+struct dio {
+	int flags;			/* doesn't change */
+	struct inode *inode;
+	int rw;
+	loff_t i_size;			/* i_size when submitted */
+	dio_iodone_t *end_io;		/* IO completion function */
+	struct buffer_head map_bh;	/* last get_block() result */
+
+
+	/* BIO completion state */
+	spinlock_t bio_lock;		/* protects BIO fields below */
+	unsigned long refcount;		/* direct_io_worker() and bios */
+	struct bio *bio_list;		/* singly linked via bi_private */
+	struct task_struct *waiter;	/* waiting task (NULL if none) */
+
+	/* AIO related stuff */
+	struct kiocb *iocb;		/* kiocb */
+	int is_async;			/* is IO async ? */
+	int io_error;			/* IO error in completion path */
+	ssize_t result;                 /* IO result */
+
 	int page_errors;		/* errno from get_user_pages() */
 
 	/*
@@ -132,7 +140,7 @@ struct dio {
 	 * allocation time.  Don't add new fields after pages[] unless you
 	 * wish that they not be zeroed.
 	 */
-	struct page *pages[DIO_PAGES];	/* page buffer */
+	struct page *pages[DIO_PAGES];	/* page buffer */	
 };
 
 static void __inode_dio_wait(struct inode *inode)
@@ -182,27 +190,27 @@ EXPORT_SYMBOL_GPL(inode_dio_done);
 /*
  * How many pages are in the queue?
  */
-static inline unsigned dio_pages_present(struct dio *dio)
+static inline unsigned dio_pages_present(struct dio *dio, struct dio_submit *sdio)
 {
-	return dio->tail - dio->head;
+	return sdio->tail - sdio->head;
 }
 
 /*
  * Go grab and pin some userspace pages.   Typically we'll get 64 at a time.
  */
-static int dio_refill_pages(struct dio *dio)
+static int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
 {
 	int ret;
 	int nr_pages;
 
-	nr_pages = min(dio->total_pages - dio->curr_page, DIO_PAGES);
+	nr_pages = min(sdio->total_pages - sdio->curr_page, DIO_PAGES);
 	ret = get_user_pages_fast(
-		dio->curr_user_address,		/* Where from? */
+		sdio->curr_user_address,		/* Where from? */
 		nr_pages,			/* How many pages? */
 		dio->rw == READ,		/* Write to memory? */
 		&dio->pages[0]);		/* Put results here */
 
-	if (ret < 0 && dio->blocks_available && (dio->rw & WRITE)) {
+	if (ret < 0 && sdio->blocks_available && (dio->rw & WRITE)) {
 		struct page *page = ZERO_PAGE(0);
 		/*
 		 * A memory fault, but the filesystem has some outstanding
@@ -213,17 +221,17 @@ static int dio_refill_pages(struct dio *dio)
 			dio->page_errors = ret;
 		page_cache_get(page);
 		dio->pages[0] = page;
-		dio->head = 0;
-		dio->tail = 1;
+		sdio->head = 0;
+		sdio->tail = 1;
 		ret = 0;
 		goto out;
 	}
 
 	if (ret >= 0) {
-		dio->curr_user_address += ret * PAGE_SIZE;
-		dio->curr_page += ret;
-		dio->head = 0;
-		dio->tail = ret;
+		sdio->curr_user_address += ret * PAGE_SIZE;
+		sdio->curr_page += ret;
+		sdio->head = 0;
+		sdio->tail = ret;
 		ret = 0;
 	}
 out:
@@ -236,17 +244,17 @@ out:
  * decent number of pages, less frequently.  To provide nicer use of the
  * L1 cache.
  */
-static struct page *dio_get_page(struct dio *dio)
+static struct page *dio_get_page(struct dio *dio, struct dio_submit *sdio)
 {
-	if (dio_pages_present(dio) == 0) {
+	if (dio_pages_present(dio, sdio) == 0) {
 		int ret;
 
-		ret = dio_refill_pages(dio);
+		ret = dio_refill_pages(dio, sdio);
 		if (ret)
 			return ERR_PTR(ret);
-		BUG_ON(dio_pages_present(dio) == 0);
+		BUG_ON(dio_pages_present(dio, sdio) == 0);
 	}
-	return dio->pages[dio->head++];
+	return dio->pages[sdio->head++];
 }
 
 /**
@@ -368,8 +376,9 @@ void dio_end_io(struct bio *bio, int error)
 EXPORT_SYMBOL_GPL(dio_end_io);
 
 static void
-dio_bio_alloc(struct dio *dio, struct block_device *bdev,
-		sector_t first_sector, int nr_vecs)
+dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
+	      struct block_device *bdev,
+	      sector_t first_sector, int nr_vecs)
 {
 	struct bio *bio;
 
@@ -386,8 +395,8 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev,
 	else
 		bio->bi_end_io = dio_bio_end_io;
 
-	dio->bio = bio;
-	dio->logical_offset_in_bio = dio->cur_page_fs_offset;
+	sdio->bio = bio;
+	sdio->logical_offset_in_bio = sdio->cur_page_fs_offset;
 }
 
 /*
@@ -397,9 +406,9 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev,
  *
  * bios hold a dio reference between submit_bio and ->end_io.
  */
-static void dio_bio_submit(struct dio *dio)
+static void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
 {
-	struct bio *bio = dio->bio;
+	struct bio *bio = sdio->bio;
 	unsigned long flags;
 
 	bio->bi_private = dio;
@@ -411,24 +420,24 @@ static void dio_bio_submit(struct dio *dio)
 	if (dio->is_async && dio->rw == READ)
 		bio_set_pages_dirty(bio);
 
-	if (dio->submit_io)
-		dio->submit_io(dio->rw, bio, dio->inode,
-			       dio->logical_offset_in_bio);
+	if (sdio->submit_io)
+		sdio->submit_io(dio->rw, bio, dio->inode,
+			       sdio->logical_offset_in_bio);
 	else
 		submit_bio(dio->rw, bio);
 
-	dio->bio = NULL;
-	dio->boundary = 0;
-	dio->logical_offset_in_bio = 0;
+	sdio->bio = NULL;
+	sdio->boundary = 0;
+	sdio->logical_offset_in_bio = 0;
 }
 
 /*
  * Release any resources in case of a failure
  */
-static void dio_cleanup(struct dio *dio)
+static void dio_cleanup(struct dio *dio, struct dio_submit *sdio)
 {
-	while (dio_pages_present(dio))
-		page_cache_release(dio_get_page(dio));
+	while (dio_pages_present(dio, sdio))
+		page_cache_release(dio_get_page(dio, sdio));
 }
 
 /*
@@ -518,11 +527,11 @@ static void dio_await_completion(struct dio *dio)
  *
  * This also helps to limit the peak amount of pinned userspace memory.
  */
-static int dio_bio_reap(struct dio *dio)
+static int dio_bio_reap(struct dio *dio, struct dio_submit *sdio)
 {
 	int ret = 0;
 
-	if (dio->reap_counter++ >= 64) {
+	if (sdio->reap_counter++ >= 64) {
 		while (dio->bio_list) {
 			unsigned long flags;
 			struct bio *bio;
@@ -536,14 +545,14 @@ static int dio_bio_reap(struct dio *dio)
 			if (ret == 0)
 				ret = ret2;
 		}
-		dio->reap_counter = 0;
+		sdio->reap_counter = 0;
 	}
 	return ret;
 }
 
 /*
  * Call into the fs to map some more disk blocks.  We record the current number
- * of available blocks at dio->blocks_available.  These are in units of the
+ * of available blocks at sdio->blocks_available.  These are in units of the
  * fs blocksize, (1 << inode->i_blkbits).
  *
  * The fs is allowed to map lots of blocks at once.  If it wants to do that,
@@ -564,7 +573,7 @@ static int dio_bio_reap(struct dio *dio)
  * buffer_mapped().  However the direct-io code will only process holes one
  * block at a time - it will repeatedly call get_block() as it walks the hole.
  */
-static int get_more_blocks(struct dio *dio)
+static int get_more_blocks(struct dio *dio, struct dio_submit *sdio)
 {
 	int ret;
 	struct buffer_head *map_bh = &dio->map_bh;
@@ -580,11 +589,11 @@ static int get_more_blocks(struct dio *dio)
 	 */
 	ret = dio->page_errors;
 	if (ret == 0) {
-		BUG_ON(dio->block_in_file >= dio->final_block_in_request);
-		fs_startblk = dio->block_in_file >> dio->blkfactor;
-		dio_count = dio->final_block_in_request - dio->block_in_file;
-		fs_count = dio_count >> dio->blkfactor;
-		blkmask = (1 << dio->blkfactor) - 1;
+		BUG_ON(sdio->block_in_file >= sdio->final_block_in_request);
+		fs_startblk = sdio->block_in_file >> sdio->blkfactor;
+		dio_count = sdio->final_block_in_request - sdio->block_in_file;
+		fs_count = dio_count >> sdio->blkfactor;
+		blkmask = (1 << sdio->blkfactor) - 1;
 		if (dio_count & blkmask)	
 			fs_count++;
 
@@ -604,12 +613,12 @@ static int get_more_blocks(struct dio *dio)
 		 */
 		create = dio->rw & WRITE;
 		if (dio->flags & DIO_SKIP_HOLES) {
-			if (dio->block_in_file < (i_size_read(dio->inode) >>
-							dio->blkbits))
+			if (sdio->block_in_file < (i_size_read(dio->inode) >>
+							sdio->blkbits))
 				create = 0;
 		}
 
-		ret = (*dio->get_block)(dio->inode, fs_startblk,
+		ret = (*sdio->get_block)(dio->inode, fs_startblk,
 						map_bh, create);
 	}
 	return ret;
@@ -618,20 +627,21 @@ static int get_more_blocks(struct dio *dio)
 /*
  * There is no bio.  Make one now.
  */
-static int dio_new_bio(struct dio *dio, sector_t start_sector)
+static int dio_new_bio(struct dio *dio, struct dio_submit *sdio, 
+		       sector_t start_sector)
 {
 	sector_t sector;
 	int ret, nr_pages;
 
-	ret = dio_bio_reap(dio);
+	ret = dio_bio_reap(dio, sdio);
 	if (ret)
 		goto out;
-	sector = start_sector << (dio->blkbits - 9);
-	nr_pages = min(dio->pages_in_io, bio_get_nr_vecs(dio->map_bh.b_bdev));
+	sector = start_sector << (sdio->blkbits - 9);
+	nr_pages = min(sdio->pages_in_io, bio_get_nr_vecs(dio->map_bh.b_bdev));
 	nr_pages = min(nr_pages, BIO_MAX_PAGES);
 	BUG_ON(nr_pages <= 0);
-	dio_bio_alloc(dio, dio->map_bh.b_bdev, sector, nr_pages);
-	dio->boundary = 0;
+	dio_bio_alloc(dio, sdio, dio->map_bh.b_bdev, sector, nr_pages);
+	sdio->boundary = 0;
 out:
 	return ret;
 }
@@ -643,21 +653,21 @@ out:
  *
  * Return zero on success.  Non-zero means the caller needs to start a new BIO.
  */
-static int dio_bio_add_page(struct dio *dio)
+static int dio_bio_add_page(struct dio *dio, struct dio_submit *sdio)
 {
 	int ret;
 
-	ret = bio_add_page(dio->bio, dio->cur_page,
-			dio->cur_page_len, dio->cur_page_offset);
-	if (ret == dio->cur_page_len) {
+	ret = bio_add_page(sdio->bio, sdio->cur_page,
+			sdio->cur_page_len, sdio->cur_page_offset);
+	if (ret == sdio->cur_page_len) {
 		/*
 		 * Decrement count only, if we are done with this page
 		 */
-		if ((dio->cur_page_len + dio->cur_page_offset) == PAGE_SIZE)
-			dio->pages_in_io--;
-		page_cache_get(dio->cur_page);
-		dio->final_block_in_bio = dio->cur_page_block +
-			(dio->cur_page_len >> dio->blkbits);
+		if ((sdio->cur_page_len + sdio->cur_page_offset) == PAGE_SIZE)
+			sdio->pages_in_io--;
+		page_cache_get(sdio->cur_page);
+		sdio->final_block_in_bio = sdio->cur_page_block +
+			(sdio->cur_page_len >> sdio->blkbits);
 		ret = 0;
 	} else {
 		ret = 1;
@@ -675,14 +685,14 @@ static int dio_bio_add_page(struct dio *dio)
  * The caller of this function is responsible for removing cur_page from the
  * dio, and for dropping the refcount which came from that presence.
  */
-static int dio_send_cur_page(struct dio *dio)
+static int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio)
 {
 	int ret = 0;
 
-	if (dio->bio) {
-		loff_t cur_offset = dio->cur_page_fs_offset;
-		loff_t bio_next_offset = dio->logical_offset_in_bio +
-			dio->bio->bi_size;
+	if (sdio->bio) {
+		loff_t cur_offset = sdio->cur_page_fs_offset;
+		loff_t bio_next_offset = sdio->logical_offset_in_bio +
+			sdio->bio->bi_size;
 
 		/*
 		 * See whether this new request is contiguous with the old.
@@ -698,28 +708,28 @@ static int dio_send_cur_page(struct dio *dio)
 		 * be the next logical offset in the bio, submit the bio we
 		 * have.
 		 */
-		if (dio->final_block_in_bio != dio->cur_page_block ||
+		if (sdio->final_block_in_bio != sdio->cur_page_block ||
 		    cur_offset != bio_next_offset)
-			dio_bio_submit(dio);
+			dio_bio_submit(dio, sdio);
 		/*
 		 * Submit now if the underlying fs is about to perform a
 		 * metadata read
 		 */
-		else if (dio->boundary)
-			dio_bio_submit(dio);
+		else if (sdio->boundary)
+			dio_bio_submit(dio, sdio);
 	}
 
-	if (dio->bio == NULL) {
-		ret = dio_new_bio(dio, dio->cur_page_block);
+	if (sdio->bio == NULL) {
+		ret = dio_new_bio(dio, sdio, sdio->cur_page_block);
 		if (ret)
 			goto out;
 	}
 
-	if (dio_bio_add_page(dio) != 0) {
-		dio_bio_submit(dio);
-		ret = dio_new_bio(dio, dio->cur_page_block);
+	if (dio_bio_add_page(dio, sdio) != 0) {
+		dio_bio_submit(dio, sdio);
+		ret = dio_new_bio(dio, sdio, sdio->cur_page_block);
 		if (ret == 0) {
-			ret = dio_bio_add_page(dio);
+			ret = dio_bio_add_page(dio, sdio);
 			BUG_ON(ret != 0);
 		}
 	}
@@ -745,7 +755,7 @@ out:
  * page to the dio instead.
  */
 static int
-submit_page_section(struct dio *dio, struct page *page,
+submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page,
 		unsigned offset, unsigned len, sector_t blocknr)
 {
 	int ret = 0;
@@ -760,20 +770,20 @@ submit_page_section(struct dio *dio, struct page *page,
 	/*
 	 * Can we just grow the current page's presence in the dio?
 	 */
-	if (	(dio->cur_page == page) &&
-		(dio->cur_page_offset + dio->cur_page_len == offset) &&
-		(dio->cur_page_block +
-			(dio->cur_page_len >> dio->blkbits) == blocknr)) {
-		dio->cur_page_len += len;
+	if (	(sdio->cur_page == page) &&
+		(sdio->cur_page_offset + sdio->cur_page_len == offset) &&
+		(sdio->cur_page_block +
+			(sdio->cur_page_len >> sdio->blkbits) == blocknr)) {
+		sdio->cur_page_len += len;
 
 		/*
-		 * If dio->boundary then we want to schedule the IO now to
+		 * If sdio->boundary then we want to schedule the IO now to
 		 * avoid metadata seeks.
 		 */
-		if (dio->boundary) {
-			ret = dio_send_cur_page(dio);
-			page_cache_release(dio->cur_page);
-			dio->cur_page = NULL;
+		if (sdio->boundary) {
+			ret = dio_send_cur_page(dio, sdio);
+			page_cache_release(sdio->cur_page);
+			sdio->cur_page = NULL;
 		}
 		goto out;
 	}
@@ -781,20 +791,20 @@ submit_page_section(struct dio *dio, struct page *page,
 	/*
 	 * If there's a deferred page already there then send it.
 	 */
-	if (dio->cur_page) {
-		ret = dio_send_cur_page(dio);
-		page_cache_release(dio->cur_page);
-		dio->cur_page = NULL;
+	if (sdio->cur_page) {
+		ret = dio_send_cur_page(dio, sdio);
+		page_cache_release(sdio->cur_page);
+		sdio->cur_page = NULL;
 		if (ret)
 			goto out;
 	}
 
 	page_cache_get(page);		/* It is in dio */
-	dio->cur_page = page;
-	dio->cur_page_offset = offset;
-	dio->cur_page_len = len;
-	dio->cur_page_block = blocknr;
-	dio->cur_page_fs_offset = dio->block_in_file << dio->blkbits;
+	sdio->cur_page = page;
+	sdio->cur_page_offset = offset;
+	sdio->cur_page_len = len;
+	sdio->cur_page_block = blocknr;
+	sdio->cur_page_fs_offset = sdio->block_in_file << sdio->blkbits;
 out:
 	return ret;
 }
@@ -826,19 +836,19 @@ static void clean_blockdev_aliases(struct dio *dio)
  * `end' is zero if we're doing the start of the IO, 1 at the end of the
  * IO.
  */
-static void dio_zero_block(struct dio *dio, int end)
+static void dio_zero_block(struct dio *dio, struct dio_submit *sdio, int end)
 {
 	unsigned dio_blocks_per_fs_block;
 	unsigned this_chunk_blocks;	/* In dio_blocks */
 	unsigned this_chunk_bytes;
 	struct page *page;
 
-	dio->start_zero_done = 1;
-	if (!dio->blkfactor || !buffer_new(&dio->map_bh))
+	sdio->start_zero_done = 1;
+	if (!sdio->blkfactor || !buffer_new(&dio->map_bh))
 		return;
 
-	dio_blocks_per_fs_block = 1 << dio->blkfactor;
-	this_chunk_blocks = dio->block_in_file & (dio_blocks_per_fs_block - 1);
+	dio_blocks_per_fs_block = 1 << sdio->blkfactor;
+	this_chunk_blocks = sdio->block_in_file & (dio_blocks_per_fs_block - 1);
 
 	if (!this_chunk_blocks)
 		return;
@@ -850,14 +860,14 @@ static void dio_zero_block(struct dio *dio, int end)
 	if (end) 
 		this_chunk_blocks = dio_blocks_per_fs_block - this_chunk_blocks;
 
-	this_chunk_bytes = this_chunk_blocks << dio->blkbits;
+	this_chunk_bytes = this_chunk_blocks << sdio->blkbits;
 
 	page = ZERO_PAGE(0);
-	if (submit_page_section(dio, page, 0, this_chunk_bytes, 
-				dio->next_block_for_io))
+	if (submit_page_section(dio, sdio, page, 0, this_chunk_bytes, 
+				sdio->next_block_for_io))
 		return;
 
-	dio->next_block_for_io += this_chunk_blocks;
+	sdio->next_block_for_io += this_chunk_blocks;
 }
 
 /*
@@ -876,9 +886,9 @@ static void dio_zero_block(struct dio *dio, int end)
  * it should set b_size to PAGE_SIZE or more inside get_block().  This gives
  * fine alignment but still allows this function to work in PAGE_SIZE units.
  */
-static int do_direct_IO(struct dio *dio)
+static int do_direct_IO(struct dio *dio, struct dio_submit *sdio)
 {
-	const unsigned blkbits = dio->blkbits;
+	const unsigned blkbits = sdio->blkbits;
 	const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
 	struct page *page;
 	unsigned block_in_page;
@@ -886,10 +896,10 @@ static int do_direct_IO(struct dio *dio)
 	int ret = 0;
 
 	/* The I/O can start at any block offset within the first page */
-	block_in_page = dio->first_block_in_page;
+	block_in_page = sdio->first_block_in_page;
 
-	while (dio->block_in_file < dio->final_block_in_request) {
-		page = dio_get_page(dio);
+	while (sdio->block_in_file < sdio->final_block_in_request) {
+		page = dio_get_page(dio, sdio);
 		if (IS_ERR(page)) {
 			ret = PTR_ERR(page);
 			goto out;
@@ -901,14 +911,14 @@ static int do_direct_IO(struct dio *dio)
 			unsigned this_chunk_blocks;	/* # of blocks */
 			unsigned u;
 
-			if (dio->blocks_available == 0) {
+			if (sdio->blocks_available == 0) {
 				/*
 				 * Need to go and map some more disk
 				 */
 				unsigned long blkmask;
 				unsigned long dio_remainder;
 
-				ret = get_more_blocks(dio);
+				ret = get_more_blocks(dio, sdio);
 				if (ret) {
 					page_cache_release(page);
 					goto out;
@@ -916,18 +926,18 @@ static int do_direct_IO(struct dio *dio)
 				if (!buffer_mapped(map_bh))
 					goto do_holes;
 
-				dio->blocks_available =
-						map_bh->b_size >> dio->blkbits;
-				dio->next_block_for_io =
-					map_bh->b_blocknr << dio->blkfactor;
+				sdio->blocks_available =
+						map_bh->b_size >> sdio->blkbits;
+				sdio->next_block_for_io =
+					map_bh->b_blocknr << sdio->blkfactor;
 				if (buffer_new(map_bh))
 					clean_blockdev_aliases(dio);
 
-				if (!dio->blkfactor)
+				if (!sdio->blkfactor)
 					goto do_holes;
 
-				blkmask = (1 << dio->blkfactor) - 1;
-				dio_remainder = (dio->block_in_file & blkmask);
+				blkmask = (1 << sdio->blkfactor) - 1;
+				dio_remainder = (sdio->block_in_file & blkmask);
 
 				/*
 				 * If we are at the start of IO and that IO
@@ -941,8 +951,8 @@ static int do_direct_IO(struct dio *dio)
 				 * on-disk
 				 */
 				if (!buffer_new(map_bh))
-					dio->next_block_for_io += dio_remainder;
-				dio->blocks_available -= dio_remainder;
+					sdio->next_block_for_io += dio_remainder;
+				sdio->blocks_available -= dio_remainder;
 			}
 do_holes:
 			/* Handle holes */
@@ -961,7 +971,7 @@ do_holes:
 				 */
 				i_size_aligned = ALIGN(i_size_read(dio->inode),
 							1 << blkbits);
-				if (dio->block_in_file >=
+				if (sdio->block_in_file >=
 						i_size_aligned >> blkbits) {
 					/* We hit eof */
 					page_cache_release(page);
@@ -969,7 +979,7 @@ do_holes:
 				}
 				zero_user(page, block_in_page << blkbits,
 						1 << blkbits);
-				dio->block_in_file++;
+				sdio->block_in_file++;
 				block_in_page++;
 				goto next_block;
 			}
@@ -979,38 +989,38 @@ do_holes:
 			 * is finer than the underlying fs, go check to see if
 			 * we must zero out the start of this block.
 			 */
-			if (unlikely(dio->blkfactor && !dio->start_zero_done))
-				dio_zero_block(dio, 0);
+			if (unlikely(sdio->blkfactor && !sdio->start_zero_done))
+				dio_zero_block(dio, sdio, 0);
 
 			/*
 			 * Work out, in this_chunk_blocks, how much disk we
 			 * can add to this page
 			 */
-			this_chunk_blocks = dio->blocks_available;
+			this_chunk_blocks = sdio->blocks_available;
 			u = (PAGE_SIZE - offset_in_page) >> blkbits;
 			if (this_chunk_blocks > u)
 				this_chunk_blocks = u;
-			u = dio->final_block_in_request - dio->block_in_file;
+			u = sdio->final_block_in_request - sdio->block_in_file;
 			if (this_chunk_blocks > u)
 				this_chunk_blocks = u;
 			this_chunk_bytes = this_chunk_blocks << blkbits;
 			BUG_ON(this_chunk_bytes == 0);
 
-			dio->boundary = buffer_boundary(map_bh);
-			ret = submit_page_section(dio, page, offset_in_page,
-				this_chunk_bytes, dio->next_block_for_io);
+			sdio->boundary = buffer_boundary(map_bh);
+			ret = submit_page_section(dio, sdio, page, offset_in_page,
+				this_chunk_bytes, sdio->next_block_for_io);
 			if (ret) {
 				page_cache_release(page);
 				goto out;
 			}
-			dio->next_block_for_io += this_chunk_blocks;
+			sdio->next_block_for_io += this_chunk_blocks;
 
-			dio->block_in_file += this_chunk_blocks;
+			sdio->block_in_file += this_chunk_blocks;
 			block_in_page += this_chunk_blocks;
-			dio->blocks_available -= this_chunk_blocks;
+			sdio->blocks_available -= this_chunk_blocks;
 next_block:
-			BUG_ON(dio->block_in_file > dio->final_block_in_request);
-			if (dio->block_in_file == dio->final_block_in_request)
+			BUG_ON(sdio->block_in_file > sdio->final_block_in_request);
+			if (sdio->block_in_file == sdio->final_block_in_request)
 				break;
 		}
 
@@ -1026,7 +1036,7 @@ static ssize_t
 direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, 
 	const struct iovec *iov, loff_t offset, unsigned long nr_segs, 
 	unsigned blkbits, get_block_t get_block, dio_iodone_t end_io,
-	dio_submit_t submit_io, struct dio *dio)
+	dio_submit_t submit_io, struct dio *dio, struct dio_submit *sdio)
 {
 	unsigned long user_addr; 
 	unsigned long flags;
@@ -1037,15 +1047,15 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 
 	dio->inode = inode;
 	dio->rw = rw;
-	dio->blkbits = blkbits;
-	dio->blkfactor = inode->i_blkbits - blkbits;
-	dio->block_in_file = offset >> blkbits;
+	sdio->blkbits = blkbits;
+	sdio->blkfactor = inode->i_blkbits - blkbits;
+	sdio->block_in_file = offset >> blkbits;
 
-	dio->get_block = get_block;
+	sdio->get_block = get_block;
 	dio->end_io = end_io;
-	dio->submit_io = submit_io;
-	dio->final_block_in_bio = -1;
-	dio->next_block_for_io = -1;
+	sdio->submit_io = submit_io;
+	sdio->final_block_in_bio = -1;
+	sdio->next_block_for_io = -1;
 
 	dio->iocb = iocb;
 	dio->i_size = i_size_read(inode);
@@ -1057,45 +1067,45 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 	 * In case of non-aligned buffers, we may need 2 more
 	 * pages since we need to zero out first and last block.
 	 */
-	if (unlikely(dio->blkfactor))
-		dio->pages_in_io = 2;
+	if (unlikely(sdio->blkfactor))
+		sdio->pages_in_io = 2;
 
 	for (seg = 0; seg < nr_segs; seg++) {
 		user_addr = (unsigned long)iov[seg].iov_base;
-		dio->pages_in_io +=
+		sdio->pages_in_io +=
 			((user_addr+iov[seg].iov_len +PAGE_SIZE-1)/PAGE_SIZE
 				- user_addr/PAGE_SIZE);
 	}
 
 	for (seg = 0; seg < nr_segs; seg++) {
 		user_addr = (unsigned long)iov[seg].iov_base;
-		dio->size += bytes = iov[seg].iov_len;
+		sdio->size += bytes = iov[seg].iov_len;
 
 		/* Index into the first page of the first block */
-		dio->first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits;
-		dio->final_block_in_request = dio->block_in_file +
+		sdio->first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits;
+		sdio->final_block_in_request = sdio->block_in_file +
 						(bytes >> blkbits);
 		/* Page fetching state */
-		dio->head = 0;
-		dio->tail = 0;
-		dio->curr_page = 0;
+		sdio->head = 0;
+		sdio->tail = 0;
+		sdio->curr_page = 0;
 
-		dio->total_pages = 0;
+		sdio->total_pages = 0;
 		if (user_addr & (PAGE_SIZE-1)) {
-			dio->total_pages++;
+			sdio->total_pages++;
 			bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1));
 		}
-		dio->total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
-		dio->curr_user_address = user_addr;
+		sdio->total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
+		sdio->curr_user_address = user_addr;
 	
-		ret = do_direct_IO(dio);
+		ret = do_direct_IO(dio, sdio);
 
 		dio->result += iov[seg].iov_len -
-			((dio->final_block_in_request - dio->block_in_file) <<
+			((sdio->final_block_in_request - sdio->block_in_file) <<
 					blkbits);
 
 		if (ret) {
-			dio_cleanup(dio);
+			dio_cleanup(dio, sdio);
 			break;
 		}
 	} /* end iovec loop */
@@ -1111,23 +1121,23 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 	 * There may be some unwritten disk at the end of a part-written
 	 * fs-block-sized block.  Go zero that now.
 	 */
-	dio_zero_block(dio, 1);
+	dio_zero_block(dio, sdio, 1);
 
-	if (dio->cur_page) {
-		ret2 = dio_send_cur_page(dio);
+	if (sdio->cur_page) {
+		ret2 = dio_send_cur_page(dio, sdio);
 		if (ret == 0)
 			ret = ret2;
-		page_cache_release(dio->cur_page);
-		dio->cur_page = NULL;
+		page_cache_release(sdio->cur_page);
+		sdio->cur_page = NULL;
 	}
-	if (dio->bio)
-		dio_bio_submit(dio);
+	if (sdio->bio)
+		dio_bio_submit(dio, sdio);
 
 	/*
 	 * It is possible that, we return short IO due to end of file.
 	 * In that case, we need to release all the pages we got hold on.
 	 */
-	dio_cleanup(dio);
+	dio_cleanup(dio, sdio);
 
 	/*
 	 * All block lookups have been performed. For READ requests
@@ -1146,7 +1156,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 	 */
 	BUG_ON(ret == -EIOCBQUEUED);
 	if (dio->is_async && ret == 0 && dio->result &&
-	    ((rw & READ) || (dio->result == dio->size)))
+	    ((rw & READ) || (dio->result == sdio->size)))
 		ret = -EIOCBQUEUED;
 
 	if (ret != -EIOCBQUEUED)
@@ -1211,6 +1221,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	ssize_t retval = -EINVAL;
 	loff_t end = offset;
 	struct dio *dio;
+	struct dio_submit sdio = { 0, };
 
 	if (rw & WRITE)
 		rw = WRITE_ODIRECT;
@@ -1290,7 +1301,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 
 	retval = direct_io_worker(rw, iocb, inode, iov, offset,
 				nr_segs, blkbits, get_block, end_io,
-				submit_io, dio);
+				  submit_io, dio, &sdio);
 
 out:
 	return retval;
-- 
1.7.4.4



* [PATCH 02/11] DIO: Fix a wrong comment
From: Andi Kleen @ 2011-08-02  4:38 UTC (permalink / raw)
  To: linux-kernel; +Cc: linux-fsdevel, hch, Andi Kleen

From: Andi Kleen <ak@linux.intel.com>

There's nothing on the stack, even before my changes.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 fs/direct-io.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 02943e3..18e1a14 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -39,7 +39,7 @@
 
 /*
  * How many user pages to map in one call to get_user_pages().  This determines
- * the size of a structure on the stack.
+ * the size of a structure in the slab cache
  */
 #define DIO_PAGES	64
 
-- 
1.7.4.4



* [PATCH 03/11] DIO: Rearrange fields in dio/dio_submit to avoid holes.
From: Andi Kleen @ 2011-08-02  4:38 UTC (permalink / raw)
  To: linux-kernel; +Cc: linux-fsdevel, hch, Andi Kleen

From: Andi Kleen <ak@linux.intel.com>

Fix most problems reported by pahole.

There is still a weird 104-byte hole after map_bh. I'm not sure what
causes this.
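
For background, a toy example (not the dio layout itself) of the kind of
hole pahole reports: on a 64-bit build fields are padded to their natural
alignment, so sorting members by alignment removes the padding.

	/* 4 + 4(hole) + 8 + 4 + 4(hole) + 8 = 32 bytes on 64-bit */
	struct before { int a; void *p; int b; void *q; };

	/* same members sorted by alignment: 8 + 8 + 4 + 4 = 24 bytes */
	struct after { void *p; void *q; int a; int b; };

Running pahole on the object file (e.g. pahole -C dio fs/direct-io.o)
prints the remaining holes per structure.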

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 fs/direct-io.c |   13 ++++++-------
 1 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 18e1a14..8f23db3 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -73,10 +73,10 @@ struct dio_submit {
 	sector_t block_in_file;		/* Current offset into the underlying
 					   file in dio_block units. */
 	unsigned blocks_available;	/* At block_in_file.  changes */
+	int reap_counter;		/* rate limit reaping */
 	sector_t final_block_in_request;/* doesn't change */
 	unsigned first_block_in_page;	/* doesn't change, Used only once */
 	int boundary;			/* prev block is at a boundary */
-	int reap_counter;		/* rate limit reaping */
 	get_block_t *get_block;		/* block mapping function */
 	dio_submit_t *submit_io;	/* IO submition function */	
 
@@ -114,27 +114,26 @@ struct dio_submit {
 /* dio_state communicated between submission path and end_io */
 struct dio {
 	int flags;			/* doesn't change */
-	struct inode *inode;
 	int rw;
+	struct inode *inode;
 	loff_t i_size;			/* i_size when submitted */
 	dio_iodone_t *end_io;		/* IO completion function */
-	struct buffer_head map_bh;	/* last get_block() result */
 
 
 	/* BIO completion state */
 	spinlock_t bio_lock;		/* protects BIO fields below */
+	int page_errors;		/* errno from get_user_pages() */
+	int is_async;			/* is IO async ? */
+	int io_error;			/* IO error in completion path */
 	unsigned long refcount;		/* direct_io_worker() and bios */
 	struct bio *bio_list;		/* singly linked via bi_private */
 	struct task_struct *waiter;	/* waiting task (NULL if none) */
 
 	/* AIO related stuff */
 	struct kiocb *iocb;		/* kiocb */
-	int is_async;			/* is IO async ? */
-	int io_error;			/* IO error in completion path */
 	ssize_t result;                 /* IO result */
 
-	int page_errors;		/* errno from get_user_pages() */
-
+	struct buffer_head map_bh;	/* last get_block() result */
 	/*
 	 * pages[] (and any fields placed after it) are not zeroed out at
 	 * allocation time.  Don't add new fields after pages[] unless you
-- 
1.7.4.4



* [PATCH 04/11] DIO: Use a slab cache for struct dio
From: Andi Kleen @ 2011-08-02  4:38 UTC (permalink / raw)
  To: linux-kernel; +Cc: linux-fsdevel, hch, Andi Kleen

From: Andi Kleen <ak@linux.intel.com>

A direct slab call is slightly faster than kmalloc and can be cached
better per CPU. It also avoids rounding the allocation up to the next
kmalloc size.

In addition this enforces cache line alignment for struct dio to avoid
any false sharing.
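
For context, a sketch of the difference (the sizes here are made up for
illustration): kmalloc() serves requests from fixed-size generic caches
(mostly powers of two), while a dedicated cache sizes and aligns objects
exactly.

	/* a hypothetical 648-byte struct comes out of kmalloc-1024 */
	dio = kmalloc(sizeof(*dio), GFP_KERNEL);

	/* dedicated cache: objects sized to struct dio and, with the
	 * alignment added in this patch, starting on their own cache
	 * line */
	dio_cache = KMEM_CACHE(dio, SLAB_PANIC);
	dio = kmem_cache_alloc(dio_cache, GFP_KERNEL);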

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 fs/direct-io.c |   19 ++++++++++++++-----
 1 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 8f23db3..e91ac83 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -140,7 +140,9 @@ struct dio {
 	 * wish that they not be zeroed.
 	 */
 	struct page *pages[DIO_PAGES];	/* page buffer */	
-};
+} ____cacheline_aligned_in_smp;
+
+static struct kmem_cache *dio_cache __read_mostly;
 
 static void __inode_dio_wait(struct inode *inode)
 {
@@ -330,7 +332,7 @@ static void dio_bio_end_aio(struct bio *bio, int error)
 
 	if (remaining == 0) {
 		dio_complete(dio, dio->iocb->ki_pos, 0, true);
-		kfree(dio);
+		kmem_cache_free(dio_cache, dio);
 	}
 }
 
@@ -1178,7 +1180,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 
 	if (ret2 == 0) {
 		ret = dio_complete(dio, offset, ret, false);
-		kfree(dio);
+		kmem_cache_free(dio_cache, dio);
 	} else
 		BUG_ON(ret != -EIOCBQUEUED);
 
@@ -1254,7 +1256,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	if (rw == READ && end == offset)
 		return 0;
 
-	dio = kmalloc(sizeof(*dio), GFP_KERNEL);
+	dio = kmem_cache_alloc(dio_cache, GFP_KERNEL);
 	retval = -ENOMEM;
 	if (!dio)
 		goto out;
@@ -1278,7 +1280,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 							      end - 1);
 			if (retval) {
 				mutex_unlock(&inode->i_mutex);
-				kfree(dio);
+				kmem_cache_free(dio_cache, dio);
 				goto out;
 			}
 		}
@@ -1306,3 +1308,10 @@ out:
 	return retval;
 }
 EXPORT_SYMBOL(__blockdev_direct_IO);
+
+static __init int dio_init(void)
+{
+	dio_cache = KMEM_CACHE(dio, SLAB_PANIC);
+	return 0;
+}
+module_init(dio_init)
-- 
1.7.4.4



* [PATCH 05/11] DIO: Separate map_bh from dio v2
From: Andi Kleen @ 2011-08-02  4:38 UTC (permalink / raw)
  To: linux-kernel; +Cc: linux-fsdevel, hch, Andi Kleen

From: Andi Kleen <ak@linux.intel.com>

Only a single b_private field in the map_bh buffer head is needed after
the submission path. Move map_bh out of struct dio, so that the rest of
it is not stored in the long-lived slab object.

This avoids the weird 104-byte hole in struct dio_submit which also had
to be memset early.
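
In outline (a condensed sketch of the change below): map_bh now lives on
the stack of the top-level function, and only its b_private pointer is
copied into the long-lived dio for the completion path.

	struct buffer_head map_bh = { 0, };	/* on stack, submission only */

	ret = (*sdio->get_block)(dio->inode, fs_startblk, &map_bh, create);
	dio->private = map_bh.b_private;	/* all that end_io needs */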

v2: b_private->private (hch)
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 fs/direct-io.c |   65 +++++++++++++++++++++++++++++++-------------------------
 1 files changed, 36 insertions(+), 29 deletions(-)

diff --git a/fs/direct-io.c b/fs/direct-io.c
index e91ac83..172ec77 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -119,6 +119,7 @@ struct dio {
 	loff_t i_size;			/* i_size when submitted */
 	dio_iodone_t *end_io;		/* IO completion function */
 
+	void *private;			/* copy from map_bh.b_private */
 
 	/* BIO completion state */
 	spinlock_t bio_lock;		/* protects BIO fields below */
@@ -133,7 +134,6 @@ struct dio {
 	struct kiocb *iocb;		/* kiocb */
 	ssize_t result;                 /* IO result */
 
-	struct buffer_head map_bh;	/* last get_block() result */
 	/*
 	 * pages[] (and any fields placed after it) are not zeroed out at
 	 * allocation time.  Don't add new fields after pages[] unless you
@@ -301,7 +301,7 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is
 
 	if (dio->end_io && dio->result) {
 		dio->end_io(dio->iocb, offset, transferred,
-			    dio->map_bh.b_private, ret, is_async);
+			    dio->private, ret, is_async);
 	} else {
 		if (is_async)
 			aio_complete(dio->iocb, ret, 0);
@@ -574,10 +574,10 @@ static int dio_bio_reap(struct dio *dio, struct dio_submit *sdio)
  * buffer_mapped().  However the direct-io code will only process holes one
  * block at a time - it will repeatedly call get_block() as it walks the hole.
  */
-static int get_more_blocks(struct dio *dio, struct dio_submit *sdio)
+static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
+			   struct buffer_head *map_bh)
 {
 	int ret;
-	struct buffer_head *map_bh = &dio->map_bh;
 	sector_t fs_startblk;	/* Into file, in filesystem-sized blocks */
 	unsigned long fs_count;	/* Number of filesystem-sized blocks */
 	unsigned long dio_count;/* Number of dio_block-sized blocks */
@@ -621,6 +621,9 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio)
 
 		ret = (*sdio->get_block)(dio->inode, fs_startblk,
 						map_bh, create);
+
+		/* Store for completion */
+		dio->private = map_bh->b_private;
 	}
 	return ret;
 }
@@ -629,7 +632,7 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio)
  * There is no bio.  Make one now.
  */
 static int dio_new_bio(struct dio *dio, struct dio_submit *sdio, 
-		       sector_t start_sector)
+		       sector_t start_sector, struct buffer_head *map_bh)
 {
 	sector_t sector;
 	int ret, nr_pages;
@@ -638,10 +641,10 @@ static int dio_new_bio(struct dio *dio, struct dio_submit *sdio,
 	if (ret)
 		goto out;
 	sector = start_sector << (sdio->blkbits - 9);
-	nr_pages = min(sdio->pages_in_io, bio_get_nr_vecs(dio->map_bh.b_bdev));
+	nr_pages = min(sdio->pages_in_io, bio_get_nr_vecs(map_bh->b_bdev));
 	nr_pages = min(nr_pages, BIO_MAX_PAGES);
 	BUG_ON(nr_pages <= 0);
-	dio_bio_alloc(dio, sdio, dio->map_bh.b_bdev, sector, nr_pages);
+	dio_bio_alloc(dio, sdio, map_bh->b_bdev, sector, nr_pages);
 	sdio->boundary = 0;
 out:
 	return ret;
@@ -686,7 +689,8 @@ static int dio_bio_add_page(struct dio *dio, struct dio_submit *sdio)
  * The caller of this function is responsible for removing cur_page from the
  * dio, and for dropping the refcount which came from that presence.
  */
-static int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio)
+static int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio,
+			     struct buffer_head *map_bh)
 {
 	int ret = 0;
 
@@ -721,14 +725,14 @@ static int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio)
 	}
 
 	if (sdio->bio == NULL) {
-		ret = dio_new_bio(dio, sdio, sdio->cur_page_block);
+		ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh);
 		if (ret)
 			goto out;
 	}
 
 	if (dio_bio_add_page(dio, sdio) != 0) {
 		dio_bio_submit(dio, sdio);
-		ret = dio_new_bio(dio, sdio, sdio->cur_page_block);
+		ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh);
 		if (ret == 0) {
 			ret = dio_bio_add_page(dio, sdio);
 			BUG_ON(ret != 0);
@@ -757,7 +761,8 @@ out:
  */
 static int
 submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page,
-		unsigned offset, unsigned len, sector_t blocknr)
+		    unsigned offset, unsigned len, sector_t blocknr,
+		    struct buffer_head *map_bh)
 {
 	int ret = 0;
 
@@ -782,7 +787,7 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page,
 		 * avoid metadata seeks.
 		 */
 		if (sdio->boundary) {
-			ret = dio_send_cur_page(dio, sdio);
+			ret = dio_send_cur_page(dio, sdio, map_bh);
 			page_cache_release(sdio->cur_page);
 			sdio->cur_page = NULL;
 		}
@@ -793,7 +798,7 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page,
 	 * If there's a deferred page already there then send it.
 	 */
 	if (sdio->cur_page) {
-		ret = dio_send_cur_page(dio, sdio);
+		ret = dio_send_cur_page(dio, sdio, map_bh);
 		page_cache_release(sdio->cur_page);
 		sdio->cur_page = NULL;
 		if (ret)
@@ -815,16 +820,16 @@ out:
  * file blocks.  Only called for S_ISREG files - blockdevs do not set
  * buffer_new
  */
-static void clean_blockdev_aliases(struct dio *dio)
+static void clean_blockdev_aliases(struct dio *dio, struct buffer_head *map_bh)
 {
 	unsigned i;
 	unsigned nblocks;
 
-	nblocks = dio->map_bh.b_size >> dio->inode->i_blkbits;
+	nblocks = map_bh->b_size >> dio->inode->i_blkbits;
 
 	for (i = 0; i < nblocks; i++) {
-		unmap_underlying_metadata(dio->map_bh.b_bdev,
-					dio->map_bh.b_blocknr + i);
+		unmap_underlying_metadata(map_bh->b_bdev,
+					  map_bh->b_blocknr + i);
 	}
 }
 
@@ -837,7 +842,8 @@ static void clean_blockdev_aliases(struct dio *dio)
  * `end' is zero if we're doing the start of the IO, 1 at the end of the
  * IO.
  */
-static void dio_zero_block(struct dio *dio, struct dio_submit *sdio, int end)
+static void dio_zero_block(struct dio *dio, struct dio_submit *sdio, int end,
+			   struct buffer_head *map_bh)
 {
 	unsigned dio_blocks_per_fs_block;
 	unsigned this_chunk_blocks;	/* In dio_blocks */
@@ -845,7 +851,7 @@ static void dio_zero_block(struct dio *dio, struct dio_submit *sdio, int end)
 	struct page *page;
 
 	sdio->start_zero_done = 1;
-	if (!sdio->blkfactor || !buffer_new(&dio->map_bh))
+	if (!sdio->blkfactor || !buffer_new(map_bh))
 		return;
 
 	dio_blocks_per_fs_block = 1 << sdio->blkfactor;
@@ -865,7 +871,7 @@ static void dio_zero_block(struct dio *dio, struct dio_submit *sdio, int end)
 
 	page = ZERO_PAGE(0);
 	if (submit_page_section(dio, sdio, page, 0, this_chunk_bytes, 
-				sdio->next_block_for_io))
+				sdio->next_block_for_io, map_bh))
 		return;
 
 	sdio->next_block_for_io += this_chunk_blocks;
@@ -887,13 +893,13 @@ static void dio_zero_block(struct dio *dio, struct dio_submit *sdio, int end)
  * it should set b_size to PAGE_SIZE or more inside get_block().  This gives
  * fine alignment but still allows this function to work in PAGE_SIZE units.
  */
-static int do_direct_IO(struct dio *dio, struct dio_submit *sdio)
+static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,
+			struct buffer_head *map_bh)
 {
 	const unsigned blkbits = sdio->blkbits;
 	const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
 	struct page *page;
 	unsigned block_in_page;
-	struct buffer_head *map_bh = &dio->map_bh;
 	int ret = 0;
 
 	/* The I/O can start at any block offset within the first page */
@@ -919,7 +925,7 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio)
 				unsigned long blkmask;
 				unsigned long dio_remainder;
 
-				ret = get_more_blocks(dio, sdio);
+				ret = get_more_blocks(dio, sdio, map_bh);
 				if (ret) {
 					page_cache_release(page);
 					goto out;
@@ -932,7 +938,7 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio)
 				sdio->next_block_for_io =
 					map_bh->b_blocknr << sdio->blkfactor;
 				if (buffer_new(map_bh))
-					clean_blockdev_aliases(dio);
+					clean_blockdev_aliases(dio, map_bh);
 
 				if (!sdio->blkfactor)
 					goto do_holes;
@@ -991,7 +997,7 @@ do_holes:
 			 * we must zero out the start of this block.
 			 */
 			if (unlikely(sdio->blkfactor && !sdio->start_zero_done))
-				dio_zero_block(dio, sdio, 0);
+				dio_zero_block(dio, sdio, 0, map_bh);
 
 			/*
 			 * Work out, in this_chunk_blocks, how much disk we
@@ -1009,7 +1015,7 @@ do_holes:
 
 			sdio->boundary = buffer_boundary(map_bh);
 			ret = submit_page_section(dio, sdio, page, offset_in_page,
-				this_chunk_bytes, sdio->next_block_for_io);
+				  this_chunk_bytes, sdio->next_block_for_io, map_bh);
 			if (ret) {
 				page_cache_release(page);
 				goto out;
@@ -1045,6 +1051,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 	ssize_t ret = 0;
 	ssize_t ret2;
 	size_t bytes;
+	struct buffer_head map_bh = { 0, };
 
 	dio->inode = inode;
 	dio->rw = rw;
@@ -1099,7 +1106,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 		sdio->total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
 		sdio->curr_user_address = user_addr;
 	
-		ret = do_direct_IO(dio, sdio);
+		ret = do_direct_IO(dio, sdio, &map_bh);
 
 		dio->result += iov[seg].iov_len -
 			((sdio->final_block_in_request - sdio->block_in_file) <<
@@ -1122,10 +1129,10 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 	 * There may be some unwritten disk at the end of a part-written
 	 * fs-block-sized block.  Go zero that now.
 	 */
-	dio_zero_block(dio, sdio, 1);
+	dio_zero_block(dio, sdio, 1, &map_bh);
 
 	if (sdio->cur_page) {
-		ret2 = dio_send_cur_page(dio, sdio);
+		ret2 = dio_send_cur_page(dio, sdio, &map_bh);
 		if (ret == 0)
 			ret = ret2;
 		page_cache_release(sdio->cur_page);
-- 
1.7.4.4



* [PATCH 06/11] DIO: Inline the complete submission path v2
From: Andi Kleen @ 2011-08-02  4:38 UTC (permalink / raw)
  To: linux-kernel; +Cc: linux-fsdevel, hch, Andi Kleen

From: Andi Kleen <ak@linux.intel.com>

Add inlines to all the submission path functions. While this increases
code size, it also gives gcc a lot of optimization opportunities
in this critical hot path.

In particular, together with some other changes, this
allows gcc to get rid of the unnecessary clearing of
sdio at the beginning and to optimize the messy parameter passing.
Not inlining any function that takes an sdio parameter
would break these optimizations, because they cannot be done once the
address of the structure is taken.
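
A toy example of the escape problem (illustrative only, not kernel code):

	struct state { int n; };

	static inline void bump(struct state *s) { s->n++; }

	int f(void)
	{
		struct state v = { 0 };
		bump(&v);	/* inlined: &v never escapes, so v.n can
				   stay in a register and the whole body
				   folds to "return 1" */
		return v.n;	/* an out-of-line bump() would force v
				   into addressable stack memory */
	}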

Note that benefits are only seen with CONFIG_OPTIMIZE_INLINING
and CONFIG_CC_OPTIMIZE_FOR_SIZE both set to off.

This gives about 2.2% improvement on a large database benchmark
with a high IOPS rate.

v2: Add a comment
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 fs/direct-io.c |   31 ++++++++++++++++++-------------
 1 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 172ec77..103a6fc 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -199,7 +199,7 @@ static inline unsigned dio_pages_present(struct dio *dio, struct dio_submit *sdi
 /*
  * Go grab and pin some userspace pages.   Typically we'll get 64 at a time.
  */
-static int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
+static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
 {
 	int ret;
 	int nr_pages;
@@ -245,7 +245,7 @@ out:
  * decent number of pages, less frequently.  To provide nicer use of the
  * L1 cache.
  */
-static struct page *dio_get_page(struct dio *dio, struct dio_submit *sdio)
+static inline struct page *dio_get_page(struct dio *dio, struct dio_submit *sdio)
 {
 	if (dio_pages_present(dio, sdio) == 0) {
 		int ret;
@@ -376,7 +376,7 @@ void dio_end_io(struct bio *bio, int error)
 }
 EXPORT_SYMBOL_GPL(dio_end_io);
 
-static void
+static void inline
 dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
 	      struct block_device *bdev,
 	      sector_t first_sector, int nr_vecs)
@@ -407,7 +407,7 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
  *
  * bios hold a dio reference between submit_bio and ->end_io.
  */
-static void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
+static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
 {
 	struct bio *bio = sdio->bio;
 	unsigned long flags;
@@ -435,7 +435,7 @@ static void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
 /*
  * Release any resources in case of a failure
  */
-static void dio_cleanup(struct dio *dio, struct dio_submit *sdio)
+static inline void dio_cleanup(struct dio *dio, struct dio_submit *sdio)
 {
 	while (dio_pages_present(dio, sdio))
 		page_cache_release(dio_get_page(dio, sdio));
@@ -528,7 +528,7 @@ static void dio_await_completion(struct dio *dio)
  *
  * This also helps to limit the peak amount of pinned userspace memory.
  */
-static int dio_bio_reap(struct dio *dio, struct dio_submit *sdio)
+static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio)
 {
 	int ret = 0;
 
@@ -631,7 +631,7 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
 /*
  * There is no bio.  Make one now.
  */
-static int dio_new_bio(struct dio *dio, struct dio_submit *sdio, 
+static inline int dio_new_bio(struct dio *dio, struct dio_submit *sdio, 
 		       sector_t start_sector, struct buffer_head *map_bh)
 {
 	sector_t sector;
@@ -657,7 +657,7 @@ out:
  *
  * Return zero on success.  Non-zero means the caller needs to start a new BIO.
  */
-static int dio_bio_add_page(struct dio *dio, struct dio_submit *sdio)
+static inline int dio_bio_add_page(struct dio *dio, struct dio_submit *sdio)
 {
 	int ret;
 
@@ -689,8 +689,8 @@ static int dio_bio_add_page(struct dio *dio, struct dio_submit *sdio)
  * The caller of this function is responsible for removing cur_page from the
  * dio, and for dropping the refcount which came from that presence.
  */
-static int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio,
-			     struct buffer_head *map_bh)
+static inline int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio,
+	  		    	    struct buffer_head *map_bh)
 {
 	int ret = 0;
 
@@ -759,7 +759,7 @@ out:
  * If that doesn't work out then we put the old page into the bio and add this
  * page to the dio instead.
  */
-static int
+static inline int
 submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page,
 		    unsigned offset, unsigned len, sector_t blocknr,
 		    struct buffer_head *map_bh)
@@ -842,7 +842,7 @@ static void clean_blockdev_aliases(struct dio *dio, struct buffer_head *map_bh)
  * `end' is zero if we're doing the start of the IO, 1 at the end of the
  * IO.
  */
-static void dio_zero_block(struct dio *dio, struct dio_submit *sdio, int end,
+static inline void dio_zero_block(struct dio *dio, struct dio_submit *sdio, int end,
 			   struct buffer_head *map_bh)
 {
 	unsigned dio_blocks_per_fs_block;
@@ -1039,7 +1039,7 @@ out:
 	return ret;
 }
 
-static ssize_t
+static inline ssize_t
 direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, 
 	const struct iovec *iov, loff_t offset, unsigned long nr_segs, 
 	unsigned blkbits, get_block_t get_block, dio_iodone_t end_io,
@@ -1213,6 +1213,11 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
  * expected that filesystem provide exclusion between new direct I/O
  * and truncates.  For DIO_LOCKING filesystems this is done by i_mutex,
  * but other filesystems need to take care of this on their own.
+ * 
+ * NOTE: if you pass "sdio" to anything by pointer make sure that function
+ * is always inlined. Otherwise gcc is unable to split the structure into
+ * individual fields and will generate much worse code. 
+ * This is important for the whole file.
  */
 ssize_t
 __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
-- 
1.7.4.4



* [PATCH 07/11] DIO: Merge direct_io_walker into __blockdev_direct_IO
From: Andi Kleen @ 2011-08-02  4:38 UTC (permalink / raw)
  To: linux-kernel; +Cc: linux-fsdevel, hch, Andi Kleen

From: Andi Kleen <ak@linux.intel.com>

This doesn't change anything for the compiler, but hch thought it would
make the code clearer.

I moved the reference counting into its own little inline.
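
The helper (drop_refcount() in the diff below) just drops one reference
under the bio lock and reports whether it was the last; the caller then
decides between completing inline and leaving completion to the bios:

	spin_lock_irqsave(&dio->bio_lock, flags);
	ret2 = --dio->refcount;
	spin_unlock_irqrestore(&dio->bio_lock, flags);
	/* ret2 == 0: last reference, complete the dio and free it */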

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 fs/direct-io.c |  271 +++++++++++++++++++++++++++-----------------------------
 1 files changed, 132 insertions(+), 139 deletions(-)

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 103a6fc..669b667 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1039,136 +1039,10 @@ out:
 	return ret;
 }
 
-static inline ssize_t
-direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, 
-	const struct iovec *iov, loff_t offset, unsigned long nr_segs, 
-	unsigned blkbits, get_block_t get_block, dio_iodone_t end_io,
-	dio_submit_t submit_io, struct dio *dio, struct dio_submit *sdio)
+static inline int drop_refcount(struct dio *dio)
 {
-	unsigned long user_addr; 
+	int ret2;
 	unsigned long flags;
-	int seg;
-	ssize_t ret = 0;
-	ssize_t ret2;
-	size_t bytes;
-	struct buffer_head map_bh = { 0, };
-
-	dio->inode = inode;
-	dio->rw = rw;
-	sdio->blkbits = blkbits;
-	sdio->blkfactor = inode->i_blkbits - blkbits;
-	sdio->block_in_file = offset >> blkbits;
-
-	sdio->get_block = get_block;
-	dio->end_io = end_io;
-	sdio->submit_io = submit_io;
-	sdio->final_block_in_bio = -1;
-	sdio->next_block_for_io = -1;
-
-	dio->iocb = iocb;
-	dio->i_size = i_size_read(inode);
-
-	spin_lock_init(&dio->bio_lock);
-	dio->refcount = 1;
-
-	/*
-	 * In case of non-aligned buffers, we may need 2 more
-	 * pages since we need to zero out first and last block.
-	 */
-	if (unlikely(sdio->blkfactor))
-		sdio->pages_in_io = 2;
-
-	for (seg = 0; seg < nr_segs; seg++) {
-		user_addr = (unsigned long)iov[seg].iov_base;
-		sdio->pages_in_io +=
-			((user_addr+iov[seg].iov_len +PAGE_SIZE-1)/PAGE_SIZE
-				- user_addr/PAGE_SIZE);
-	}
-
-	for (seg = 0; seg < nr_segs; seg++) {
-		user_addr = (unsigned long)iov[seg].iov_base;
-		sdio->size += bytes = iov[seg].iov_len;
-
-		/* Index into the first page of the first block */
-		sdio->first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits;
-		sdio->final_block_in_request = sdio->block_in_file +
-						(bytes >> blkbits);
-		/* Page fetching state */
-		sdio->head = 0;
-		sdio->tail = 0;
-		sdio->curr_page = 0;
-
-		sdio->total_pages = 0;
-		if (user_addr & (PAGE_SIZE-1)) {
-			sdio->total_pages++;
-			bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1));
-		}
-		sdio->total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
-		sdio->curr_user_address = user_addr;
-	
-		ret = do_direct_IO(dio, sdio, &map_bh);
-
-		dio->result += iov[seg].iov_len -
-			((sdio->final_block_in_request - sdio->block_in_file) <<
-					blkbits);
-
-		if (ret) {
-			dio_cleanup(dio, sdio);
-			break;
-		}
-	} /* end iovec loop */
-
-	if (ret == -ENOTBLK) {
-		/*
-		 * The remaining part of the request will be
-		 * be handled by buffered I/O when we return
-		 */
-		ret = 0;
-	}
-	/*
-	 * There may be some unwritten disk at the end of a part-written
-	 * fs-block-sized block.  Go zero that now.
-	 */
-	dio_zero_block(dio, sdio, 1, &map_bh);
-
-	if (sdio->cur_page) {
-		ret2 = dio_send_cur_page(dio, sdio, &map_bh);
-		if (ret == 0)
-			ret = ret2;
-		page_cache_release(sdio->cur_page);
-		sdio->cur_page = NULL;
-	}
-	if (sdio->bio)
-		dio_bio_submit(dio, sdio);
-
-	/*
-	 * It is possible that, we return short IO due to end of file.
-	 * In that case, we need to release all the pages we got hold on.
-	 */
-	dio_cleanup(dio, sdio);
-
-	/*
-	 * All block lookups have been performed. For READ requests
-	 * we can let i_mutex go now that its achieved its purpose
-	 * of protecting us from looking up uninitialized blocks.
-	 */
-	if (rw == READ && (dio->flags & DIO_LOCKING))
-		mutex_unlock(&dio->inode->i_mutex);
-
-	/*
-	 * The only time we want to leave bios in flight is when a successful
-	 * partial aio read or full aio write have been setup.  In that case
-	 * bio completion will call aio_complete.  The only time it's safe to
-	 * call aio_complete is when we return -EIOCBQUEUED, so we key on that.
-	 * This had *better* be the only place that raises -EIOCBQUEUED.
-	 */
-	BUG_ON(ret == -EIOCBQUEUED);
-	if (dio->is_async && ret == 0 && dio->result &&
-	    ((rw & READ) || (dio->result == sdio->size)))
-		ret = -EIOCBQUEUED;
-
-	if (ret != -EIOCBQUEUED)
-		dio_await_completion(dio);
 
 	/*
 	 * Sync will always be dropping the final ref and completing the
@@ -1184,14 +1058,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 	spin_lock_irqsave(&dio->bio_lock, flags);
 	ret2 = --dio->refcount;
 	spin_unlock_irqrestore(&dio->bio_lock, flags);
-
-	if (ret2 == 0) {
-		ret = dio_complete(dio, offset, ret, false);
-		kmem_cache_free(dio_cache, dio);
-	} else
-		BUG_ON(ret != -EIOCBQUEUED);
-
-	return ret;
+	return ret2;
 }
 
 /*
@@ -1235,6 +1102,9 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	loff_t end = offset;
 	struct dio *dio;
 	struct dio_submit sdio = { 0, };
+	unsigned long user_addr; 
+	size_t bytes;
+	struct buffer_head map_bh = { 0, };
 
 	if (rw & WRITE)
 		rw = WRITE_ODIRECT;
@@ -1312,9 +1182,132 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) &&
 		(end > i_size_read(inode)));
 
-	retval = direct_io_worker(rw, iocb, inode, iov, offset,
-				nr_segs, blkbits, get_block, end_io,
-				  submit_io, dio, &sdio);
+	retval = 0;
+
+	dio->inode = inode;
+	dio->rw = rw;
+	sdio.blkbits = blkbits;
+	sdio.blkfactor = inode->i_blkbits - blkbits;
+	sdio.block_in_file = offset >> blkbits;
+
+	sdio.get_block = get_block;
+	dio->end_io = end_io;
+	sdio.submit_io = submit_io;
+	sdio.final_block_in_bio = -1;
+	sdio.next_block_for_io = -1;
+
+	dio->iocb = iocb;
+	dio->i_size = i_size_read(inode);
+
+	spin_lock_init(&dio->bio_lock);
+	dio->refcount = 1;
+
+	/*
+	 * In case of non-aligned buffers, we may need 2 more
+	 * pages since we need to zero out first and last block.
+	 */
+	if (unlikely(sdio.blkfactor))
+		sdio.pages_in_io = 2;
+
+	for (seg = 0; seg < nr_segs; seg++) {
+		user_addr = (unsigned long)iov[seg].iov_base;
+		sdio.pages_in_io +=
+			((user_addr+iov[seg].iov_len +PAGE_SIZE-1)/PAGE_SIZE
+				- user_addr/PAGE_SIZE);
+	}
+
+	for (seg = 0; seg < nr_segs; seg++) {
+		user_addr = (unsigned long)iov[seg].iov_base;
+		sdio.size += bytes = iov[seg].iov_len;
+
+		/* Index into the first page of the first block */
+		sdio.first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits;
+		sdio.final_block_in_request = sdio.block_in_file +
+						(bytes >> blkbits);
+		/* Page fetching state */
+		sdio.head = 0;
+		sdio.tail = 0;
+		sdio.curr_page = 0;
+
+		sdio.total_pages = 0;
+		if (user_addr & (PAGE_SIZE-1)) {
+			sdio.total_pages++;
+			bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1));
+		}
+		sdio.total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
+		sdio.curr_user_address = user_addr;
+	
+		retval = do_direct_IO(dio, &sdio, &map_bh);
+
+		dio->result += iov[seg].iov_len -
+			((sdio.final_block_in_request - sdio.block_in_file) <<
+					blkbits);
+
+		if (retval) {
+			dio_cleanup(dio, &sdio);
+			break;
+		}
+	} /* end iovec loop */
+
+	if (retval == -ENOTBLK) {
+		/*
+		 * The remaining part of the request will be
+		 * be handled by buffered I/O when we return
+		 */
+		retval = 0;
+	}
+	/*
+	 * There may be some unwritten disk at the end of a part-written
+	 * fs-block-sized block.  Go zero that now.
+	 */
+	dio_zero_block(dio, &sdio, 1, &map_bh);
+
+	if (sdio.cur_page) {
+		ssize_t ret2;
+
+		ret2 = dio_send_cur_page(dio, &sdio, &map_bh);
+		if (retval == 0)
+			retval = ret2;
+		page_cache_release(sdio.cur_page);
+		sdio.cur_page = NULL;
+	}
+	if (sdio.bio)
+		dio_bio_submit(dio, &sdio);
+
+	/*
+	 * It is possible that, we return short IO due to end of file.
+	 * In that case, we need to release all the pages we got hold on.
+	 */
+	dio_cleanup(dio, &sdio);
+
+	/*
+	 * All block lookups have been performed. For READ requests
+	 * we can let i_mutex go now that its achieved its purpose
+	 * of protecting us from looking up uninitialized blocks.
+	 */
+	if (rw == READ && (dio->flags & DIO_LOCKING))
+		mutex_unlock(&dio->inode->i_mutex);
+
+	/*
+	 * The only time we want to leave bios in flight is when a successful
+	 * partial aio read or full aio write have been setup.  In that case
+	 * bio completion will call aio_complete.  The only time it's safe to
+	 * call aio_complete is when we return -EIOCBQUEUED, so we key on that.
+	 * This had *better* be the only place that raises -EIOCBQUEUED.
+	 */
+	BUG_ON(retval == -EIOCBQUEUED);
+	if (dio->is_async && retval == 0 && dio->result &&
+	    ((rw & READ) || (dio->result == sdio.size)))
+		retval = -EIOCBQUEUED;
+
+	if (retval != -EIOCBQUEUED)
+		dio_await_completion(dio);
+
+	if (drop_refcount(dio) == 0) {
+		retval = dio_complete(dio, offset, retval, false);
+		kmem_cache_free(dio_cache, dio);
+	} else
+		BUG_ON(retval != -EIOCBQUEUED);
 
 out:
 	return retval;
-- 
1.7.4.4
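
The page-accounting arithmetic in the merged loops rewards a worked
example. A self-contained userspace model (the buffer layout is made up;
the two formulas mirror the pages_in_io and total_pages code above):

#include <stdio.h>

#define PAGE_SIZE	4096UL

int main(void)
{
	/* a 10000-byte buffer starting 100 bytes into a page */
	unsigned long user_addr = 0x1000 + 100;
	unsigned long bytes = 10000;

	/* pages_in_io estimate: pages spanned by [addr, addr + len) */
	unsigned long pages = (user_addr + bytes + PAGE_SIZE - 1) / PAGE_SIZE
			      - user_addr / PAGE_SIZE;
	printf("pages spanned: %lu\n", pages);		/* 3 */

	/* total_pages: count the unaligned head page, then whole pages */
	unsigned long total = 0, rest = bytes;
	if (user_addr & (PAGE_SIZE - 1)) {
		total++;
		rest -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1));
	}
	total += (rest + PAGE_SIZE - 1) / PAGE_SIZE;
	printf("total pages: %lu\n", total);		/* 3 */
	return 0;
}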


^ permalink raw reply related	[flat|nested] 36+ messages in thread

* [PATCH 08/11] DIO: Remove unnecessary dio argument from dio_pages_present()
  2011-08-02  4:38 Updated direct IO optimization patchkit v2 Andi Kleen
                   ` (6 preceding siblings ...)
  2011-08-02  4:38 ` [PATCH 07/11] DIO: Merge direct_io_walker into __blockdev_direct_IO Andi Kleen
@ 2011-08-02  4:38 ` Andi Kleen
  2011-08-08 18:21   ` Jeff Moyer
  2011-08-02  4:38 ` [PATCH 09/11] DIO: Remove unused dio parameter from dio_bio_add_page Andi Kleen
                   ` (3 subsequent siblings)
  11 siblings, 1 reply; 36+ messages in thread
From: Andi Kleen @ 2011-08-02  4:38 UTC (permalink / raw)
  To: linux-kernel; +Cc: linux-fsdevel, hch, Andi Kleen

From: Andi Kleen <ak@linux.intel.com>

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 fs/direct-io.c |    8 ++++----
 1 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 669b667..3f5d1b4 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -191,7 +191,7 @@ EXPORT_SYMBOL_GPL(inode_dio_done);
 /*
  * How many pages are in the queue?
  */
-static inline unsigned dio_pages_present(struct dio *dio, struct dio_submit *sdio)
+static inline unsigned dio_pages_present(struct dio_submit *sdio)
 {
 	return sdio->tail - sdio->head;
 }
@@ -247,13 +247,13 @@ out:
  */
 static inline struct page *dio_get_page(struct dio *dio, struct dio_submit *sdio)
 {
-	if (dio_pages_present(dio, sdio) == 0) {
+	if (dio_pages_present(sdio) == 0) {
 		int ret;
 
 		ret = dio_refill_pages(dio, sdio);
 		if (ret)
 			return ERR_PTR(ret);
-		BUG_ON(dio_pages_present(dio, sdio) == 0);
+		BUG_ON(dio_pages_present(sdio) == 0);
 	}
 	return dio->pages[sdio->head++];
 }
@@ -437,7 +437,7 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
  */
 static inline void dio_cleanup(struct dio *dio, struct dio_submit *sdio)
 {
-	while (dio_pages_present(dio, sdio))
+	while (dio_pages_present(sdio))
 		page_cache_release(dio_get_page(dio, sdio));
 }
 
-- 
1.7.4.4


^ permalink raw reply related	[flat|nested] 36+ messages in thread

* [PATCH 09/11] DIO: Remove unused dio parameter from dio_bio_add_page
  2011-08-02  4:38 Updated direct IO optimization patchkit v2 Andi Kleen
                   ` (7 preceding siblings ...)
  2011-08-02  4:38 ` [PATCH 08/11] DIO: Remove unnecessary dio argument from dio_pages_present() Andi Kleen
@ 2011-08-02  4:38 ` Andi Kleen
  2011-08-08 18:21   ` Jeff Moyer
  2011-08-02  4:38 ` [PATCH 10/11] VFS: Cache request_queue in struct block_device Andi Kleen
                   ` (2 subsequent siblings)
  11 siblings, 1 reply; 36+ messages in thread
From: Andi Kleen @ 2011-08-02  4:38 UTC (permalink / raw)
  To: linux-kernel; +Cc: linux-fsdevel, hch, Andi Kleen

From: Andi Kleen <ak@linux.intel.com>

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 fs/direct-io.c |    6 +++---
 1 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 3f5d1b4..03bcc6f 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -657,7 +657,7 @@ out:
  *
  * Return zero on success.  Non-zero means the caller needs to start a new BIO.
  */
-static inline int dio_bio_add_page(struct dio *dio, struct dio_submit *sdio)
+static inline int dio_bio_add_page(struct dio_submit *sdio)
 {
 	int ret;
 
@@ -730,11 +730,11 @@ static inline int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio,
 			goto out;
 	}
 
-	if (dio_bio_add_page(dio, sdio) != 0) {
+	if (dio_bio_add_page(sdio) != 0) {
 		dio_bio_submit(dio, sdio);
 		ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh);
 		if (ret == 0) {
-			ret = dio_bio_add_page(dio, sdio);
+			ret = dio_bio_add_page(sdio);
 			BUG_ON(ret != 0);
 		}
 	}
-- 
1.7.4.4


^ permalink raw reply related	[flat|nested] 36+ messages in thread

* [PATCH 10/11] VFS: Cache request_queue in struct block_device
  2011-08-02  4:38 Updated direct IO optimization patchkit v2 Andi Kleen
                   ` (8 preceding siblings ...)
  2011-08-02  4:38 ` [PATCH 09/11] DIO: Remove unused dio parameter from dio_bio_add_page Andi Kleen
@ 2011-08-02  4:38 ` Andi Kleen
  2011-08-08 18:22   ` Jeff Moyer
  2011-08-18 19:42   ` Vivek Goyal
  2011-08-02  4:38 ` [PATCH 11/11] DIO: optimize cache misses in the submission path Andi Kleen
  2011-08-18 17:53 ` Updated direct IO optimization patchkit v2 Jeff Moyer
  11 siblings, 2 replies; 36+ messages in thread
From: Andi Kleen @ 2011-08-02  4:38 UTC (permalink / raw)
  To: linux-kernel; +Cc: linux-fsdevel, hch, Andi Kleen

From: Andi Kleen <ak@linux.intel.com>

This makes it possible to get from the inode to the request_queue
with one less cache miss. It is used in a follow-on optimization.

The lifetime of the pointer is the same as that of the gendisk.

This assumes that the queue will always stay the same in the
gendisk while it's visible to block_devices. I think that's safe, correct?

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 fs/block_dev.c     |    3 +++
 include/linux/fs.h |    2 ++
 2 files changed, 5 insertions(+), 0 deletions(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index f55aad4..5e07536 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1110,6 +1110,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 	mutex_lock_nested(&bdev->bd_mutex, for_part);
 	if (!bdev->bd_openers) {
 		bdev->bd_disk = disk;
+		bdev->bd_queue = disk->queue;
 		bdev->bd_contains = bdev;
 		if (!partno) {
 			struct backing_dev_info *bdi;
@@ -1130,6 +1131,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 					disk_put_part(bdev->bd_part);
 					bdev->bd_part = NULL;
 					bdev->bd_disk = NULL;
+					bdev->bd_queue = NULL;
 					mutex_unlock(&bdev->bd_mutex);
 					disk_unblock_events(disk);
 					module_put(disk->fops->owner);
@@ -1203,6 +1205,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 	disk_put_part(bdev->bd_part);
 	bdev->bd_disk = NULL;
 	bdev->bd_part = NULL;
+	bdev->bd_queue = NULL;
 	bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info);
 	if (bdev != bdev->bd_contains)
 		__blkdev_put(bdev->bd_contains, mode, 1);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f23bcb7..ea5cb4d 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -657,6 +657,7 @@ struct address_space {
 	 * must be enforced here for CRIS, to let the least significant bit
 	 * of struct page's "mapping" pointer be used for PAGE_MAPPING_ANON.
 	 */
+struct request_queue;
 
 struct block_device {
 	dev_t			bd_dev;  /* not a kdev_t - it's a search key */
@@ -679,6 +680,7 @@ struct block_device {
 	unsigned		bd_part_count;
 	int			bd_invalidated;
 	struct gendisk *	bd_disk;
+	struct request_queue *  bd_queue;
 	struct list_head	bd_list;
 	/*
 	 * Private data.  You must have bd_claim'ed the block_device
-- 
1.7.4.4
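
What the cached pointer buys, as a self-contained model (the struct
layouts are simplified stand-ins; only bd_disk and bd_queue correspond
to the patch): reaching the queue through the gendisk costs two
dependent loads, each a potential cache miss, while bd_queue is one
load that a caller can also prefetch early.

struct request_queue { int nr_requests; };
struct gendisk { struct request_queue *queue; };
struct block_device {
	struct gendisk *bd_disk;
	struct request_queue *bd_queue;		/* cached bd_disk->queue */
};

static int via_disk(struct block_device *b)
{
	return b->bd_disk->queue->nr_requests;	/* two dependent loads */
}

static int via_cache(struct block_device *b)
{
	return b->bd_queue->nr_requests;	/* one load */
}

int main(void)
{
	struct request_queue q = { 128 };
	struct gendisk d = { &q };
	struct block_device b = { &d, d.queue };

	return via_disk(&b) - via_cache(&b);	/* 0 */
}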


^ permalink raw reply related	[flat|nested] 36+ messages in thread

* [PATCH 11/11] DIO: optimize cache misses in the submission path
  2011-08-02  4:38 Updated direct IO optimization patchkit v2 Andi Kleen
                   ` (9 preceding siblings ...)
  2011-08-02  4:38 ` [PATCH 10/11] VFS: Cache request_queue in struct block_device Andi Kleen
@ 2011-08-02  4:38 ` Andi Kleen
  2011-08-08 18:43   ` Jeff Moyer
  2011-08-18 17:53 ` Updated direct IO optimization patchkit v2 Jeff Moyer
  11 siblings, 1 reply; 36+ messages in thread
From: Andi Kleen @ 2011-08-02  4:38 UTC (permalink / raw)
  To: linux-kernel; +Cc: linux-fsdevel, hch, Andi Kleen

From: Andi Kleen <ak@linux.intel.com>

Some investigation of a transaction processing workload showed that
a major consumer of cycles in __blockdev_direct_IO is the cache miss
while accessing the block size. This is because it has to walk
the chain from block_dev to gendisk to queue.

The block size is needed early on to check alignment and sizes.
The block device's block size is only consulted when the check against
the inode block size fails, but the costly block device state was
fetched unconditionally.

- Reorganize the code to only fetch block dev state when actually
needed.

Then do a prefetch on the block dev early in the direct IO path.
This is worth it because there is now a substantial amount of code
run before we actually touch the block dev.

- I also added some unlikelies to make it clear to the compiler
that the block device fetch code is not normally executed.

This gave a small but measurable improvement on a large database
benchmark (about 0.3%).

BTW the check code looks somewhat dubious to me: why is the block
device block size only checked when the inode block size check fails?
Can someone explain the difference between all these different block
sizes? Are they cheaper by the dozen?

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 fs/direct-io.c |   47 +++++++++++++++++++++++++++++++++++++----------
 1 files changed, 37 insertions(+), 10 deletions(-)

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 03bcc6f..c424b88 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1086,8 +1086,8 @@ static inline int drop_refcount(struct dio *dio)
  * individual fields and will generate much worse code. 
  * This is important for the whole file.
  */
-ssize_t
-__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
+static inline ssize_t
+do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	struct block_device *bdev, const struct iovec *iov, loff_t offset, 
 	unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
 	dio_submit_t submit_io,	int flags)
@@ -1096,7 +1096,6 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	size_t size;
 	unsigned long addr;
 	unsigned blkbits = inode->i_blkbits;
-	unsigned bdev_blkbits = 0;
 	unsigned blocksize_mask = (1 << blkbits) - 1;
 	ssize_t retval = -EINVAL;
 	loff_t end = offset;
@@ -1109,12 +1108,14 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	if (rw & WRITE)
 		rw = WRITE_ODIRECT;
 
-	if (bdev)
-		bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev));
+	/* 
+	 * Avoid references to bdev if not absolutely needed to give
+	 * the early prefetch in the caller enough time.
+	 */
 
-	if (offset & blocksize_mask) {
+	if (unlikely(offset & blocksize_mask)) {
 		if (bdev)
-			 blkbits = bdev_blkbits;
+			blkbits = blksize_bits(bdev_logical_block_size(bdev));
 		blocksize_mask = (1 << blkbits) - 1;
 		if (offset & blocksize_mask)
 			goto out;
@@ -1125,11 +1126,13 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 		addr = (unsigned long)iov[seg].iov_base;
 		size = iov[seg].iov_len;
 		end += size;
-		if ((addr & blocksize_mask) || (size & blocksize_mask))  {
+		if (unlikely((addr & blocksize_mask) || 
+			     (size & blocksize_mask)))  {
 			if (bdev)
-				 blkbits = bdev_blkbits;
+				 blkbits = blksize_bits(
+					 bdev_logical_block_size(bdev));
 			blocksize_mask = (1 << blkbits) - 1;
-			if ((addr & blocksize_mask) || (size & blocksize_mask))  
+			if ((addr & blocksize_mask) || (size & blocksize_mask))
 				goto out;
 		}
 	}
@@ -1312,6 +1315,30 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 out:
 	return retval;
 }
+
+ssize_t
+__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
+	struct block_device *bdev, const struct iovec *iov, loff_t offset, 
+	unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
+	dio_submit_t submit_io,	int flags)
+{
+	/* 
+	 * The block device state is needed in the end to finally
+	 * submit everything.  Since it's likely to be cache cold
+	 * prefetch it here as first thing to hide some of the
+	 * latency.
+	 * 
+	 * Attempt to prefetch the pieces we likely need later.
+	 */
+	prefetch(&bdev->bd_disk->part_tbl);
+	prefetch(bdev->bd_queue);
+	prefetch((char *)bdev->bd_queue + SMP_CACHE_BYTES);
+
+	return do_blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
+				     nr_segs, get_block, end_io,
+				     submit_io, flags);
+}
+
 EXPORT_SYMBOL(__blockdev_direct_IO);
 
 static __init int dio_init(void)
-- 
1.7.4.4
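
For reference, the two primitives the patch leans on are thin wrappers
around gcc builtins; roughly (simplified from the kernel's compiler.h
and prefetch.h, shown here as a runnable userspace sketch):

#include <stdio.h>

#define unlikely(x)	__builtin_expect(!!(x), 0)
static inline void prefetch(const void *x) { __builtin_prefetch(x); }

static int check(long offset, unsigned mask, const void *bdev_state)
{
	prefetch(bdev_state);		/* start the cache line load early */
	/* ...substantial work can run here while the line is fetched... */
	if (unlikely(offset & mask))	/* branch laid out as the cold path */
		return -1;
	return 0;
}

int main(void)
{
	char state[64];

	printf("%d\n", check(4096, 511, state));	/* aligned: 0 */
	return 0;
}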


^ permalink raw reply related	[flat|nested] 36+ messages in thread

* Re: [PATCH 01/11] DIO: Separate fields only used in the submission path from struct dio
  2011-08-02  4:38 ` [PATCH 01/11] DIO: Separate fields only used in the submission path from struct dio Andi Kleen
@ 2011-08-08 17:59   ` Jeff Moyer
  2011-08-08 19:43     ` Andi Kleen
  0 siblings, 1 reply; 36+ messages in thread
From: Jeff Moyer @ 2011-08-08 17:59 UTC (permalink / raw)
  To: Andi Kleen; +Cc: linux-kernel, linux-fsdevel, hch, Andi Kleen

Andi Kleen <andi@firstfloor.org> writes:

> From: Andi Kleen <ak@linux.intel.com>
>
> This large, but largely mechanic, patch moves all fields in struct dio
> that are only used in the submission path into a separate on stack
> data structure. This has the advantage that the memory is very likely
> cache hot, which is not guaranteed for memory fresh out of kmalloc.
>
> This also gives gcc more optimization potential because it can easier
> determine that there are no external aliases for these variables.
>
> The sdio initialization is a initialization now instead of memset.
> This allows gcc to break sdio into individual fields and optimize
> away unnecessary zeroing (after all the functions are inlined)
>
> Signed-off-by: Andi Kleen <ak@linux.intel.com>

The pages array is only used on submission.  Why did it land in the
struct dio?  I would also think you'd want to keep it close to
curr_page, head, tail.

Cheers,
Jeff

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 02/11] DIO: Fix a wrong comment
  2011-08-02  4:38 ` [PATCH 02/11] DIO: Fix a wrong comment Andi Kleen
@ 2011-08-08 17:59   ` Jeff Moyer
  0 siblings, 0 replies; 36+ messages in thread
From: Jeff Moyer @ 2011-08-08 17:59 UTC (permalink / raw)
  To: Andi Kleen; +Cc: linux-kernel, linux-fsdevel, hch, Andi Kleen

Andi Kleen <andi@firstfloor.org> writes:

> From: Andi Kleen <ak@linux.intel.com>
>
> There's nothing on the stack, even before my changes.

Very old comment.

Acked-by: Jeff Moyer <jmoyer@redhat.com>

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 03/11] DIO: Rearrange fields in dio/dio_submit to avoid holes.
  2011-08-02  4:38 ` [PATCH 03/11] DIO: Rearrange fields in dio/dio_submit to avoid holes Andi Kleen
@ 2011-08-08 18:00   ` Jeff Moyer
  0 siblings, 0 replies; 36+ messages in thread
From: Jeff Moyer @ 2011-08-08 18:00 UTC (permalink / raw)
  To: Andi Kleen; +Cc: linux-kernel, linux-fsdevel, hch, Andi Kleen

Andi Kleen <andi@firstfloor.org> writes:

> From: Andi Kleen <ak@linux.intel.com>
>
> Fix most problems reported by pahole.
>
> There is still a weird 104 byte hole after map_bh. I'm not sure what
> causes this.
>
> Signed-off-by: Andi Kleen <ak@linux.intel.com>

Acked-by: Jeff Moyer <jmoyer@redhat.com>

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 04/11] DIO: Use a slab cache for struct dio
  2011-08-02  4:38 ` [PATCH 04/11] DIO: Use a slab cache for struct dio Andi Kleen
@ 2011-08-08 18:01   ` Jeff Moyer
  0 siblings, 0 replies; 36+ messages in thread
From: Jeff Moyer @ 2011-08-08 18:01 UTC (permalink / raw)
  To: Andi Kleen; +Cc: linux-kernel, linux-fsdevel, hch, Andi Kleen

Andi Kleen <andi@firstfloor.org> writes:

> From: Andi Kleen <ak@linux.intel.com>
>
> A direct slab call is slightly faster than kmalloc and can be better cached
> per CPU. It also avoids rounding to the next kmalloc slab.
>
> In addition this enforces cache line alignment for struct dio to avoid
> any false sharing.
>
> Signed-off-by: Andi Kleen <ak@linux.intel.com>

Acked-by: Jeff Moyer <jmoyer@redhat.com>

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 05/11] DIO: Separate map_bh from dio v2
  2011-08-02  4:38 ` [PATCH 05/11] DIO: Separate map_bh from dio v2 Andi Kleen
@ 2011-08-08 18:11   ` Jeff Moyer
  0 siblings, 0 replies; 36+ messages in thread
From: Jeff Moyer @ 2011-08-08 18:11 UTC (permalink / raw)
  To: Andi Kleen; +Cc: linux-kernel, linux-fsdevel, hch, Andi Kleen

Andi Kleen <andi@firstfloor.org> writes:

> From: Andi Kleen <ak@linux.intel.com>
>
> Only a single b_private field in the map_bh buffer head is needed after
> the submission path. Move map_bh separately to avoid storing
> this information in the long term slab.
>
> This avoids the weird 104 byte hole in struct dio_submit which also needed
> to be memseted early.
>
> v2: b_private->private (hch)
> Signed-off-by: Andi Kleen <ak@linux.intel.com>

> @@ -1045,6 +1051,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
>  	ssize_t ret = 0;
>  	ssize_t ret2;
>  	size_t bytes;
> +	struct buffer_head map_bh = { 0, };

So long as that zeroes the entire struct (seems to), I'm okay with this patch.
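
It does: C guarantees that once a brace initializer is present, all
members without an explicit initializer are zero-initialized (C99
6.7.8p21), so "= { 0, }" zeroes the whole struct. A quick check with a
stand-in struct (the real buffer_head layout doesn't matter here):

#include <assert.h>

struct bh { unsigned long state; void *private; long blocknr; };

int main(void)
{
	struct bh map_bh = { 0, };	/* first member explicit, rest implicit */

	assert(map_bh.private == 0 && map_bh.blocknr == 0);
	return 0;
}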

Cheers,
Jeff

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 06/11] DIO: Inline the complete submission path v2
  2011-08-02  4:38 ` [PATCH 06/11] DIO: Inline the complete submission path v2 Andi Kleen
@ 2011-08-08 18:14   ` Jeff Moyer
  0 siblings, 0 replies; 36+ messages in thread
From: Jeff Moyer @ 2011-08-08 18:14 UTC (permalink / raw)
  To: Andi Kleen; +Cc: linux-kernel, linux-fsdevel, hch, Andi Kleen

Andi Kleen <andi@firstfloor.org> writes:

> From: Andi Kleen <ak@linux.intel.com>
>
> Add inlines to all the submission path functions. While this increases
> code size, it also gives gcc a lot of optimization opportunities
> in this critical hotpath.
>
> In particular -- together with some other changes -- this
> allows gcc to get rid of the unnecessary clearing of
> sdio at the beginning and optimize the messy parameter passing.
> Any non-inlining of a function which takes an sdio parameter
> would break this optimization, because it cannot be done if the
> address of the structure is taken.

Do you want to use __always_inline, then?
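
For context: plain "inline" is only a hint that gcc may ignore, and one
ignored hint on a function taking &sdio would quietly defeat the whole
optimization. The kernel macro forces the issue; roughly (a sketch with
a stand-in struct, not the real dio_submit):

#define __always_inline	inline __attribute__((always_inline))

struct dio_submit { unsigned head, tail; };

static __always_inline unsigned dio_pages_present(struct dio_submit *sdio)
{
	return sdio->tail - sdio->head;
}

int main(void)
{
	struct dio_submit sdio = { 1, 4 };

	return dio_pages_present(&sdio) - 3;	/* 0 */
}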

Cheers,
Jeff

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 07/11] DIO: Merge direct_io_walker into __blockdev_direct_IO
  2011-08-02  4:38 ` [PATCH 07/11] DIO: Merge direct_io_walker into __blockdev_direct_IO Andi Kleen
@ 2011-08-08 18:20   ` Jeff Moyer
  0 siblings, 0 replies; 36+ messages in thread
From: Jeff Moyer @ 2011-08-08 18:20 UTC (permalink / raw)
  To: Andi Kleen; +Cc: linux-kernel, linux-fsdevel, hch, Andi Kleen

Andi Kleen <andi@firstfloor.org> writes:

> From: Andi Kleen <ak@linux.intel.com>
>
> This doesn't change anything for the compiler, but hch thought it would
> make the code clearer.
>
> I moved the reference counting into its own little inline.
>
> Signed-off-by: Andi Kleen <ak@linux.intel.com>


I would've been fine with leaving it as-is... that function's mighty
long.

Anyway, whatever.

Acked-by: Jeff Moyer <jmoyer@redhat.com>

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 08/11] DIO: Remove unnecessary dio argument from dio_pages_present()
  2011-08-02  4:38 ` [PATCH 08/11] DIO: Remove unnecessary dio argument from dio_pages_present() Andi Kleen
@ 2011-08-08 18:21   ` Jeff Moyer
  0 siblings, 0 replies; 36+ messages in thread
From: Jeff Moyer @ 2011-08-08 18:21 UTC (permalink / raw)
  To: Andi Kleen; +Cc: linux-kernel, linux-fsdevel, hch, Andi Kleen

Andi Kleen <andi@firstfloor.org> writes:

> From: Andi Kleen <ak@linux.intel.com>
>
> Signed-off-by: Andi Kleen <ak@linux.intel.com>

Acked-by: Jeff Moyer <jmoyer@redhat.com>

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 09/11] DIO: Remove unused dio parameter from dio_bio_add_page
  2011-08-02  4:38 ` [PATCH 09/11] DIO: Remove unused dio parameter from dio_bio_add_page Andi Kleen
@ 2011-08-08 18:21   ` Jeff Moyer
  0 siblings, 0 replies; 36+ messages in thread
From: Jeff Moyer @ 2011-08-08 18:21 UTC (permalink / raw)
  To: Andi Kleen; +Cc: linux-kernel, linux-fsdevel, hch, Andi Kleen

Andi Kleen <andi@firstfloor.org> writes:

> From: Andi Kleen <ak@linux.intel.com>
>
> Signed-off-by: Andi Kleen <ak@linux.intel.com>

Acked-by: Jeff Moyer <jmoyer@redhat.com>

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 10/11] VFS: Cache request_queue in struct block_device
  2011-08-02  4:38 ` [PATCH 10/11] VFS: Cache request_queue in struct block_device Andi Kleen
@ 2011-08-08 18:22   ` Jeff Moyer
  2011-08-18 19:42   ` Vivek Goyal
  1 sibling, 0 replies; 36+ messages in thread
From: Jeff Moyer @ 2011-08-08 18:22 UTC (permalink / raw)
  To: Andi Kleen; +Cc: linux-kernel, linux-fsdevel, hch, Andi Kleen, jaxboe


CC'd Jens.

Andi Kleen <andi@firstfloor.org> writes:

> From: Andi Kleen <ak@linux.intel.com>
>
> This makes it possible to get from the inode to the request_queue
> with one less cache miss. It is used in a follow-on optimization.
>
> The lifetime of the pointer is the same as that of the gendisk.
>
> This assumes that the queue will always stay the same in the
> gendisk while it's visible to block_devices. I think that's safe, correct?
>
> Signed-off-by: Andi Kleen <ak@linux.intel.com>
> ---
>  fs/block_dev.c     |    3 +++
>  include/linux/fs.h |    2 ++
>  2 files changed, 5 insertions(+), 0 deletions(-)
>
> diff --git a/fs/block_dev.c b/fs/block_dev.c
> index f55aad4..5e07536 100644
> --- a/fs/block_dev.c
> +++ b/fs/block_dev.c
> @@ -1110,6 +1110,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
>  	mutex_lock_nested(&bdev->bd_mutex, for_part);
>  	if (!bdev->bd_openers) {
>  		bdev->bd_disk = disk;
> +		bdev->bd_queue = disk->queue;
>  		bdev->bd_contains = bdev;
>  		if (!partno) {
>  			struct backing_dev_info *bdi;
> @@ -1130,6 +1131,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
>  					disk_put_part(bdev->bd_part);
>  					bdev->bd_part = NULL;
>  					bdev->bd_disk = NULL;
> +					bdev->bd_queue = NULL;
>  					mutex_unlock(&bdev->bd_mutex);
>  					disk_unblock_events(disk);
>  					module_put(disk->fops->owner);
> @@ -1203,6 +1205,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
>  	disk_put_part(bdev->bd_part);
>  	bdev->bd_disk = NULL;
>  	bdev->bd_part = NULL;
> +	bdev->bd_queue = NULL;
>  	bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info);
>  	if (bdev != bdev->bd_contains)
>  		__blkdev_put(bdev->bd_contains, mode, 1);
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index f23bcb7..ea5cb4d 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -657,6 +657,7 @@ struct address_space {
>  	 * must be enforced here for CRIS, to let the least significant bit
>  	 * of struct page's "mapping" pointer be used for PAGE_MAPPING_ANON.
>  	 */
> +struct request_queue;
>  
>  struct block_device {
>  	dev_t			bd_dev;  /* not a kdev_t - it's a search key */
> @@ -679,6 +680,7 @@ struct block_device {
>  	unsigned		bd_part_count;
>  	int			bd_invalidated;
>  	struct gendisk *	bd_disk;
> +	struct request_queue *  bd_queue;
>  	struct list_head	bd_list;
>  	/*
>  	 * Private data.  You must have bd_claim'ed the block_device

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 11/11] DIO: optimize cache misses in the submission path
  2011-08-02  4:38 ` [PATCH 11/11] DIO: optimize cache misses in the submission path Andi Kleen
@ 2011-08-08 18:43   ` Jeff Moyer
  2011-08-08 19:32     ` Andi Kleen
  0 siblings, 1 reply; 36+ messages in thread
From: Jeff Moyer @ 2011-08-08 18:43 UTC (permalink / raw)
  To: Andi Kleen; +Cc: linux-kernel, linux-fsdevel, hch, Andi Kleen

Andi Kleen <andi@firstfloor.org> writes:

> From: Andi Kleen <ak@linux.intel.com>
>
> Some investigation of a transaction processing workload showed that
> a major consumer of cycles in __blockdev_direct_IO is the cache miss
> while accessing the block size. This is because it has to walk
> the chain from block_dev to gendisk to queue.
>
> The block size is needed early on to check alignment and sizes.
> The block device's block size is only consulted when the check against
> the inode block size fails, but the costly block device state was
> fetched unconditionally.
>
> - Reorganize the code to only fetch block dev state when actually
> needed.
>
> Then do a prefetch on the block dev early in the direct IO path.
> This is worth it because there is now a substantial amount of code
> run before we actually touch the block dev.
>
> - I also added some unlikelies to make it clear to the compiler
> that the block device fetch code is not normally executed.
>
> This gave a small but measurable improvement on a large database
> benchmark (about 0.3%).
>
> BTW the check code looks somewhat dubious to me: why is the block
> device block size only checked when the inode block size check fails?
> Can someone explain the difference between all these different block
> sizes? Are they cheaper by the dozen?

There are two block sizes, the block size of the file system (typically
PAGE_SIZE), and the logical block size of the underlying storage.  The
dio blkfactor is the log2 of the number of dio blocks in a single fs
block.  Alignment to the fs block means that you don't have to do any
sub-block zeroing.  It also means you don't have to do as much math in
converting between dio blocks and fs blocks (big deal, right?).

I bet we could default to using the smaller block size all the time, and
still be able to detect when we don't have to do the sub-block zeroing.
Maybe that would be a good follow-on patch.
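
In numbers, with the typical values above (a self-contained model; the
variable names mirror the dio code, the values are hypothetical):

#include <stdio.h>

int main(void)
{
	unsigned i_blkbits = 12;	/* 4096-byte fs block */
	unsigned blkbits = 9;		/* 512-byte device sector */
	unsigned blkfactor = i_blkbits - blkbits;
	long long offset = 4096;	/* fs-block aligned */

	printf("dio blocks per fs block: %u\n", 1u << blkfactor);  /* 8 */
	printf("needs sub-block zeroing: %s\n",
	       (offset & ((1 << i_blkbits) - 1)) ? "yes" : "no");  /* no */
	return 0;
}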

> Signed-off-by: Andi Kleen <ak@linux.intel.com>
> ---
>  fs/direct-io.c |   47 +++++++++++++++++++++++++++++++++++++----------
>  1 files changed, 37 insertions(+), 10 deletions(-)
>
> diff --git a/fs/direct-io.c b/fs/direct-io.c
> index 03bcc6f..c424b88 100644
> --- a/fs/direct-io.c
> +++ b/fs/direct-io.c
> @@ -1086,8 +1086,8 @@ static inline int drop_refcount(struct dio *dio)
>   * individual fields and will generate much worse code. 
>   * This is important for the whole file.
>   */
> -ssize_t
> -__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
> +static inline ssize_t
> +do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
>  	struct block_device *bdev, const struct iovec *iov, loff_t offset, 
>  	unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
>  	dio_submit_t submit_io,	int flags)
> @@ -1096,7 +1096,6 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
>  	size_t size;
>  	unsigned long addr;
>  	unsigned blkbits = inode->i_blkbits;
> -	unsigned bdev_blkbits = 0;
>  	unsigned blocksize_mask = (1 << blkbits) - 1;
>  	ssize_t retval = -EINVAL;
>  	loff_t end = offset;
> @@ -1109,12 +1108,14 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
>  	if (rw & WRITE)
>  		rw = WRITE_ODIRECT;
>  
> -	if (bdev)
> -		bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev));
> +	/* 
> +	 * Avoid references to bdev if not absolutely needed to give
> +	 * the early prefetch in the caller enough time.
> +	 */
>  
> -	if (offset & blocksize_mask) {
> +	if (unlikely(offset & blocksize_mask)) {

You can't make this assumption.  Userspace controls what size/alignment
of blocks to send in.

> @@ -1312,6 +1315,30 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
>  out:
>  	return retval;
>  }
> +
> +ssize_t
> +__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
> +	struct block_device *bdev, const struct iovec *iov, loff_t offset, 
> +	unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
> +	dio_submit_t submit_io,	int flags)
> +{
> +	/* 
> +	 * The block device state is needed in the end to finally
> +	 * submit everything.  Since it's likely to be cache cold
> +	 * prefetch it here as first thing to hide some of the
> +	 * latency.
> +	 * 
> +	 * Attempt to prefetch the pieces we likely need later.
> +	 */
> +	prefetch(&bdev->bd_disk->part_tbl);
> +	prefetch(bdev->bd_queue);
> +	prefetch((char *)bdev->bd_queue + SMP_CACHE_BYTES);
> +
> +	return do_blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
> +				     nr_segs, get_block, end_io,
> +				     submit_io, flags);
> +}
> +
>  EXPORT_SYMBOL(__blockdev_direct_IO);

Heh... you broke direct_io_worker out again (kind of).  ;-)

Cheers,
Jeff

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 11/11] DIO: optimize cache misses in the submission path
  2011-08-08 18:43   ` Jeff Moyer
@ 2011-08-08 19:32     ` Andi Kleen
  2011-08-08 19:38       ` Jeff Moyer
  0 siblings, 1 reply; 36+ messages in thread
From: Andi Kleen @ 2011-08-08 19:32 UTC (permalink / raw)
  To: Jeff Moyer; +Cc: Andi Kleen, linux-kernel, linux-fsdevel, hch, Andi Kleen

> I bet we could default to using the smaller block size all the time, and
> still be able to detect when we don't have to do the sub-block zeroing.
> Maybe that would be a good follow-on patch.

It doesn't really matter because it's out of the fast path now.

> > +	/* 
> > +	 * Avoid references to bdev if not absolutely needed to give
> > +	 * the early prefetch in the caller enough time.
> > +	 */
> >  
> > -	if (offset & blocksize_mask) {
> > +	if (unlikely(offset & blocksize_mask)) {
> 
> You can't make this assumption.  Userspace controls what size/alignment
> of blocks to send in.

What assumption do you mean?

The code semantics are identical, just the place where I fetch
the block size is different (unless I missed something, of course).


-Andi
-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 11/11] DIO: optimize cache misses in the submission path
  2011-08-08 19:32     ` Andi Kleen
@ 2011-08-08 19:38       ` Jeff Moyer
  0 siblings, 0 replies; 36+ messages in thread
From: Jeff Moyer @ 2011-08-08 19:38 UTC (permalink / raw)
  To: Andi Kleen; +Cc: linux-kernel, linux-fsdevel, hch, Andi Kleen

Andi Kleen <andi@firstfloor.org> writes:

>> I bet we could default to using the smaller block size all the time, and
>> still be able to detect when we don't have to do the sub-block zeroing.
>> Maybe that would be a good follow-on patch.
>
> It doesn't really matter because it's out of the fast path now.
>
>> > +	/* 
>> > +	 * Avoid references to bdev if not absolutely needed to give
>> > +	 * the early prefetch in the caller enough time.
>> > +	 */
>> >  
>> > -	if (offset & blocksize_mask) {
>> > +	if (unlikely(offset & blocksize_mask)) {
>> 
>> You can't make this assumption.  Userspace controls what size/alignment
>> of blocks to send in.
>
> What assumption do you mean?

Sorry, I meant that we don't know whether the offset & blocksize_mask
check is unlikely.

Cheers,
Jeff

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 01/11] DIO: Separate fields only used in the submission path from struct dio
  2011-08-08 17:59   ` Jeff Moyer
@ 2011-08-08 19:43     ` Andi Kleen
  2011-08-08 19:46       ` Jeff Moyer
  0 siblings, 1 reply; 36+ messages in thread
From: Andi Kleen @ 2011-08-08 19:43 UTC (permalink / raw)
  To: Jeff Moyer; +Cc: Andi Kleen, linux-kernel, linux-fsdevel, hch, Andi Kleen

> The pages array is only used on submission.  Why did it land in the
> struct dio? 

It's too large for the stack.
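
Concretely, fs/direct-io.c of this era has (assuming DIO_PAGES is
unchanged):

#define DIO_PAGES	64	/* upper bound on pages pinned per batch */

/* struct page *pages[DIO_PAGES] is 64 pointers, i.e. 512 bytes on
 * 64-bit -- far too much for a kernel stack in a deep I/O path. */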

-Andi

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 01/11] DIO: Separate fields only used in the submission path from struct dio
  2011-08-08 19:43     ` Andi Kleen
@ 2011-08-08 19:46       ` Jeff Moyer
  0 siblings, 0 replies; 36+ messages in thread
From: Jeff Moyer @ 2011-08-08 19:46 UTC (permalink / raw)
  To: Andi Kleen; +Cc: linux-kernel, linux-fsdevel, hch, Andi Kleen

Andi Kleen <andi@firstfloor.org> writes:

>> The pages array is only used on submission.  Why did it land in the
>> struct dio? 
>
> It's too large for the stack.

Heh.  Right...

Acked-by: Jeff Moyer <jmoyer@redhat.com>

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: Updated direct IO optimization patchkit v2
  2011-08-02  4:38 Updated direct IO optimization patchkit v2 Andi Kleen
                   ` (10 preceding siblings ...)
  2011-08-02  4:38 ` [PATCH 11/11] DIO: optimize cache misses in the submission path Andi Kleen
@ 2011-08-18 17:53 ` Jeff Moyer
  11 siblings, 0 replies; 36+ messages in thread
From: Jeff Moyer @ 2011-08-18 17:53 UTC (permalink / raw)
  To: Andi Kleen; +Cc: linux-kernel, linux-fsdevel, hch

Andi Kleen <andi@firstfloor.org> writes:

> Various micro optimizations of the block direct IO path.
>
> This addresses all review comments and adds an additional prefetch
> optimization that gives another about 0.3% in a large macro benchmark.
> Overall the patchkit is worht about 2.2% in the same macro benchmark.
>
> I also ported it to current mainline, resolving some conflicts with
> hch's recent changes to direct-io.

Hi, Andi,

If you get rid of the unlikely I pointed out in patch 11 and address my
question on patch 6 (about always_inline) and repost, I'll ack the
series.  I was hoping hch or Jens would chime in on patch 10, but it
seems they're busy.

Cheers,
Jeff

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 10/11] VFS: Cache request_queue in struct block_device
  2011-08-02  4:38 ` [PATCH 10/11] VFS: Cache request_queue in struct block_device Andi Kleen
  2011-08-08 18:22   ` Jeff Moyer
@ 2011-08-18 19:42   ` Vivek Goyal
  2011-08-18 21:03     ` Andi Kleen
  1 sibling, 1 reply; 36+ messages in thread
From: Vivek Goyal @ 2011-08-18 19:42 UTC (permalink / raw)
  To: Andi Kleen; +Cc: linux-kernel, linux-fsdevel, hch, Andi Kleen, Jens Axboe

On Mon, Aug 01, 2011 at 09:38:12PM -0700, Andi Kleen wrote:
> From: Andi Kleen <ak@linux.intel.com>
> 
> This makes it possible to get from the inode to the request_queue
> with one less cache miss. It is used in a follow-on optimization.
>
> The lifetime of the pointer is the same as that of the gendisk.
>
> This assumes that the queue will always stay the same in the
> gendisk while it's visible to block_devices. I think that's safe, correct?
> 
> Signed-off-by: Andi Kleen <ak@linux.intel.com>
> ---
>  fs/block_dev.c     |    3 +++
>  include/linux/fs.h |    2 ++
>  2 files changed, 5 insertions(+), 0 deletions(-)
> 
> diff --git a/fs/block_dev.c b/fs/block_dev.c
> index f55aad4..5e07536 100644
> --- a/fs/block_dev.c
> +++ b/fs/block_dev.c
> @@ -1110,6 +1110,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
>  	mutex_lock_nested(&bdev->bd_mutex, for_part);
>  	if (!bdev->bd_openers) {
>  		bdev->bd_disk = disk;
> +		bdev->bd_queue = disk->queue;

I am really not sure how good an idea it is to stash away another pointer
in bdev (bdev->bd_queue), just because we don't want to dereference a
pointer (bdev->bd_disk->queue).

Personally I think it is not a very good idea, as if we start following
this pattern everywhere in the code, it will make things more complicated.

Is the performance gain because of this one less dereference really
substantial?

Thanks
Vivek

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 10/11] VFS: Cache request_queue in struct block_device
  2011-08-18 19:42   ` Vivek Goyal
@ 2011-08-18 21:03     ` Andi Kleen
  2011-08-19 14:14       ` Vivek Goyal
  0 siblings, 1 reply; 36+ messages in thread
From: Andi Kleen @ 2011-08-18 21:03 UTC (permalink / raw)
  To: Vivek Goyal; +Cc: Andi Kleen, linux-kernel, linux-fsdevel, hch, Jens Axboe


> Is the performance gain because of this one less dereference really
> substantial?
Yes, it is measurable on a large macro benchmark.

The gain is from doing the prefetch early enough, and that needs the 
additional pointer.

-Andi


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 10/11] VFS: Cache request_queue in struct block_device
  2011-08-18 21:03     ` Andi Kleen
@ 2011-08-19 14:14       ` Vivek Goyal
  2011-08-19 15:36         ` Andi Kleen
  0 siblings, 1 reply; 36+ messages in thread
From: Vivek Goyal @ 2011-08-19 14:14 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Andi Kleen, linux-kernel, linux-fsdevel, hch, Jens Axboe

On Thu, Aug 18, 2011 at 02:03:56PM -0700, Andi Kleen wrote:
> 
> >Is the performance gain because of this one less dereference really
> >substantial?
> Yes, it is measurable on a large macro benchmark.
> 
> The gain is from doing the prefetch early enough, and that needs the
> additional pointer.

So it gives you an extra 0.3% (as mentioned in your first mail). IMHO, for
0.3% we should not cache an extra request queue pointer.

Thanks
Vivek

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 10/11] VFS: Cache request_queue in struct block_device
  2011-08-19 14:14       ` Vivek Goyal
@ 2011-08-19 15:36         ` Andi Kleen
  2011-08-19 15:55           ` Vivek Goyal
  0 siblings, 1 reply; 36+ messages in thread
From: Andi Kleen @ 2011-08-19 15:36 UTC (permalink / raw)
  To: Vivek Goyal
  Cc: Andi Kleen, Andi Kleen, linux-kernel, linux-fsdevel, hch, Jens Axboe

On Fri, Aug 19, 2011 at 10:14:09AM -0400, Vivek Goyal wrote:
> On Thu, Aug 18, 2011 at 02:03:56PM -0700, Andi Kleen wrote:
> > 
> > >Is the performance gain because of this one less dereference really
> > >substantial?
> > Yes, it is measurable on a large macro benchmark.
> > 
> > The gain is from doing the prefetch early enough, and that needs the
> > additional pointer.
> 
> So it gives you an extra 0.3% (as mentioned in your first mail). IMHO, for
> 0.3% we should not cache an extra request queue pointer.

Note this is on a benchmark which is primarily userland.  Kernel 
is only a small part, so it's a much higher percentage for the kernel
time.

Also, on that large benchmark it's hard to get any improvement at all,
and this isn't even a particularly ugly or intrusive change. Not sure 
why you're against it.

Cache optimizations are important.

-Andi

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 10/11] VFS: Cache request_queue in struct block_device
  2011-08-19 15:36         ` Andi Kleen
@ 2011-08-19 15:55           ` Vivek Goyal
  2011-08-19 16:23             ` Andi Kleen
  0 siblings, 1 reply; 36+ messages in thread
From: Vivek Goyal @ 2011-08-19 15:55 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Andi Kleen, linux-kernel, linux-fsdevel, hch, Jens Axboe

On Fri, Aug 19, 2011 at 05:36:22PM +0200, Andi Kleen wrote:
> On Fri, Aug 19, 2011 at 10:14:09AM -0400, Vivek Goyal wrote:
> > On Thu, Aug 18, 2011 at 02:03:56PM -0700, Andi Kleen wrote:
> > > 
> > > >Is the performance gain because of this one less dereference really
> > > >substantial?
> > > Yes, it is measurable on a large macro benchmark.
> > > 
> > > The gain is from doing the prefetch early enough, and that needs the
> > > additional pointer.
> > 
> > So it gives you an extra 0.3% (as mentioned in your first mail). IMHO, for
> > 0.3% we should not cache an extra request queue pointer.
> 
> Note this is on a benchmark which is primarily userland.  Kernel 
> is only a small part, so it's a much higher percentage for the kernel
> time.
> 
> Also, on that large benchmark it's hard to get any improvement at all,
> and this isn't even a particularly ugly or intrusive change.

> Not sure why you're against it.

Primarily because of code complexity. We are stashing away a pointer and
not taking any reference anywhere. So I am not even sure who is making
sure that the request queue is not gone, and there are no comments in the
code about why we are stashing the pointer or how we are making sure that
the request queue is around for the lifetime of the bdev.

Thanks
Vivek

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 10/11] VFS: Cache request_queue in struct block_device
  2011-08-19 15:55           ` Vivek Goyal
@ 2011-08-19 16:23             ` Andi Kleen
  2011-08-19 16:51               ` Vivek Goyal
  0 siblings, 1 reply; 36+ messages in thread
From: Andi Kleen @ 2011-08-19 16:23 UTC (permalink / raw)
  To: Vivek Goyal
  Cc: Andi Kleen, Andi Kleen, linux-kernel, linux-fsdevel, hch, Jens Axboe

> Primarily because of code complexity. We are stashing away a pointer and
> not taking any reference anywhere. So I am not even sure who is making
> sure that the request queue is not gone, and there are no comments in the
> code about why we are stashing the pointer or how we are making sure that
> the request queue is around for the lifetime of the bdev.

Can you point out a concrete problem in my approach?  This seems
rather vague.

If you have an alternative way to get the 0.3% I would be also
interested.

-Andi

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH 10/11] VFS: Cache request_queue in struct block_device
  2011-08-19 16:23             ` Andi Kleen
@ 2011-08-19 16:51               ` Vivek Goyal
  0 siblings, 0 replies; 36+ messages in thread
From: Vivek Goyal @ 2011-08-19 16:51 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Andi Kleen, linux-kernel, linux-fsdevel, hch, Jens Axboe

On Fri, Aug 19, 2011 at 06:23:12PM +0200, Andi Kleen wrote:
> > Primarily because of code complexity. We are stashing away a pointer and
> > not taking any reference anywhere. So I am not even sure who is making
> > sure that the request queue is not gone, and there are no comments in the
> > code about why we are stashing the pointer or how we are making sure that
> > the request queue is around for the lifetime of the bdev.
> 
> Can you point out a concrete problem in my approach?  This seems
> rather vague.

I am not saying that there is a problem in your approach. I am just
asking, trying to understand who makes sure the request queue is
not gone.

So if you could just comment on two things in the code, it will help
a future reader of the code.

- Why are we stashing this extra pointer.
- What makes sure that the object pointed to by this pointer is not freed.

I am not saying that there is a problem, just that I don't know enough
of the code to understand how we are making sure the queue object is
still around.
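
A comment along these lines next to the assignment would answer both
questions (a sketch of a possible comment, not from any posted patch):

	/*
	 * Cache disk->queue so the direct IO fast path can reach the
	 * request_queue with a single load (and prefetch it early).
	 * No reference is taken: the queue is owned by the gendisk and
	 * outlives any block_device pointing at it, and bd_queue is
	 * set and cleared in exactly the places bd_disk is.
	 */
	bdev->bd_queue = disk->queue;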

Thanks
Vivek

^ permalink raw reply	[flat|nested] 36+ messages in thread

* [PATCH 08/11] DIO: Remove unnecessary dio argument from dio_pages_present()
  2011-08-29 23:23 Updated direct IO optimization patchkit v3 Andi Kleen
@ 2011-08-29 23:23 ` Andi Kleen
  0 siblings, 0 replies; 36+ messages in thread
From: Andi Kleen @ 2011-08-29 23:23 UTC (permalink / raw)
  To: akpm; +Cc: linux-kernel, linux-fsdevel, viro, Andi Kleen

From: Andi Kleen <ak@linux.intel.com>

Acked-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 fs/direct-io.c |    8 ++++----
 1 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 669b667..3f5d1b4 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -191,7 +191,7 @@ EXPORT_SYMBOL_GPL(inode_dio_done);
 /*
  * How many pages are in the queue?
  */
-static inline unsigned dio_pages_present(struct dio *dio, struct dio_submit *sdio)
+static inline unsigned dio_pages_present(struct dio_submit *sdio)
 {
 	return sdio->tail - sdio->head;
 }
@@ -247,13 +247,13 @@ out:
  */
 static inline struct page *dio_get_page(struct dio *dio, struct dio_submit *sdio)
 {
-	if (dio_pages_present(dio, sdio) == 0) {
+	if (dio_pages_present(sdio) == 0) {
 		int ret;
 
 		ret = dio_refill_pages(dio, sdio);
 		if (ret)
 			return ERR_PTR(ret);
-		BUG_ON(dio_pages_present(dio, sdio) == 0);
+		BUG_ON(dio_pages_present(sdio) == 0);
 	}
 	return dio->pages[sdio->head++];
 }
@@ -437,7 +437,7 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
  */
 static inline void dio_cleanup(struct dio *dio, struct dio_submit *sdio)
 {
-	while (dio_pages_present(dio, sdio))
+	while (dio_pages_present(sdio))
 		page_cache_release(dio_get_page(dio, sdio));
 }
 
-- 
1.7.4.4


^ permalink raw reply related	[flat|nested] 36+ messages in thread

end of thread, other threads:[~2011-08-29 23:26 UTC | newest]

Thread overview: 36+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-08-02  4:38 Updated direct IO optimization patchkit v2 Andi Kleen
2011-08-02  4:38 ` [PATCH 01/11] DIO: Separate fields only used in the submission path from struct dio Andi Kleen
2011-08-08 17:59   ` Jeff Moyer
2011-08-08 19:43     ` Andi Kleen
2011-08-08 19:46       ` Jeff Moyer
2011-08-02  4:38 ` [PATCH 02/11] DIO: Fix a wrong comment Andi Kleen
2011-08-08 17:59   ` Jeff Moyer
2011-08-02  4:38 ` [PATCH 03/11] DIO: Rearrange fields in dio/dio_submit to avoid holes Andi Kleen
2011-08-08 18:00   ` Jeff Moyer
2011-08-02  4:38 ` [PATCH 04/11] DIO: Use a slab cache for struct dio Andi Kleen
2011-08-08 18:01   ` Jeff Moyer
2011-08-02  4:38 ` [PATCH 05/11] DIO: Separate map_bh from dio v2 Andi Kleen
2011-08-08 18:11   ` Jeff Moyer
2011-08-02  4:38 ` [PATCH 06/11] DIO: Inline the complete submission path v2 Andi Kleen
2011-08-08 18:14   ` Jeff Moyer
2011-08-02  4:38 ` [PATCH 07/11] DIO: Merge direct_io_walker into __blockdev_direct_IO Andi Kleen
2011-08-08 18:20   ` Jeff Moyer
2011-08-02  4:38 ` [PATCH 08/11] DIO: Remove unnecessary dio argument from dio_pages_present() Andi Kleen
2011-08-08 18:21   ` Jeff Moyer
2011-08-02  4:38 ` [PATCH 09/11] DIO: Remove unused dio parameter from dio_bio_add_page Andi Kleen
2011-08-08 18:21   ` Jeff Moyer
2011-08-02  4:38 ` [PATCH 10/11] VFS: Cache request_queue in struct block_device Andi Kleen
2011-08-08 18:22   ` Jeff Moyer
2011-08-18 19:42   ` Vivek Goyal
2011-08-18 21:03     ` Andi Kleen
2011-08-19 14:14       ` Vivek Goyal
2011-08-19 15:36         ` Andi Kleen
2011-08-19 15:55           ` Vivek Goyal
2011-08-19 16:23             ` Andi Kleen
2011-08-19 16:51               ` Vivek Goyal
2011-08-02  4:38 ` [PATCH 11/11] DIO: optimize cache misses in the submission path Andi Kleen
2011-08-08 18:43   ` Jeff Moyer
2011-08-08 19:32     ` Andi Kleen
2011-08-08 19:38       ` Jeff Moyer
2011-08-18 17:53 ` Updated direct IO optimization patchkit v2 Jeff Moyer
2011-08-29 23:23 Updated direct IO optimization patchkit v3 Andi Kleen
2011-08-29 23:23 ` [PATCH 08/11] DIO: Remove unnecessary dio argument from dio_pages_present() Andi Kleen
