From: Dave Kleikamp <dave.kleikamp@oracle.com>
To: linux-kernel@vger.kernel.org
Cc: linux-fsdevel@vger.kernel.org,
	Andrew Morton <akpm@linux-foundation.org>,
	"Maxim V. Patlasov" <mpatlasov@parallels.com>,
	Zach Brown <zab@zabbo.net>,
	Dave Kleikamp <dave.kleikamp@oracle.com>
Subject: [PATCH V8 12/33] dio: add bio_vec support to __blockdev_direct_IO()
Date: Thu, 25 Jul 2013 12:50:38 -0500
Message-ID: <1374774659-13121-13-git-send-email-dave.kleikamp@oracle.com>
In-Reply-To: <1374774659-13121-1-git-send-email-dave.kleikamp@oracle.com>

The trick here is to initialize the dio state so that do_direct_IO()
consumes the pages we provide and never tries to map user pages.  This
is done by making sure that final_block_in_request covers exactly the
page that we set in the dio, so do_direct_IO() returns before it ever
runs out of pages.
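
As an illustration only (not part of this patch), an in-kernel caller
could drive the new bvec path roughly as follows.  This sketch assumes
the iov_iter_init_bvec() helper from patch 08/33 and the iov_iter-based
->direct_IO() prototype from patch 11/33; the function name here is
made up:

	/*
	 * Hypothetical sketch: wrap one kernel page in a bio_vec and
	 * an iov_iter so that do_blockdev_direct_IO() takes the bvec
	 * path and never calls get_user_pages().
	 */
	static ssize_t example_direct_read_page(struct kiocb *iocb,
						struct page *page,
						loff_t pos)
	{
		struct file *file = iocb->ki_filp;
		struct bio_vec bvec = {
			.bv_page	= page,
			.bv_len		= PAGE_SIZE,
			.bv_offset	= 0,
		};
		struct iov_iter iter;

		/* assumed initializer from patch 08/33 */
		iov_iter_init_bvec(&iter, &bvec, 1, PAGE_SIZE, 0);

		return file->f_mapping->a_ops->direct_IO(READ, iocb,
							 &iter, pos);
	}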

The caller is responsible for dirtying these pages, if needed.  We add
a should_dirty flag to the dio struct to make sure we only dirty pages
when we're operating on iovecs of user addresses.
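
For instance, a bvec-based submitter would leave should_dirty clear and
dirty the pages itself once a READ completes, along these lines
(illustrative sketch only, not taken from this series):

	/*
	 * With dio->should_dirty == 0 the dio code no longer dirties
	 * READ pages, so the submitter does it after I/O completion.
	 */
	static void example_bvec_read_done(struct bio_vec *bvec,
					   unsigned long nr_segs)
	{
		unsigned long seg;

		for (seg = 0; seg < nr_segs; seg++)
			set_page_dirty_lock(bvec[seg].bv_page);
	}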

Signed-off-by: Dave Kleikamp <dave.kleikamp@oracle.com>
Cc: Zach Brown <zab@zabbo.net>
---
 fs/direct-io.c | 206 +++++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 148 insertions(+), 58 deletions(-)

diff --git a/fs/direct-io.c b/fs/direct-io.c
index a81366c..75a3989 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -127,6 +127,7 @@ struct dio {
 	spinlock_t bio_lock;		/* protects BIO fields below */
 	int page_errors;		/* errno from get_user_pages() */
 	int is_async;			/* is IO async ? */
+	int should_dirty;		/* should we mark read pages dirty? */
 	int io_error;			/* IO error in completion path */
 	unsigned long refcount;		/* direct_io_worker() and bios */
 	struct bio *bio_list;		/* singly linked via bi_private */
@@ -377,7 +378,7 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
 	dio->refcount++;
 	spin_unlock_irqrestore(&dio->bio_lock, flags);
 
-	if (dio->is_async && dio->rw == READ)
+	if (dio->is_async && dio->rw == READ && dio->should_dirty)
 		bio_set_pages_dirty(bio);
 
 	if (sdio->submit_io)
@@ -448,13 +449,14 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio)
 	if (!uptodate)
 		dio->io_error = -EIO;
 
-	if (dio->is_async && dio->rw == READ) {
+	if (dio->is_async && dio->rw == READ && dio->should_dirty) {
 		bio_check_pages_dirty(bio);	/* transfers ownership */
 	} else {
 		bio_for_each_segment_all(bvec, bio, i) {
 			struct page *page = bvec->bv_page;
 
-			if (dio->rw == READ && !PageCompound(page))
+			if (dio->rw == READ && !PageCompound(page) &&
+			    dio->should_dirty)
 				set_page_dirty_lock(page);
 			page_cache_release(page);
 		}
@@ -1016,6 +1018,101 @@ static inline int drop_refcount(struct dio *dio)
 	return ret2;
 }
 
+static ssize_t direct_IO_iovec(const struct iovec *iov, unsigned long nr_segs,
+			       struct dio *dio, struct dio_submit *sdio,
+			       unsigned blkbits, struct buffer_head *map_bh)
+{
+	size_t bytes;
+	ssize_t retval = 0;
+	int seg;
+	unsigned long user_addr;
+
+	for (seg = 0; seg < nr_segs; seg++) {
+		user_addr = (unsigned long)iov[seg].iov_base;
+		sdio->pages_in_io +=
+			((user_addr + iov[seg].iov_len + PAGE_SIZE-1) /
+				PAGE_SIZE - user_addr / PAGE_SIZE);
+	}
+
+	dio->should_dirty = 1;
+
+	for (seg = 0; seg < nr_segs; seg++) {
+		user_addr = (unsigned long)iov[seg].iov_base;
+		sdio->size += bytes = iov[seg].iov_len;
+
+		/* Index into the first page of the first block */
+		sdio->first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits;
+		sdio->final_block_in_request = sdio->block_in_file +
+						(bytes >> blkbits);
+		/* Page fetching state */
+		sdio->head = 0;
+		sdio->tail = 0;
+		sdio->curr_page = 0;
+
+		sdio->total_pages = 0;
+		if (user_addr & (PAGE_SIZE-1)) {
+			sdio->total_pages++;
+			bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1));
+		}
+		sdio->total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
+		sdio->curr_user_address = user_addr;
+
+		retval = do_direct_IO(dio, sdio, map_bh);
+
+		dio->result += iov[seg].iov_len -
+			((sdio->final_block_in_request - sdio->block_in_file) <<
+					blkbits);
+
+		if (retval) {
+			dio_cleanup(dio, sdio);
+			break;
+		}
+	} /* end iovec loop */
+
+	return retval;
+}
+
+static ssize_t direct_IO_bvec(struct bio_vec *bvec, unsigned long nr_segs,
+			      struct dio *dio, struct dio_submit *sdio,
+			      unsigned blkbits, struct buffer_head *map_bh)
+{
+	ssize_t retval = 0;
+	int seg;
+
+	sdio->pages_in_io += nr_segs;
+
+	for (seg = 0; seg < nr_segs; seg++) {
+		sdio->size += bvec[seg].bv_len;
+
+		/* Index into the first page of the first block */
+		sdio->first_block_in_page = bvec[seg].bv_offset >> blkbits;
+		sdio->final_block_in_request = sdio->block_in_file +
+						(bvec[seg].bv_len >> blkbits);
+		/* Page fetching state */
+		sdio->curr_page = 0;
+		page_cache_get(bvec[seg].bv_page);
+		dio->pages[0] = bvec[seg].bv_page;
+		sdio->head = 0;
+		sdio->tail = 1;
+
+		sdio->total_pages = 1;
+		sdio->curr_user_address = 0;
+
+		retval = do_direct_IO(dio, sdio, map_bh);
+
+		dio->result += bvec[seg].bv_len -
+			((sdio->final_block_in_request - sdio->block_in_file) <<
+					blkbits);
+
+		if (retval) {
+			dio_cleanup(dio, sdio);
+			break;
+		}
+	}
+
+	return retval;
+}
+
 /*
  * This is a library function for use by filesystem drivers.
  *
@@ -1057,11 +1154,8 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	loff_t end = offset;
 	struct dio *dio;
 	struct dio_submit sdio = { 0, };
-	unsigned long user_addr;
-	size_t bytes;
 	struct buffer_head map_bh = { 0, };
 	struct blk_plug plug;
-	const struct iovec *iov = iov_iter_iovec(iter);
 	unsigned long nr_segs = iter->nr_segs;
 
 	if (rw & WRITE)
@@ -1081,20 +1175,49 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	}
 
 	/* Check the memory alignment.  Blocks cannot straddle pages */
-	for (seg = 0; seg < nr_segs; seg++) {
-		addr = (unsigned long)iov[seg].iov_base;
-		size = iov[seg].iov_len;
-		end += size;
-		if (unlikely((addr & blocksize_mask) ||
-			     (size & blocksize_mask))) {
-			if (bdev)
-				blkbits = blksize_bits(
-					 bdev_logical_block_size(bdev));
-			blocksize_mask = (1 << blkbits) - 1;
-			if ((addr & blocksize_mask) || (size & blocksize_mask))
-				goto out;
+	if (iov_iter_has_iovec(iter)) {
+		const struct iovec *iov = iov_iter_iovec(iter);
+
+		for (seg = 0; seg < nr_segs; seg++) {
+			addr = (unsigned long)iov[seg].iov_base;
+			size = iov[seg].iov_len;
+			end += size;
+			if (unlikely((addr & blocksize_mask) ||
+				     (size & blocksize_mask))) {
+				if (bdev)
+					blkbits = blksize_bits(
+						 bdev_logical_block_size(bdev));
+				blocksize_mask = (1 << blkbits) - 1;
+				if ((addr & blocksize_mask) ||
+				    (size & blocksize_mask))
+					goto out;
+			}
 		}
-	}
+	} else if (iov_iter_has_bvec(iter)) {
+		/*
+		 * Is this necessary, or can we trust the in-kernel
+		 * caller? Can we replace this with
+		 *	end += iov_iter_count(iter); ?
+		 */
+		struct bio_vec *bvec = iov_iter_bvec(iter);
+
+		for (seg = 0; seg < nr_segs; seg++) {
+			addr = bvec[seg].bv_offset;
+			size = bvec[seg].bv_len;
+			end += size;
+			if (unlikely((addr & blocksize_mask) ||
+				     (size & blocksize_mask))) {
+				if (bdev)
+					blkbits = blksize_bits(
+						 bdev_logical_block_size(bdev));
+				blocksize_mask = (1 << blkbits) - 1;
+				if ((addr & blocksize_mask) ||
+				    (size & blocksize_mask))
+					goto out;
+			}
+		}
+	} else
+		BUG();
 
 	/* watch out for a 0 len io from a tricksy fs */
 	if (rw == READ && end == offset)
@@ -1171,47 +1294,14 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	if (unlikely(sdio.blkfactor))
 		sdio.pages_in_io = 2;
 
-	for (seg = 0; seg < nr_segs; seg++) {
-		user_addr = (unsigned long)iov[seg].iov_base;
-		sdio.pages_in_io +=
-			((user_addr + iov[seg].iov_len + PAGE_SIZE-1) /
-				PAGE_SIZE - user_addr / PAGE_SIZE);
-	}
-
 	blk_start_plug(&plug);
 
-	for (seg = 0; seg < nr_segs; seg++) {
-		user_addr = (unsigned long)iov[seg].iov_base;
-		sdio.size += bytes = iov[seg].iov_len;
-
-		/* Index into the first page of the first block */
-		sdio.first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits;
-		sdio.final_block_in_request = sdio.block_in_file +
-						(bytes >> blkbits);
-		/* Page fetching state */
-		sdio.head = 0;
-		sdio.tail = 0;
-		sdio.curr_page = 0;
-
-		sdio.total_pages = 0;
-		if (user_addr & (PAGE_SIZE-1)) {
-			sdio.total_pages++;
-			bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1));
-		}
-		sdio.total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
-		sdio.curr_user_address = user_addr;
-
-		retval = do_direct_IO(dio, &sdio, &map_bh);
-
-		dio->result += iov[seg].iov_len -
-			((sdio.final_block_in_request - sdio.block_in_file) <<
-					blkbits);
-
-		if (retval) {
-			dio_cleanup(dio, &sdio);
-			break;
-		}
-	} /* end iovec loop */
+	if (iov_iter_has_iovec(iter))
+		retval = direct_IO_iovec(iov_iter_iovec(iter), nr_segs, dio,
+					 &sdio, blkbits, &map_bh);
+	else
+		retval = direct_IO_bvec(iov_iter_bvec(iter), nr_segs, dio,
+					&sdio, blkbits, &map_bh);
 
 	if (retval == -ENOTBLK) {
 		/*
-- 
1.8.3.4
