All of lore.kernel.org
 help / color / mirror / Atom feed
From: David Howells <dhowells@redhat.com>
To: Al Viro <viro@zeniv.linux.org.uk>
Cc: Jens Axboe <axboe@kernel.dk>, Jan Kara <jack@suse.cz>,
	Christoph Hellwig <hch@lst.de>,
	Matthew Wilcox <willy@infradead.org>,
	Logan Gunthorpe <logang@deltatee.com>,
	linux-fsdevel@vger.kernel.org, linux-block@vger.kernel.org,
	dhowells@redhat.com, Christoph Hellwig <hch@infradead.org>,
	Matthew Wilcox <willy@infradead.org>,
	Jens Axboe <axboe@kernel.dk>, Jan Kara <jack@suse.cz>,
	Jeff Layton <jlayton@kernel.org>,
	Logan Gunthorpe <logang@deltatee.com>,
	linux-fsdevel@vger.kernel.org, linux-block@vger.kernel.org,
	linux-kernel@vger.kernel.org
Subject: [PATCH v6 18/34] dio: Pin pages rather than ref'ing if appropriate
Date: Mon, 16 Jan 2023 23:10:11 +0000	[thread overview]
Message-ID: <167391061117.2311931.16807283804788007499.stgit@warthog.procyon.org.uk> (raw)
In-Reply-To: <167391047703.2311931.8115712773222260073.stgit@warthog.procyon.org.uk>

Convert the generic direct-I/O code to use iov_iter_extract_pages() instead
of iov_iter_get_pages().  This will pin pages or leave them unaltered
rather than getting a ref on them as appropriate to the iterator.

The pages need to be pinned for DIO-read rather than having refs taken on
them to prevent VM copy-on-write from malfunctioning during a concurrent
fork() (the result of the I/O would otherwise end up only visible to the
child process and not the parent).

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Al Viro <viro@zeniv.linux.org.uk>
cc: Jens Axboe <axboe@kernel.dk>
cc: Jan Kara <jack@suse.cz>
cc: Christoph Hellwig <hch@lst.de>
cc: Matthew Wilcox <willy@infradead.org>
cc: Logan Gunthorpe <logang@deltatee.com>
cc: linux-fsdevel@vger.kernel.org
cc: linux-block@vger.kernel.org
---

 fs/direct-io.c |   57 ++++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 37 insertions(+), 20 deletions(-)

diff --git a/fs/direct-io.c b/fs/direct-io.c
index b1e26a706e31..b4d2c9f85a5b 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -142,9 +142,11 @@ struct dio {
 
 	/*
 	 * pages[] (and any fields placed after it) are not zeroed out at
-	 * allocation time.  Don't add new fields after pages[] unless you
-	 * wish that they not be zeroed.
+	 * allocation time.  Don't add new fields after pages[] unless you wish
+	 * that they not be zeroed.  Pages may have a ref taken, a pin emplaced
+	 * or no retention measures.
 	 */
+	unsigned int cleanup_mode;	/* How pages should be cleaned up (0/FOLL_GET/PIN) */
 	union {
 		struct page *pages[DIO_PAGES];	/* page buffer */
 		struct work_struct complete_work;/* deferred AIO completion */
@@ -167,12 +169,13 @@ static inline unsigned dio_pages_present(struct dio_submit *sdio)
 static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
 {
 	const enum req_op dio_op = dio->opf & REQ_OP_MASK;
+	unsigned int gup_flags =
+		op_is_write(dio_op) ? FOLL_SOURCE_BUF : FOLL_DEST_BUF;
+	struct page **pages = dio->pages;
 	ssize_t ret;
 
-	ret = iov_iter_get_pages(sdio->iter, dio->pages, LONG_MAX, DIO_PAGES,
-				 &sdio->from,
-				 op_is_write(dio_op) ?
-				 FOLL_SOURCE_BUF : FOLL_DEST_BUF);
+	ret = iov_iter_extract_pages(sdio->iter, &pages, LONG_MAX, DIO_PAGES,
+				     gup_flags, &sdio->from);
 
 	if (ret < 0 && sdio->blocks_available && dio_op == REQ_OP_WRITE) {
 		struct page *page = ZERO_PAGE(0);
@@ -183,7 +186,7 @@ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
 		 */
 		if (dio->page_errors == 0)
 			dio->page_errors = ret;
-		get_page(page);
+		dio->cleanup_mode = 0;
 		dio->pages[0] = page;
 		sdio->head = 0;
 		sdio->tail = 1;
@@ -197,6 +200,8 @@ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
 		sdio->head = 0;
 		sdio->tail = (ret + PAGE_SIZE - 1) / PAGE_SIZE;
 		sdio->to = ((ret - 1) & (PAGE_SIZE - 1)) + 1;
+		dio->cleanup_mode =
+			iov_iter_extract_mode(sdio->iter, gup_flags);
 		return 0;
 	}
 	return ret;	
@@ -400,6 +405,10 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
 	 * we request a valid number of vectors.
 	 */
 	bio = bio_alloc(bdev, nr_vecs, dio->opf, GFP_KERNEL);
+	if (!(dio->cleanup_mode & FOLL_GET))
+		bio_clear_flag(bio, BIO_PAGE_REFFED);
+	if (dio->cleanup_mode & FOLL_PIN)
+		bio_set_flag(bio, BIO_PAGE_PINNED);
 	bio->bi_iter.bi_sector = first_sector;
 	if (dio->is_async)
 		bio->bi_end_io = dio_bio_end_aio;
@@ -443,13 +452,18 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
 	sdio->logical_offset_in_bio = 0;
 }
 
+static void dio_cleanup_page(struct dio *dio, struct page *page)
+{
+	page_put_unpin(page, dio->cleanup_mode);
+}
+
 /*
  * Release any resources in case of a failure
  */
 static inline void dio_cleanup(struct dio *dio, struct dio_submit *sdio)
 {
 	while (sdio->head < sdio->tail)
-		put_page(dio->pages[sdio->head++]);
+		dio_cleanup_page(dio, dio->pages[sdio->head++]);
 }
 
 /*
@@ -704,7 +718,7 @@ static inline int dio_new_bio(struct dio *dio, struct dio_submit *sdio,
  *
  * Return zero on success.  Non-zero means the caller needs to start a new BIO.
  */
-static inline int dio_bio_add_page(struct dio_submit *sdio)
+static inline int dio_bio_add_page(struct dio *dio, struct dio_submit *sdio)
 {
 	int ret;
 
@@ -771,11 +785,11 @@ static inline int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio,
 			goto out;
 	}
 
-	if (dio_bio_add_page(sdio) != 0) {
+	if (dio_bio_add_page(dio, sdio) != 0) {
 		dio_bio_submit(dio, sdio);
 		ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh);
 		if (ret == 0) {
-			ret = dio_bio_add_page(sdio);
+			ret = dio_bio_add_page(dio, sdio);
 			BUG_ON(ret != 0);
 		}
 	}
@@ -832,13 +846,16 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page,
 	 */
 	if (sdio->cur_page) {
 		ret = dio_send_cur_page(dio, sdio, map_bh);
-		put_page(sdio->cur_page);
+		dio_cleanup_page(dio, sdio->cur_page);
 		sdio->cur_page = NULL;
 		if (ret)
 			return ret;
 	}
 
-	get_page(page);		/* It is in dio */
+	ret = try_grab_page(page, dio->cleanup_mode);		/* It is in dio */
+	if (ret < 0)
+		return ret;
+
 	sdio->cur_page = page;
 	sdio->cur_page_offset = offset;
 	sdio->cur_page_len = len;
@@ -853,7 +870,7 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page,
 		ret = dio_send_cur_page(dio, sdio, map_bh);
 		if (sdio->bio)
 			dio_bio_submit(dio, sdio);
-		put_page(sdio->cur_page);
+		dio_cleanup_page(dio, sdio->cur_page);
 		sdio->cur_page = NULL;
 	}
 	return ret;
@@ -954,7 +971,7 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,
 
 				ret = get_more_blocks(dio, sdio, map_bh);
 				if (ret) {
-					put_page(page);
+					dio_cleanup_page(dio, page);
 					goto out;
 				}
 				if (!buffer_mapped(map_bh))
@@ -999,7 +1016,7 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,
 
 				/* AKPM: eargh, -ENOTBLK is a hack */
 				if (dio_op == REQ_OP_WRITE) {
-					put_page(page);
+					dio_cleanup_page(dio, page);
 					return -ENOTBLK;
 				}
 
@@ -1012,7 +1029,7 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,
 				if (sdio->block_in_file >=
 						i_size_aligned >> blkbits) {
 					/* We hit eof */
-					put_page(page);
+					dio_cleanup_page(dio, page);
 					goto out;
 				}
 				zero_user(page, from, 1 << blkbits);
@@ -1052,7 +1069,7 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,
 						  sdio->next_block_for_io,
 						  map_bh);
 			if (ret) {
-				put_page(page);
+				dio_cleanup_page(dio, page);
 				goto out;
 			}
 			sdio->next_block_for_io += this_chunk_blocks;
@@ -1068,7 +1085,7 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,
 		}
 
 		/* Drop the ref which was taken in get_user_pages() */
-		put_page(page);
+		dio_cleanup_page(dio, page);
 	}
 out:
 	return ret;
@@ -1288,7 +1305,7 @@ ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
 		ret2 = dio_send_cur_page(dio, &sdio, &map_bh);
 		if (retval == 0)
 			retval = ret2;
-		put_page(sdio.cur_page);
+		dio_cleanup_page(dio, sdio.cur_page);
 		sdio.cur_page = NULL;
 	}
 	if (sdio.bio)



  parent reply	other threads:[~2023-01-16 23:14 UTC|newest]

Thread overview: 91+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-01-16 23:07 [PATCH v6 00/34] iov_iter: Improve page extraction (ref, pin or just list) David Howells
2023-01-16 23:08 ` [PATCH v6 01/34] vfs: Unconditionally set IOCB_WRITE in call_write_iter() David Howells
2023-01-17  7:52   ` Christoph Hellwig
2023-01-18 22:11     ` Al Viro
2023-01-19  5:44       ` Christoph Hellwig
2023-01-19 11:34       ` David Howells
2023-01-19 16:48         ` Christoph Hellwig
2023-01-19 21:14         ` David Howells
2023-01-17  8:28   ` David Howells
2023-01-17  8:44     ` Christoph Hellwig
2023-01-17 11:11   ` David Howells
2023-01-17 11:11     ` Christoph Hellwig
2023-01-18 22:05   ` Al Viro
2023-01-19  5:41     ` Christoph Hellwig
2023-01-19 10:01   ` David Howells
2023-01-19 16:46     ` Christoph Hellwig
2023-01-19 11:06   ` David Howells
2023-01-16 23:08 ` [PATCH v6 02/34] iov_iter: Use IOCB/IOMAP_WRITE/op_is_write rather than iterator direction David Howells
2023-01-17  7:55   ` Christoph Hellwig
2023-01-18 22:18     ` Al Viro
2023-01-16 23:08 ` [PATCH v6 03/34] iov_iter: Pass I/O direction into iov_iter_get_pages*() David Howells
2023-01-17  7:57   ` Christoph Hellwig
2023-01-17  8:07     ` David Hildenbrand
2023-01-17  8:09       ` Christoph Hellwig
2023-01-18 23:03     ` Al Viro
2023-01-19  0:15       ` Al Viro
2023-01-19  2:11         ` Al Viro
2023-01-19  5:47           ` Christoph Hellwig
2023-01-19  5:47         ` Christoph Hellwig
2023-01-17  8:44   ` David Howells
2023-01-17  8:46     ` Christoph Hellwig
2023-01-17  8:47     ` David Hildenbrand
2023-01-16 23:08 ` [PATCH v6 04/34] iov_iter: Remove iov_iter_get_pages2/pages_alloc2() David Howells
2023-01-16 23:08 ` [PATCH v6 05/34] iov_iter: Change the direction macros into an enum David Howells
2023-01-18 23:14   ` Al Viro
2023-01-18 23:17   ` David Howells
2023-01-18 23:19     ` Al Viro
2023-01-18 23:24     ` David Howells
2023-01-16 23:08 ` [PATCH v6 06/34] iov_iter: Use the direction in the iterator functions David Howells
2023-01-17  7:58   ` Christoph Hellwig
2023-01-18 22:28     ` Al Viro
2023-01-19  5:45       ` Christoph Hellwig
2023-01-18 23:15   ` Al Viro
2023-01-16 23:08 ` [PATCH v6 07/34] iov_iter: Add a function to extract a page list from an iterator David Howells
2023-01-17  8:01   ` Christoph Hellwig
2023-01-17  8:19   ` David Howells
2023-01-16 23:08 ` [PATCH v6 08/34] mm: Provide a helper to drop a pin/ref on a page David Howells
2023-01-17  8:02   ` Christoph Hellwig
2023-01-17  8:21   ` David Howells
2023-01-16 23:09 ` [PATCH v6 09/34] bio: Rename BIO_NO_PAGE_REF to BIO_PAGE_REFFED and invert the meaning David Howells
2023-01-17  8:02   ` Christoph Hellwig
2023-01-16 23:09 ` [PATCH v6 10/34] mm, block: Make BIO_PAGE_REFFED/PINNED the same as FOLL_GET/PIN numerically David Howells
2023-01-17  8:03   ` Christoph Hellwig
2023-01-16 23:09 ` [PATCH v6 11/34] iov_iter, block: Make bio structs pin pages rather than ref'ing if appropriate David Howells
2023-01-17  8:07   ` Christoph Hellwig
2023-01-17  8:26   ` David Howells
2023-01-17  8:44     ` Christoph Hellwig
2023-01-16 23:09 ` [PATCH v6 12/34] bio: Fix bio_flagged() so that gcc can better optimise it David Howells
2023-01-17  8:07   ` Christoph Hellwig
2023-01-16 23:09 ` [PATCH v6 13/34] netfs: Add a function to extract a UBUF or IOVEC into a BVEC iterator David Howells
2023-01-16 23:09 ` [PATCH v6 14/34] netfs: Add a function to extract an iterator into a scatterlist David Howells
2023-01-16 23:09 ` [PATCH v6 15/34] af_alg: Pin pages rather than ref'ing if appropriate David Howells
2023-01-16 23:09 ` [PATCH v6 16/34] af_alg: [RFC] Use netfs_extract_iter_to_sg() to create scatterlists David Howells
2023-01-16 23:10 ` [PATCH v6 17/34] scsi: [RFC] Use netfs_extract_iter_to_sg() David Howells
2023-01-16 23:10 ` David Howells [this message]
2023-01-19  5:04   ` [PATCH v6 18/34] dio: Pin pages rather than ref'ing if appropriate Al Viro
2023-01-19  5:51     ` Christoph Hellwig
2023-01-16 23:10 ` [PATCH v6 19/34] fuse: " David Howells
2023-01-16 23:10 ` [PATCH v6 20/34] vfs: Make splice use iov_iter_extract_pages() David Howells
2023-01-19  2:31   ` Al Viro
2023-01-16 23:10 ` [PATCH v6 21/34] 9p: Pin pages rather than ref'ing if appropriate David Howells
2023-01-19  2:52   ` Al Viro
2023-01-19 16:44   ` David Howells
2023-01-19 16:51     ` Christoph Hellwig
2023-01-16 23:10 ` [PATCH v6 22/34] nfs: " David Howells
2023-01-16 23:10 ` [PATCH v6 23/34] cifs: Implement splice_read to pass down ITER_BVEC not ITER_PIPE David Howells
2023-01-16 23:10 ` [PATCH v6 24/34] cifs: Add a function to build an RDMA SGE list from an iterator David Howells
2023-01-16 23:11 ` [PATCH v6 25/34] cifs: Add a function to Hash the contents of " David Howells
2023-01-16 23:11 ` [PATCH v6 26/34] cifs: Add some helper functions David Howells
2023-01-16 23:11 ` [PATCH v6 27/34] cifs: Add a function to read into an iter from a socket David Howells
2023-01-16 23:11 ` [PATCH v6 28/34] cifs: Change the I/O paths to use an iterator rather than a page list David Howells
2023-01-16 23:11 ` [PATCH v6 29/34] cifs: Build the RDMA SGE list directly from an iterator David Howells
2023-01-16 23:11 ` [PATCH v6 30/34] cifs: Remove unused code David Howells
2023-01-16 23:11 ` [PATCH v6 31/34] cifs: Fix problem with encrypted RDMA data read David Howells
2023-01-19 16:25   ` Stefan Metzmacher
2023-01-16 23:11 ` [PATCH v6 32/34] cifs: DIO to/from KVEC-type iterators should now work David Howells
2023-01-16 23:12 ` [PATCH v6 33/34] net: [RFC][WIP] Mark each skb_frags as to how they should be cleaned up David Howells
2023-01-16 23:12 ` [PATCH v6 34/34] net: [RFC][WIP] Make __zerocopy_sg_from_iter() correctly pin or leave pages unref'd David Howells
2023-01-17  7:46 ` [PATCH v6 00/34] iov_iter: Improve page extraction (ref, pin or just list) Christoph Hellwig
2023-01-18 14:00 ` [PATCH v6 09/34] bio: Rename BIO_NO_PAGE_REF to BIO_PAGE_REFFED and invert the meaning David Howells
2023-01-18 14:09   ` Christoph Hellwig

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=167391061117.2311931.16807283804788007499.stgit@warthog.procyon.org.uk \
    --to=dhowells@redhat.com \
    --cc=axboe@kernel.dk \
    --cc=hch@infradead.org \
    --cc=hch@lst.de \
    --cc=jack@suse.cz \
    --cc=jlayton@kernel.org \
    --cc=linux-block@vger.kernel.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=logang@deltatee.com \
    --cc=viro@zeniv.linux.org.uk \
    --cc=willy@infradead.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.