From: NeilBrown <neilb@suse.de>
To: Trond Myklebust <trond.myklebust@hammerspace.com>,
	Anna Schumaker <anna.schumaker@netapp.com>,
	Chuck Lever <chuck.lever@oracle.com>,
	Andrew Morton <akpm@linux-foundation.org>,
	Mel Gorman <mgorman@suse.de>,
	Christoph Hellwig <hch@infradead.org>,
	David Howells <dhowells@redhat.com>
Cc: linux-nfs@vger.kernel.org, linux-mm@kvack.org,
	linux-kernel@vger.kernel.org
Subject: [PATCH 10/23] MM: submit multipage write for SWP_FS_OPS swap-space
Date: Mon, 24 Jan 2022 14:48:32 +1100	[thread overview]
Message-ID: <164299611279.26253.12350012848236496937.stgit@noble.brown> (raw)
In-Reply-To: <164299573337.26253.7538614611220034049.stgit@noble.brown>

swap_writepage() is given one page at a time, but may be called
repeatedly in succession.
For block-device swap-space, the blk_plug functionality allows multiple
pages to be combined at lower layers.
That cannot be used for SWP_FS_OPS as blk_plug may not exist - it is
only active when CONFIG_BLOCK=y.  Consequently all swap writes over NFS
are single-page writes.
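
For comparison, block-device swap gets that batching from the standard
blk_plug pattern, roughly as sketched below (not code from this patch,
just an illustration of the mechanism that is unavailable without
CONFIG_BLOCK):

	struct blk_plug plug;

	blk_start_plug(&plug);
	/* submit a run of single-page bios; the block layer may merge
	 * adjacent ones into larger requests before dispatch
	 */
	blk_finish_plug(&plug);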

With this patch we pass a pointer-to-pointer via the wbc so that
swap_writepage() can store state between calls - much like the pointer
passed explicitly to swap_readpage().  After swap_writepage() has been
called some number of times, the accumulated state is passed to
swap_write_unplug(), which submits the combined request.
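
As an illustration (not part of the patch), a minimal sketch of how a
caller might drive the batching, modelled on the vmscan.c change below;
the writeback_control field values are assumptions for the example:

	struct swap_iocb *plug = NULL;
	struct writeback_control wbc = {
		.sync_mode	= WB_SYNC_NONE,
		.nr_to_write	= SWAP_CLUSTER_MAX,
		.for_reclaim	= 1,
		.plug		= &plug,	/* enable batching */
	};

	/* Each call appends one page to the pending swap_iocb; it is
	 * flushed automatically when the bvec array fills or the next
	 * page is not contiguous with the previous one.
	 */
	swap_writepage(page, &wbc);

	/* Once all pages have been submitted, flush anything still
	 * batched.
	 */
	if (plug)
		swap_write_unplug(plug);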

Signed-off-by: NeilBrown <neilb@suse.de>
---
 include/linux/writeback.h |    7 +++
 mm/page_io.c              |  103 +++++++++++++++++++++++++++++----------------
 mm/swap.h                 |    1 +
 mm/vmscan.c               |    9 +++-
 4 files changed, 82 insertions(+), 38 deletions(-)

diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index fec248ab1fec..6dcaa0639c0d 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -80,6 +80,13 @@ struct writeback_control {
 
 	unsigned punt_to_cgroup:1;	/* cgrp punting, see __REQ_CGROUP_PUNT */
 
+	/* To enable batching of swap writes to non-block-device backends,
+	 * "plug" can be set to point to a 'struct swap_iocb *'.  When all
+	 * swap writes have been submitted, if the swap_iocb is not NULL,
+	 * swap_write_unplug() should be called.
+	 */
+	struct swap_iocb **plug;
+
 #ifdef CONFIG_CGROUP_WRITEBACK
 	struct bdi_writeback *wb;	/* wb this writeback is issued under */
 	struct inode *inode;		/* inode being written out */
diff --git a/mm/page_io.c b/mm/page_io.c
index bcf655d650c8..b61c2cafc4f9 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -307,56 +307,74 @@ int sio_pool_init(void)
 static void sio_write_complete(struct kiocb *iocb, long ret)
 {
 	struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb);
-	struct page *page = sio->bvec[0].bv_page;
+	int p;
 
-	if (ret != 0 && ret != PAGE_SIZE) {
-		/*
-		 * In the case of swap-over-nfs, this can be a
-		 * temporary failure if the system has limited
-		 * memory for allocating transmit buffers.
-		 * Mark the page dirty and avoid
-		 * folio_rotate_reclaimable but rate-limit the
-		 * messages but do not flag PageError like
-		 * the normal direct-to-bio case as it could
-		 * be temporary.
-		 */
-		set_page_dirty(page);
-		ClearPageReclaim(page);
-		pr_err_ratelimited("Write error %ld on dio swapfile (%llu)\n",
-				   ret, page_file_offset(page));
-	} else
-		count_vm_event(PSWPOUT);
-	end_page_writeback(page);
+	for (p = 0; p < sio->pages; p++) {
+		struct page *page = sio->bvec[p].bv_page;
+
+		if (ret != 0 && ret != PAGE_SIZE * sio->pages) {
+			/*
+			 * In the case of swap-over-nfs, this can be a
+			 * temporary failure if the system has limited
+			 * memory for allocating transmit buffers.
+			 * Mark the page dirty and avoid
+			 * folio_rotate_reclaimable but rate-limit the
+			 * messages but do not flag PageError like
+			 * the normal direct-to-bio case as it could
+			 * be temporary.
+			 */
+			set_page_dirty(page);
+			ClearPageReclaim(page);
+			pr_err_ratelimited("Write error %ld on dio swapfile (%llu)\n",
+					   ret, page_file_offset(page));
+		} else
+			count_vm_event(PSWPOUT);
+		end_page_writeback(page);
+	}
 	mempool_free(sio, sio_pool);
 }
 
 static int swap_writepage_fs(struct page *page, struct writeback_control *wbc)
 {
-	struct swap_iocb *sio;
+	struct swap_iocb *sio = NULL;
 	struct swap_info_struct *sis = page_swap_info(page);
 	struct file *swap_file = sis->swap_file;
-	struct address_space *mapping = swap_file->f_mapping;
-	struct iov_iter from;
-	int ret;
+	loff_t pos = page_file_offset(page);
 
 	set_page_writeback(page);
 	unlock_page(page);
-	sio = mempool_alloc(sio_pool, GFP_NOIO);
-	init_sync_kiocb(&sio->iocb, swap_file);
-	sio->iocb.ki_complete = sio_write_complete;
-	sio->iocb.ki_pos = page_file_offset(page);
-	sio->bvec[0].bv_page = page;
-	sio->bvec[0].bv_len = PAGE_SIZE;
-	sio->bvec[0].bv_offset = 0;
-	iov_iter_bvec(&from, WRITE, &sio->bvec[0], 1, PAGE_SIZE);
-	ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
-	if (ret != -EIOCBQUEUED)
-		sio_write_complete(&sio->iocb, ret);
-	return ret;
+	if (wbc->plug)
+		sio = *wbc->plug;
+	if (sio) {
+		if (sio->iocb.ki_filp != swap_file ||
+		    sio->iocb.ki_pos + sio->pages * PAGE_SIZE != pos) {
+			swap_write_unplug(sio);
+			sio = NULL;
+		}
+	}
+	if (!sio) {
+		sio = mempool_alloc(sio_pool, GFP_NOIO);
+		init_sync_kiocb(&sio->iocb, swap_file);
+		sio->iocb.ki_complete = sio_write_complete;
+		sio->iocb.ki_pos = pos;
+		sio->pages = 0;
+	}
+	sio->bvec[sio->pages].bv_page = page;
+	sio->bvec[sio->pages].bv_len = PAGE_SIZE;
+	sio->bvec[sio->pages].bv_offset = 0;
+	sio->pages += 1;
+	if (sio->pages == ARRAY_SIZE(sio->bvec) || !wbc->plug) {
+		swap_write_unplug(sio);
+		sio = NULL;
+	}
+	if (wbc->plug)
+		*wbc->plug = sio;
+
+	return 0;
 }
 
 int __swap_writepage(struct page *page, struct writeback_control *wbc,
-		bio_end_io_t end_write_func)
+		     bio_end_io_t end_write_func)
 {
 	struct bio *bio;
 	int ret;
@@ -388,6 +406,19 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
 	return 0;
 }
 
+void swap_write_unplug(struct swap_iocb *sio)
+{
+	struct iov_iter from;
+	struct address_space *mapping = sio->iocb.ki_filp->f_mapping;
+	int ret;
+
+	iov_iter_bvec(&from, WRITE, sio->bvec, sio->pages,
+		      PAGE_SIZE * sio->pages);
+	ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
+	if (ret != -EIOCBQUEUED)
+		sio_write_complete(&sio->iocb, ret);
+}
+
 static void sio_read_complete(struct kiocb *iocb, long ret)
 {
 	struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb);
diff --git a/mm/swap.h b/mm/swap.h
index 0c79b2478f3f..0194ac153d40 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -13,6 +13,7 @@ static inline void swap_read_unplug(struct swap_iocb *plug)
 	if (unlikely(plug))
 		__swap_read_unplug(plug);
 }
+void swap_write_unplug(struct swap_iocb *sio);
 int swap_writepage(struct page *page, struct writeback_control *wbc);
 void end_swap_bio_write(struct bio *bio);
 int __swap_writepage(struct page *page, struct writeback_control *wbc,
diff --git a/mm/vmscan.c b/mm/vmscan.c
index ad5026d06aa8..f75c71490921 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1164,7 +1164,8 @@ typedef enum {
  * pageout is called by shrink_page_list() for each dirty page.
  * Calls ->writepage().
  */
-static pageout_t pageout(struct page *page, struct address_space *mapping)
+static pageout_t pageout(struct page *page, struct address_space *mapping,
+			 struct swap_iocb **plug)
 {
 	/*
 	 * If the page is dirty, only perform writeback if that write
@@ -1211,6 +1212,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
 			.range_start = 0,
 			.range_end = LLONG_MAX,
 			.for_reclaim = 1,
+			.plug = plug,
 		};
 
 		SetPageReclaim(page);
@@ -1537,6 +1539,7 @@ static unsigned int shrink_page_list(struct list_head *page_list,
 	unsigned int nr_reclaimed = 0;
 	unsigned int pgactivate = 0;
 	bool do_demote_pass;
+	struct swap_iocb *plug = NULL;
 
 	memset(stat, 0, sizeof(*stat));
 	cond_resched();
@@ -1817,7 +1820,7 @@ static unsigned int shrink_page_list(struct list_head *page_list,
 			 * starts and then write it out here.
 			 */
 			try_to_unmap_flush_dirty();
-			switch (pageout(page, mapping)) {
+			switch (pageout(page, mapping, &plug)) {
 			case PAGE_KEEP:
 				goto keep_locked;
 			case PAGE_ACTIVATE:
@@ -1971,6 +1974,8 @@ static unsigned int shrink_page_list(struct list_head *page_list,
 	list_splice(&ret_pages, page_list);
 	count_vm_events(PGACTIVATE, pgactivate);
 
+	if (plug)
+		swap_write_unplug(plug);
 	return nr_reclaimed;
 }
 



Thread overview: 56+ messages
2022-01-24  3:48 [PATCH 00/23 V3] Repair SWAP-over_NFS NeilBrown
2022-01-24  3:48 ` [PATCH 05/23] MM: reclaim mustn't enter FS for SWP_FS_OPS swap-space NeilBrown
2022-01-24  7:31   ` Christoph Hellwig
2022-01-24  3:48 ` [PATCH 03/23] MM: drop swap_set_page_dirty NeilBrown
2022-01-24  7:28   ` Christoph Hellwig
2022-01-24  3:48 ` [PATCH 14/23] NFS: swap IO handling is slightly different for O_DIRECT IO NeilBrown
2022-01-24  8:58   ` Christoph Hellwig
2022-01-24 13:22   ` Mark Hemment
2022-01-26 22:51     ` NeilBrown
2022-01-24  3:48 ` [PATCH 22/23] NFS: swap-out must always use STABLE writes NeilBrown
2022-01-26  3:45   ` Trond Myklebust
2022-01-26 21:42     ` NeilBrown
2022-01-24  3:48 ` [PATCH 23/23] SUNRPC: lock against ->sock changing during sysfs read NeilBrown
2022-01-24  3:48 ` [PATCH 08/23] DOC: update documentation for swap_activate and swap_rw NeilBrown
2022-01-24  8:50   ` Christoph Hellwig
2022-01-24  3:48 ` [PATCH 07/23] MM: perform async writes to SWP_FS_OPS swap-space using ->swap_rw NeilBrown
2022-01-24  8:49   ` Christoph Hellwig
2022-01-24  3:48 ` [PATCH 02/23] MM: extend block-plugging to cover all swap reads with read-ahead NeilBrown
2022-01-24  7:27   ` Christoph Hellwig
2022-01-26 21:47     ` NeilBrown
2022-01-26 23:09       ` Hugh Dickins
2022-01-27  0:32         ` NeilBrown
2022-01-24  3:48 ` [PATCH 16/23] SUNRPC/auth: async tasks mustn't block waiting for memory NeilBrown
2022-01-24  3:48 ` [PATCH 04/23] MM: move responsibility for setting SWP_FS_OPS to ->swap_activate NeilBrown
2022-01-24  7:30   ` Christoph Hellwig
2022-01-24  3:48 ` [PATCH 06/23] MM: introduce ->swap_rw and use it for reads from SWP_FS_OPS swap-space NeilBrown
2022-01-24  8:48   ` Christoph Hellwig
2022-01-24  3:48 ` [PATCH 15/23] SUNRPC/call_alloc: async tasks mustn't block waiting for memory NeilBrown
2022-01-24  3:48 ` [PATCH 20/23] SUNRPC: improve 'swap' handling: scheduling and PF_MEMALLOC NeilBrown
2022-01-24  3:48 ` [PATCH 01/23] MM: create new mm/swap.h header file NeilBrown
2022-02-07 13:51   ` Geert Uytterhoeven
2022-01-24  3:48 ` [PATCH 09/23] MM: submit multipage reads for SWP_FS_OPS swap-space NeilBrown
2022-01-24  8:25   ` kernel test robot
2022-01-24  8:52   ` Christoph Hellwig
2022-01-24  9:27   ` kernel test robot
2022-01-24 13:16   ` Mark Hemment
2022-01-26 22:04     ` NeilBrown
2022-02-08 11:07   ` Geert Uytterhoeven
2022-01-24  3:48 ` [PATCH 12/23] NFS: remove IS_SWAPFILE hack NeilBrown
2022-01-24  8:56   ` Christoph Hellwig
2022-01-24  3:48 ` [PATCH 19/23] NFS: discard NFS_RPC_SWAPFLAGS and RPC_TASK_ROOTCREDS NeilBrown
2022-01-24  3:48 ` [PATCH 17/23] SUNRPC/xprt: async tasks mustn't block waiting for memory NeilBrown
2022-01-24  3:48 ` [PATCH 18/23] SUNRPC: remove scheduling boost for "SWAPPER" tasks NeilBrown
2022-01-24  3:48 ` [PATCH 21/23] NFSv4: keep state manager thread active if swap is enabled NeilBrown
2022-01-24  3:48 ` [PATCH 11/23] VFS: Add FMODE_CAN_ODIRECT file flag NeilBrown
2022-01-24  8:56   ` Christoph Hellwig
2022-01-26 22:14     ` NeilBrown
2022-01-24  3:48 ` NeilBrown [this message]
2022-01-24  8:55   ` [PATCH 10/23] MM: submit multipage write for SWP_FS_OPS swap-space Christoph Hellwig
2022-01-24 10:29   ` kernel test robot
2022-01-24  3:48 ` [PATCH 13/23] NFS: rename nfs_direct_IO and use as ->swap_rw NeilBrown
2022-01-24  8:57   ` Christoph Hellwig
2022-02-07 17:55 ` [PATCH 00/23 V3] Repair SWAP-over_NFS Geert Uytterhoeven
