All of lore.kernel.org
 help / color / mirror / Atom feed
From: NeilBrown <neilb@suse.de>
To: Trond Myklebust <trond.myklebust@hammerspace.com>,
	Anna Schumaker <anna.schumaker@netapp.com>,
	Chuck Lever <chuck.lever@oracle.com>,
	Andrew Morton <akpm@linux-foundation.org>,
	Mel Gorman <mgorman@suse.de>,
	Christoph Hellwig <hch@infradead.org>,
	David Howells <dhowells@redhat.com>
Cc: linux-nfs@vger.kernel.org, linux-mm@kvack.org,
	linux-kernel@vger.kernel.org
Subject: [PATCH 07/18] MM: submit multipage write for SWP_FS_OPS swap-space
Date: Fri, 17 Dec 2021 10:48:23 +1100	[thread overview]
Message-ID: <163969850299.20885.11549845125423716814.stgit@noble.brown> (raw)
In-Reply-To: <163969801519.20885.3977673503103544412.stgit@noble.brown>

swap_writepage() is given one page at a time, but may be called repeatedly
in succession.
For block-device swapspace, the blk_plug functionality allows the
multiple pages to be combined together at lower layers.
That cannot be used for SWP_FS_OPS as blk_plug may not exist - it is
only active when CONFIG_BLOCK=y.  Consequently all swap writes over NFS
are single page writes.

With this patch we pass a pointer-to-pointer via the wbc.
swap_writepage can store state between calls - much like the pointer
passed explicitly to swap_readpage.  After calling swap_writepage() some
number of times, the state will be passed to swap_write_unplug() which
can submit the combined request.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 include/linux/writeback.h |    7 +++
 mm/page_io.c              |   98 ++++++++++++++++++++++++++++++---------------
 mm/swap.h                 |    1 
 mm/vmscan.c               |    9 +++-
 4 files changed, 80 insertions(+), 35 deletions(-)

diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 3bfd487d1dd2..16f780b618d2 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -79,6 +79,13 @@ struct writeback_control {
 
 	unsigned punt_to_cgroup:1;	/* cgrp punting, see __REQ_CGROUP_PUNT */
 
+	/* To enable batching of swap writes to non-block-device backends,
+	 * "plug" can be set to point to a 'struct swap_iocb *'.  When all swap
+	 * writes have been submitted, if the swap_iocb pointer is not NULL,
+	 * swap_write_unplug() should be called.
+	 */
+	struct swap_iocb **plug;
+
 #ifdef CONFIG_CGROUP_WRITEBACK
 	struct bdi_writeback *wb;	/* wb this writeback is issued under */
 	struct inode *inode;		/* inode being written out */
diff --git a/mm/page_io.c b/mm/page_io.c
index 03fbf9463081..92a31df467a2 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -304,26 +304,30 @@ int sio_pool_init(void)
 static void sio_write_complete(struct kiocb *iocb, long ret)
 {
 	struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb);
-	struct page *page = sio->bvec[0].bv_page;
+	int p;
 
-	if (ret != 0 && ret != PAGE_SIZE) {
-		/*
-		 * In the case of swap-over-nfs, this can be a
-		 * temporary failure if the system has limited
-		 * memory for allocating transmit buffers.
-		 * Mark the page dirty and avoid
-		 * folio_rotate_reclaimable but rate-limit the
-		 * messages but do not flag PageError like
-		 * the normal direct-to-bio case as it could
-		 * be temporary.
-		 */
-		set_page_dirty(page);
-		ClearPageReclaim(page);
-		pr_err_ratelimited("Write error %ld on dio swapfile (%llu)\n",
-				   ret, page_file_offset(page));
-	} else
-		count_vm_event(PSWPOUT);
-	end_page_writeback(page);
+	for (p = 0; p < sio->pages; p++) {
+		struct page *page = sio->bvec[p].bv_page;
+
+		if (ret != 0 && ret != PAGE_SIZE * sio->pages) {
+			/*
+			 * In the case of swap-over-nfs, this can be a
+			 * temporary failure if the system has limited
+			 * memory for allocating transmit buffers.
+			 * Mark the page dirty and avoid
+			 * folio_rotate_reclaimable but rate-limit the
+			 * messages but do not flag PageError like
+			 * the normal direct-to-bio case as it could
+			 * be temporary.
+			 */
+			set_page_dirty(page);
+			ClearPageReclaim(page);
+			pr_err_ratelimited("Write error %ld on dio swapfile (%llu)\n",
+					   ret, page_file_offset(page));
+		} else
+			count_vm_event(PSWPOUT);
+		end_page_writeback(page);
+	}
 	mempool_free(sio, sio_pool);
 }
 
@@ -336,24 +340,39 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
 
 	VM_BUG_ON_PAGE(!PageSwapCache(page), page);
 	if (data_race(sis->flags & SWP_FS_OPS)) {
-		struct swap_iocb *sio;
+		struct swap_iocb *sio = NULL;
 		struct file *swap_file = sis->swap_file;
-		struct address_space *mapping = swap_file->f_mapping;
-		struct iov_iter from;
+		loff_t pos = page_file_offset(page);
 
 		set_page_writeback(page);
 		unlock_page(page);
-		sio = mempool_alloc(sio_pool, GFP_NOIO);
-		init_sync_kiocb(&sio->iocb, swap_file);
-		sio->iocb.ki_complete = sio_write_complete;
-		sio->iocb.ki_pos = page_file_offset(page);
-		sio->bvec[0].bv_page = page;
-		sio->bvec[0].bv_len = PAGE_SIZE;
-		sio->bvec[0].bv_offset = 0;
-		iov_iter_bvec(&from, WRITE, &sio->bvec[0], 1, PAGE_SIZE);
-		ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
-		if (ret != -EIOCBQUEUED)
-			sio_write_complete(&sio->iocb, ret);
+
+		if (wbc->plug)
+			sio = *wbc->plug;
+		if (sio) {
+			if (sio->iocb.ki_filp != swap_file ||
+			    sio->iocb.ki_pos + sio->pages * PAGE_SIZE != pos) {
+				swap_write_unplug(sio);
+				sio = NULL;
+			}
+		}
+		if (!sio) {
+			sio = mempool_alloc(sio_pool, GFP_NOIO);
+			init_sync_kiocb(&sio->iocb, swap_file);
+			sio->iocb.ki_complete = sio_write_complete;
+			sio->iocb.ki_pos = pos;
+			sio->pages = 0;
+		}
+		sio->bvec[sio->pages].bv_page = page;
+		sio->bvec[sio->pages].bv_len = PAGE_SIZE;
+		sio->bvec[sio->pages].bv_offset = 0;
+		sio->pages += 1;
+		if (sio->pages == ARRAY_SIZE(sio->bvec) || !wbc->plug) {
+			swap_write_unplug(sio);
+			sio = NULL;
+		}
+		if (wbc->plug)
+			*wbc->plug = sio;
 
-		return ret;
+		return 0;
 	}
@@ -380,6 +399,19 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
 	return 0;
 }
 
+void swap_write_unplug(struct swap_iocb *sio)
+{
+	struct iov_iter from;
+	struct address_space *mapping = sio->iocb.ki_filp->f_mapping;
+	int ret;
+
+	iov_iter_bvec(&from, WRITE, sio->bvec, sio->pages,
+		      PAGE_SIZE * sio->pages);
+	ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
+	if (ret != -EIOCBQUEUED)
+		sio_write_complete(&sio->iocb, ret);
+}
+
 static void sio_read_complete(struct kiocb *iocb, long ret)
 {
 	struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb);
diff --git a/mm/swap.h b/mm/swap.h
index ce967abc5f46..f4d0edda6e59 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -13,6 +13,7 @@ static inline void swap_read_unplug(struct swap_iocb *plug)
 	if (unlikely(plug))
 		__swap_read_unplug(plug);
 }
+void swap_write_unplug(struct swap_iocb *sio);
 int swap_writepage(struct page *page, struct writeback_control *wbc);
 void end_swap_bio_write(struct bio *bio);
 int __swap_writepage(struct page *page, struct writeback_control *wbc,
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5f460d174b1b..50a363e63102 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1123,7 +1123,8 @@ typedef enum {
  * pageout is called by shrink_page_list() for each dirty page.
  * Calls ->writepage().
  */
-static pageout_t pageout(struct page *page, struct address_space *mapping)
+static pageout_t pageout(struct page *page, struct address_space *mapping,
+			 struct swap_iocb **plug)
 {
 	/*
 	 * If the page is dirty, only perform writeback if that write
@@ -1170,6 +1171,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
 			.range_start = 0,
 			.range_end = LLONG_MAX,
 			.for_reclaim = 1,
+			.plug = plug,
 		};
 
 		SetPageReclaim(page);
@@ -1495,6 +1497,7 @@ static unsigned int shrink_page_list(struct list_head *page_list,
 	unsigned int nr_reclaimed = 0;
 	unsigned int pgactivate = 0;
 	bool do_demote_pass;
+	struct swap_iocb *plug = NULL;
 
 	memset(stat, 0, sizeof(*stat));
 	cond_resched();
@@ -1780,7 +1783,7 @@ static unsigned int shrink_page_list(struct list_head *page_list,
 			 * starts and then write it out here.
 			 */
 			try_to_unmap_flush_dirty();
-			switch (pageout(page, mapping)) {
+			switch (pageout(page, mapping, &plug)) {
 			case PAGE_KEEP:
 				goto keep_locked;
 			case PAGE_ACTIVATE:
@@ -1934,6 +1937,8 @@ static unsigned int shrink_page_list(struct list_head *page_list,
 	list_splice(&ret_pages, page_list);
 	count_vm_events(PGACTIVATE, pgactivate);
 
+	if (plug)
+		swap_write_unplug(plug);
 	return nr_reclaimed;
 }
 



  parent reply	other threads:[~2021-12-16 23:53 UTC|newest]

Thread overview: 43+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-12-16 23:48 [PATCH 00/18 V2] Repair SWAP-over-NFS NeilBrown
2021-12-16 23:48 ` [PATCH 05/18] MM: reclaim mustn't enter FS for SWP_FS_OPS swap-space NeilBrown
2021-12-17  8:51   ` kernel test robot
2021-12-17  8:51     ` kernel test robot
2021-12-21  8:43   ` Christoph Hellwig
2021-12-16 23:48 ` [PATCH 01/18] Structural cleanup for filesystem-based swap NeilBrown
2021-12-17 10:33   ` kernel test robot
2021-12-17 10:33     ` kernel test robot
2021-12-21  8:34   ` Christoph Hellwig
2021-12-16 23:48 ` [PATCH 03/18] MM: use ->swap_rw for reads from SWP_FS_OPS swap-space NeilBrown
2021-12-20 12:16   ` Mark Hemment
2021-12-21  8:40   ` Christoph Hellwig
2021-12-16 23:48 ` [PATCH 02/18] MM: create new mm/swap.h header file NeilBrown
2021-12-17 10:03   ` kernel test robot
2021-12-17 10:03     ` kernel test robot
2021-12-21  8:36   ` Christoph Hellwig
2021-12-16 23:48 ` [PATCH 04/18] MM: perform async writes to SWP_FS_OPS swap-space NeilBrown
2021-12-21  8:41   ` Christoph Hellwig
2021-12-16 23:48 ` [PATCH 06/18] MM: submit multipage reads for " NeilBrown
2021-12-17  7:09   ` kernel test robot
2021-12-17  7:09     ` kernel test robot
2021-12-21  8:44   ` Christoph Hellwig
2021-12-16 23:48 ` [PATCH 17/18] NFSv4: keep state manager thread active if swap is enabled NeilBrown
2021-12-16 23:48 ` [PATCH 10/18] NFS: swap IO handling is slightly different for O_DIRECT IO NeilBrown
2021-12-20 15:02   ` Mark Hemment
2021-12-16 23:48 ` [PATCH 12/18] SUNRPC/auth: async tasks mustn't block waiting for memory NeilBrown
2021-12-16 23:48 ` [PATCH 15/18] NFS: discard NFS_RPC_SWAPFLAGS and RPC_TASK_ROOTCREDS NeilBrown
2021-12-16 23:48 ` [PATCH 14/18] SUNRPC: remove scheduling boost for "SWAPPER" tasks NeilBrown
2021-12-16 23:48 ` [PATCH 13/18] SUNRPC/xprt: async tasks mustn't block waiting for memory NeilBrown
2021-12-16 23:48 ` [PATCH 11/18] SUNRPC/call_alloc: " NeilBrown
2021-12-16 23:48 ` [PATCH 16/18] SUNRPC: improve 'swap' handling: scheduling and PF_MEMALLOC NeilBrown
2021-12-16 23:48 ` [PATCH 08/18] MM: Add AS_CAN_DIO mapping flag NeilBrown
2021-12-19 13:38   ` Mark Hemment
2021-12-19 20:59     ` NeilBrown
2021-12-21  8:46   ` Christoph Hellwig
2022-01-19  3:54     ` NeilBrown
2021-12-16 23:48 ` NeilBrown [this message]
2021-12-20 12:21   ` [PATCH 07/18] MM: submit multipage write for SWP_FS_OPS swap-space Mark Hemment
2021-12-16 23:48 ` [PATCH 09/18] NFS: rename nfs_direct_IO and use as ->swap_rw NeilBrown
2021-12-16 23:48 ` [PATCH 18/18] NFS: swap-out must always use STABLE writes NeilBrown
2021-12-17 21:29 ` [PATCH 00/18 V2] Repair SWAP-over-NFS Anna Schumaker
2021-12-19 21:07   ` NeilBrown
2021-12-21  8:48     ` Christoph Hellwig

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=163969850299.20885.11549845125423716814.stgit@noble.brown \
    --to=neilb@suse.de \
    --cc=akpm@linux-foundation.org \
    --cc=anna.schumaker@netapp.com \
    --cc=chuck.lever@oracle.com \
    --cc=dhowells@redhat.com \
    --cc=hch@infradead.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=linux-nfs@vger.kernel.org \
    --cc=mgorman@suse.de \
    --cc=trond.myklebust@hammerspace.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.