From: NeilBrown <neilb@suse.de>
To: Trond Myklebust <trond.myklebust@hammerspace.com>,
	Anna Schumaker <anna.schumaker@netapp.com>,
	Chuck Lever <chuck.lever@oracle.com>,
	Andrew Morton <akpm@linux-foundation.org>,
	Mel Gorman <mgorman@suse.de>,
	Christoph Hellwig <hch@infradead.org>,
	David Howells <dhowells@redhat.com>
Cc: linux-nfs@vger.kernel.org, linux-mm@kvack.org,
	linux-kernel@vger.kernel.org
Subject: [PATCH 09/23] MM: submit multipage reads for SWP_FS_OPS swap-space
Date: Mon, 24 Jan 2022 14:48:32 +1100
Message-ID: <164299611278.26253.14950274629759580371.stgit@noble.brown>
In-Reply-To: <164299573337.26253.7538614611220034049.stgit@noble.brown>

swap_readpage() is given one page at a time, but may be called
repeatedly in succession.
For block-device swap-space, the blk_plug functionality allows multiple
pages to be combined together at lower layers.
That cannot be used for SWP_FS_OPS as blk_plug may not exist - it is
only active when CONFIG_BLOCK=y.  Consequently all swap reads over NFS
are single-page reads.

With this patch we pass in a pointer-to-pointer where swap_readpage()
can store state between calls - much like the effect of blk_plug.
After calling swap_readpage() some number of times, the state will be
passed to swap_read_unplug() which can submit the combined request.
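
The intended calling pattern, sketched here with a hypothetical
pages[]/nr_pages pair purely for illustration, is the one the converted
callers in madvise.c and swap_state.c follow:

	struct swap_iocb *splug = NULL;
	int i;

	for (i = 0; i < nr_pages; i++)
		swap_readpage(pages[i], false, &splug);
	/* Submit whatever was batched; a no-op if splug is still NULL. */
	swap_read_unplug(splug);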

Some callers currently call blk_finish_plug() *before* the final call to
swap_readpage(), so the last page cannot be included.  This patch moves
blk_finish_plug() to after the last call, and calls swap_read_unplug()
there too.
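
For example, the tail of swap_cluster_readahead() now reads (abridged
from the diff below):

	page = read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll,
				     &splug);
	blk_finish_plug(&plug);
	swap_read_unplug(splug);	/* submit any pending SWP_FS_OPS read */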

Signed-off-by: NeilBrown <neilb@suse.de>
---
 mm/madvise.c    |    8 +++-
 mm/memory.c     |    2 +
 mm/page_io.c    |  102 +++++++++++++++++++++++++++++++++++--------------------
 mm/swap.h       |   16 +++++++--
 mm/swap_state.c |   19 +++++++---
 5 files changed, 98 insertions(+), 49 deletions(-)

diff --git a/mm/madvise.c b/mm/madvise.c
index 1ee4b7583379..2b1ab30af141 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -225,6 +225,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
 	pte_t *orig_pte;
 	struct vm_area_struct *vma = walk->private;
 	unsigned long index;
+	struct swap_iocb *splug = NULL;
 
 	if (pmd_none_or_trans_huge_or_clear_bad(pmd))
 		return 0;
@@ -246,10 +247,11 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
 			continue;
 
 		page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
-							vma, index, false);
+					     vma, index, false, &splug);
 		if (page)
 			put_page(page);
 	}
+	swap_read_unplug(splug);
 
 	return 0;
 }
@@ -265,6 +267,7 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma,
 	XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
 	pgoff_t end_index = linear_page_index(vma, end + PAGE_SIZE - 1);
 	struct page *page;
+	struct swap_iocb *splug = NULL;
 
 	rcu_read_lock();
 	xas_for_each(&xas, page, end_index) {
@@ -277,13 +280,14 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma,
 
 		swap = radix_to_swp_entry(page);
 		page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
-							NULL, 0, false);
+					     NULL, 0, false, &splug);
 		if (page)
 			put_page(page);
 
 		rcu_read_lock();
 	}
 	rcu_read_unlock();
+	swap_read_unplug(splug);
 
 	lru_add_drain();	/* Push any new pages onto the LRU now */
 }
diff --git a/mm/memory.c b/mm/memory.c
index d25372340107..8bd18c54eaa4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3559,7 +3559,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 
 				/* To provide entry to swap_readpage() */
 				set_page_private(page, entry.val);
-				swap_readpage(page, true);
+				swap_readpage(page, true, NULL);
 				set_page_private(page, 0);
 			}
 		} else {
diff --git a/mm/page_io.c b/mm/page_io.c
index 6e32ca35d9b6..bcf655d650c8 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -286,7 +286,8 @@ static void bio_associate_blkg_from_page(struct bio *bio, struct page *page)
 
 struct swap_iocb {
 	struct kiocb		iocb;
-	struct bio_vec		bvec;
+	struct bio_vec		bvec[SWAP_CLUSTER_MAX];
+	int			pages;
 };
 static mempool_t *sio_pool;
 
@@ -306,7 +307,7 @@ int sio_pool_init(void)
 static void sio_write_complete(struct kiocb *iocb, long ret)
 {
 	struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb);
-	struct page *page = sio->bvec.bv_page;
+	struct page *page = sio->bvec[0].bv_page;
 
 	if (ret != 0 && ret != PAGE_SIZE) {
 		/*
@@ -344,10 +345,10 @@ static int swap_writepage_fs(struct page *page, struct writeback_control *wbc)
 	init_sync_kiocb(&sio->iocb, swap_file);
 	sio->iocb.ki_complete = sio_write_complete;
 	sio->iocb.ki_pos = page_file_offset(page);
-	sio->bvec.bv_page = page;
-	sio->bvec.bv_len = PAGE_SIZE;
-	sio->bvec.bv_offset = 0;
-	iov_iter_bvec(&from, WRITE, &sio->bvec, 1, PAGE_SIZE);
+	sio->bvec[0].bv_page = page;
+	sio->bvec[0].bv_len = PAGE_SIZE;
+	sio->bvec[0].bv_offset = 0;
+	iov_iter_bvec(&from, WRITE, &sio->bvec[0], 1, PAGE_SIZE);
 	ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
 	if (ret != -EIOCBQUEUED)
 		sio_write_complete(&sio->iocb, ret);
@@ -390,46 +391,60 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
 static void sio_read_complete(struct kiocb *iocb, long ret)
 {
 	struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb);
-	struct page *page = sio->bvec.bv_page;
-
-	if (ret != 0 && ret != PAGE_SIZE) {
-		SetPageError(page);
-		ClearPageUptodate(page);
-		pr_alert_ratelimited("Read-error on swap-device\n");
-	} else {
-		SetPageUptodate(page);
-		count_vm_event(PSWPIN);
+	int p;
+
+	for (p = 0; p < sio->pages; p++) {
+		struct page *page = sio->bvec[p].bv_page;
+		if (ret != 0 && ret != PAGE_SIZE * sio->pages) {
+			SetPageError(page);
+			ClearPageUptodate(page);
+			pr_alert_ratelimited("Read-error on swap-device\n");
+		} else {
+			SetPageUptodate(page);
+			count_vm_event(PSWPIN);
+		}
+		unlock_page(page);
 	}
-	unlock_page(page);
 	mempool_free(sio, sio_pool);
 }
 
-static int swap_readpage_fs(struct page *page)
+static void swap_readpage_fs(struct page *page,
+			     struct swap_iocb **plug)
 {
 	struct swap_info_struct *sis = page_swap_info(page);
-	struct file *swap_file = sis->swap_file;
-	struct address_space *mapping = swap_file->f_mapping;
-	struct iov_iter from;
-	struct swap_iocb *sio;
+	struct swap_iocb *sio = NULL;
 	loff_t pos = page_file_offset(page);
-	int ret;
-
-	sio = mempool_alloc(sio_pool, GFP_KERNEL);
-	init_sync_kiocb(&sio->iocb, swap_file);
-	sio->iocb.ki_pos = pos;
-	sio->iocb.ki_complete = sio_read_complete;
-	sio->bvec.bv_page = page;
-	sio->bvec.bv_len = PAGE_SIZE;
-	sio->bvec.bv_offset = 0;
 
-	iov_iter_bvec(&from, READ, &sio->bvec, 1, PAGE_SIZE);
-	ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
-	if (ret != -EIOCBQUEUED)
-		sio_read_complete(&sio->iocb, ret);
-	return ret;
+	if (plug)
+		sio = *plug;
+	if (sio) {
+		if (sio->iocb.ki_filp != sis->swap_file ||
+		    sio->iocb.ki_pos + sio->pages * PAGE_SIZE != pos) {
+			swap_read_unplug(sio);
+			sio = NULL;
+		}
+	}
+	if (!sio) {
+		sio = mempool_alloc(sio_pool, GFP_KERNEL);
+		init_sync_kiocb(&sio->iocb, sis->swap_file);
+		sio->iocb.ki_pos = pos;
+		sio->iocb.ki_complete = sio_read_complete;
+		sio->pages = 0;
+	}
+	sio->bvec[sio->pages].bv_page = page;
+	sio->bvec[sio->pages].bv_len = PAGE_SIZE;
+	sio->bvec[sio->pages].bv_offset = 0;
+	sio->pages += 1;
+	if (sio->pages == ARRAY_SIZE(sio->bvec) || !plug) {
+		swap_read_unplug(sio);
+		sio = NULL;
+	}
+	if (plug)
+		*plug = sio;
 }
 
-int swap_readpage(struct page *page, bool synchronous)
+int swap_readpage(struct page *page, bool synchronous,
+		  struct swap_iocb **plug)
 {
 	struct bio *bio;
 	int ret = 0;
@@ -455,7 +470,7 @@ int swap_readpage(struct page *page, bool synchronous)
 	}
 
 	if (data_race(sis->flags & SWP_FS_OPS)) {
-		ret = swap_readpage_fs(page);
+		swap_readpage_fs(page, plug);
 		goto out;
 	}
 
@@ -507,3 +522,16 @@ int swap_readpage(struct page *page, bool synchronous)
 	delayacct_swapin_end();
 	return ret;
 }
+
+void __swap_read_unplug(struct swap_iocb *sio)
+{
+	struct iov_iter from;
+	struct address_space *mapping = sio->iocb.ki_filp->f_mapping;
+	int ret;
+
+	iov_iter_bvec(&from, READ, sio->bvec, sio->pages,
+		      PAGE_SIZE * sio->pages);
+	ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
+	if (ret != -EIOCBQUEUED)
+		sio_read_complete(&sio->iocb, ret);
+}
diff --git a/mm/swap.h b/mm/swap.h
index e8ee995cf8d8..0c79b2478f3f 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -4,7 +4,15 @@
 
 /* linux/mm/page_io.c */
 int sio_pool_init(void);
-int swap_readpage(struct page *page, bool do_poll);
+struct swap_iocb;
+int swap_readpage(struct page *page, bool do_poll,
+		  struct swap_iocb **plug);
+void __swap_read_unplug(struct swap_iocb *plug);
+static inline void swap_read_unplug(struct swap_iocb *plug)
+{
+	if (unlikely(plug))
+		__swap_read_unplug(plug);
+}
 int swap_writepage(struct page *page, struct writeback_control *wbc);
 void end_swap_bio_write(struct bio *bio);
 int __swap_writepage(struct page *page, struct writeback_control *wbc,
@@ -38,7 +46,8 @@ struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index);
 struct page *read_swap_cache_async(swp_entry_t, gfp_t,
 				   struct vm_area_struct *vma,
 				   unsigned long addr,
-				   bool do_poll);
+				   bool do_poll,
+				   struct swap_iocb **plug);
 struct page *__read_swap_cache_async(swp_entry_t, gfp_t,
 				     struct vm_area_struct *vma,
 				     unsigned long addr,
@@ -53,7 +62,8 @@ static inline unsigned int page_swap_flags(struct page *page)
 	return page_swap_info(page)->flags;
 }
 #else /* CONFIG_SWAP */
-static inline int swap_readpage(struct page *page, bool do_poll)
+static inline int swap_readpage(struct page *page, bool do_poll,
+				struct swap_iocb **plug)
 {
 	return 0;
 }
diff --git a/mm/swap_state.c b/mm/swap_state.c
index d541594be1c3..5cb2c75fa247 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -520,14 +520,16 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
  * the swap entry is no longer in use.
  */
 struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
-		struct vm_area_struct *vma, unsigned long addr, bool do_poll)
+				   struct vm_area_struct *vma,
+				   unsigned long addr, bool do_poll,
+				   struct swap_iocb **plug)
 {
 	bool page_was_allocated;
 	struct page *retpage = __read_swap_cache_async(entry, gfp_mask,
 			vma, addr, &page_was_allocated);
 
 	if (page_was_allocated)
-		swap_readpage(retpage, do_poll);
+		swap_readpage(retpage, do_poll, plug);
 
 	return retpage;
 }
@@ -621,6 +623,7 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
 	unsigned long mask;
 	struct swap_info_struct *si = swp_swap_info(entry);
 	struct blk_plug plug;
+	struct swap_iocb *splug = NULL;
 	bool do_poll = true, page_allocated;
 	struct vm_area_struct *vma = vmf->vma;
 	unsigned long addr = vmf->address;
@@ -647,7 +650,7 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
 		if (!page)
 			continue;
 		if (page_allocated) {
-			swap_readpage(page, false);
+			swap_readpage(page, false, &splug);
 			if (offset != entry_offset) {
 				SetPageReadahead(page);
 				count_vm_event(SWAP_RA);
@@ -658,8 +661,10 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
 
 	lru_add_drain();	/* Push any new pages onto the LRU now */
 skip:
-	page = read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll);
+	page = read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll,
+				     &splug);
 	blk_finish_plug(&plug);
+	swap_read_unplug(splug);
 	return page;
 }
 
@@ -791,6 +796,7 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
 				       struct vm_fault *vmf)
 {
 	struct blk_plug plug;
+	struct swap_iocb *splug = NULL;
 	struct vm_area_struct *vma = vmf->vma;
 	struct page *page;
 	pte_t *pte, pentry;
@@ -821,7 +827,7 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
 		if (!page)
 			continue;
 		if (page_allocated) {
-			swap_readpage(page, false);
+			swap_readpage(page, false, &splug);
 			if (i != ra_info.offset) {
 				SetPageReadahead(page);
 				count_vm_event(SWAP_RA);
@@ -832,8 +838,9 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
 	lru_add_drain();
 skip:
 	page = read_swap_cache_async(fentry, gfp_mask, vma, vmf->address,
-				     ra_info.win == 1);
+				     ra_info.win == 1, &splug);
 	blk_finish_plug(&plug);
+	swap_read_unplug(splug);
 	return page;
 }
 


