All of lore.kernel.org
 help / color / mirror / Atom feed
From: Kairui Song <ryncsn@gmail.com>
To: linux-mm@kvack.org
Cc: "Huang, Ying" <ying.huang@intel.com>,
	Chris Li <chrisl@kernel.org>, Minchan Kim <minchan@kernel.org>,
	Barry Song <v-songbaohua@oppo.com>,
	Ryan Roberts <ryan.roberts@arm.com>, Yu Zhao <yuzhao@google.com>,
	SeongJae Park <sj@kernel.org>,
	David Hildenbrand <david@redhat.com>,
	Yosry Ahmed <yosryahmed@google.com>,
	Johannes Weiner <hannes@cmpxchg.org>,
	Matthew Wilcox <willy@infradead.org>,
	Nhat Pham <nphamcs@gmail.com>,
	Chengming Zhou <zhouchengming@bytedance.com>,
	Andrew Morton <akpm@linux-foundation.org>,
	linux-kernel@vger.kernel.org, Kairui Song <kasong@tencent.com>
Subject: [RFC PATCH 04/10] mm/swap: remove cache bypass swapin
Date: Wed, 27 Mar 2024 02:50:26 +0800	[thread overview]
Message-ID: <20240326185032.72159-5-ryncsn@gmail.com> (raw)
In-Reply-To: <20240326185032.72159-1-ryncsn@gmail.com>

From: Kairui Song <kasong@tencent.com>

We used to have the cache bypass swapin path for better performance,
but by removing it, more optimizations can be applied, leading to
an even better overall performance and less hackish code.

These optimizations are not easily doable, or not doable at all,
without this removal.

This patch simply removes it, and the performance will drop heavily
for simple swapin. Things won't get this much worse for real workloads,
but the drop is still observable. Following commits will fix this and
achieve a better performance.

Swapout/in 30G zero pages from ZRAM (This mostly measures overhead
of swap path itself, because zero pages are not compressed but simply
recorded in ZRAM, and performance drops more as SWAP device is getting
full):

Test result of sequential swapin/out:

               Before (us)        After (us)
Swapout:       33619409           33624641
Swapin:        32393771           41614858 (-28.4%)
Swapout (THP): 7817909            7795530
Swapin (THP) : 32452387           41708471 (-28.4%)

Signed-off-by: Kairui Song <kasong@tencent.com>
---
 mm/memory.c     | 18 ++++-------------
 mm/swap.h       | 10 +++++-----
 mm/swap_state.c | 53 ++++++++++---------------------------------------
 mm/swapfile.c   | 13 ------------
 4 files changed, 19 insertions(+), 75 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index dfdb620a9123..357d239ee2f6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3932,7 +3932,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 	struct page *page;
 	struct swap_info_struct *si = NULL;
 	rmap_t rmap_flags = RMAP_NONE;
-	bool need_clear_cache = false;
 	bool exclusive = false;
 	swp_entry_t entry;
 	pte_t pte;
@@ -4000,14 +3999,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 	if (!folio) {
 		if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
 		    __swap_count(entry) == 1) {
-			/* skip swapcache and readahead */
 			folio = swapin_direct(entry, GFP_HIGHUSER_MOVABLE, vmf);
-			if (PTR_ERR(folio) == -EBUSY)
-				goto out;
-			need_clear_cache = true;
 		} else {
 			folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vmf);
-			swapcache = folio;
 		}
 
 		if (!folio) {
@@ -4023,6 +4017,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 			goto unlock;
 		}
 
+		swapcache = folio;
 		page = folio_file_page(folio, swp_offset(entry));
 
 		/* Had to read the page from swap area: Major fault */
@@ -4187,7 +4182,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 	vmf->orig_pte = pte;
 
 	/* ksm created a completely new copy */
-	if (unlikely(folio != swapcache && swapcache)) {
+	if (unlikely(folio != swapcache)) {
 		folio_add_new_anon_rmap(folio, vma, vmf->address);
 		folio_add_lru_vma(folio, vma);
 	} else {
@@ -4201,7 +4196,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 	arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
 
 	folio_unlock(folio);
-	if (folio != swapcache && swapcache) {
+	if (folio != swapcache) {
 		/*
 		 * Hold the lock to avoid the swap entry to be reused
 		 * until we take the PT lock for the pte_same() check
@@ -4227,9 +4222,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 	if (vmf->pte)
 		pte_unmap_unlock(vmf->pte, vmf->ptl);
 out:
-	/* Clear the swap cache pin for direct swapin after PTL unlock */
-	if (need_clear_cache)
-		swapcache_clear(si, entry);
 	if (si)
 		put_swap_device(si);
 	return ret;
@@ -4240,12 +4232,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 	folio_unlock(folio);
 out_release:
 	folio_put(folio);
-	if (folio != swapcache && swapcache) {
+	if (folio != swapcache) {
 		folio_unlock(swapcache);
 		folio_put(swapcache);
 	}
-	if (need_clear_cache)
-		swapcache_clear(si, entry);
 	if (si)
 		put_swap_device(si);
 	return ret;
diff --git a/mm/swap.h b/mm/swap.h
index aee134907a70..ac9573b03432 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -41,7 +41,6 @@ void __delete_from_swap_cache(struct folio *folio,
 void delete_from_swap_cache(struct folio *folio);
 void clear_shadow_from_swap_cache(int type, unsigned long begin,
 				  unsigned long end);
-void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry);
 struct folio *swap_cache_get_folio(swp_entry_t entry,
 		struct vm_area_struct *vma, unsigned long addr);
 struct folio *filemap_get_incore_folio(struct address_space *mapping,
@@ -100,14 +99,15 @@ static inline struct folio *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask,
 {
 	return NULL;
 }
-
-static inline int swap_writepage(struct page *p, struct writeback_control *wbc)
+static inline struct folio *swapin_direct(swp_entry_t entry, gfp_t flag,
+			struct vm_fault *vmf)
 {
-	return 0;
+	return NULL;
 }
 
-static inline void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry)
+static inline int swap_writepage(struct page *p, struct writeback_control *wbc)
 {
+	return 0;
 }
 
 static inline struct folio *swap_cache_get_folio(swp_entry_t entry,
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 2a9c6bdff5ea..49ef6250f676 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -880,61 +880,28 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
 }
 
 /**
- * swapin_direct - swap in folios skipping swap cache and readahead
+ * swapin_direct - swap in folios skipping readahead
  * @entry: swap entry of this memory
  * @gfp_mask: memory allocation flags
  * @vmf: fault information
  *
- * Returns the struct folio for entry and addr after the swap entry is read
- * in.
+ * Returns the folio for entry after it is read in.
  */
 struct folio *swapin_direct(swp_entry_t entry, gfp_t gfp_mask,
 			    struct vm_fault *vmf)
 {
-	struct vm_area_struct *vma = vmf->vma;
+	struct mempolicy *mpol;
 	struct folio *folio;
-	void *shadow = NULL;
-
-	/*
-	 * Prevent parallel swapin from proceeding with
-	 * the cache flag. Otherwise, another thread may
-	 * finish swapin first, free the entry, and swapout
-	 * reusing the same entry. It's undetectable as
-	 * pte_same() returns true due to entry reuse.
-	 */
-	if (swapcache_prepare(entry)) {
-		/* Relax a bit to prevent rapid repeated page faults */
-		schedule_timeout_uninterruptible(1);
-		return ERR_PTR(-EBUSY);
-	}
-
-	/* skip swapcache */
-	folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0,
-				vma, vmf->address, false);
-	if (folio) {
-		__folio_set_locked(folio);
-		__folio_set_swapbacked(folio);
-
-		if (mem_cgroup_swapin_charge_folio(folio,
-					vma->vm_mm, GFP_KERNEL,
-					entry)) {
-			folio_unlock(folio);
-			folio_put(folio);
-			return NULL;
-		}
-		mem_cgroup_swapin_uncharge_swap(entry);
-
-		shadow = get_shadow_from_swap_cache(entry);
-		if (shadow)
-			workingset_refault(folio, shadow);
+	bool page_allocated;
+	pgoff_t ilx;
 
-		folio_add_lru(folio);
+	mpol = get_vma_policy(vmf->vma, vmf->address, 0, &ilx);
+	folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
+					&page_allocated, false);
+	mpol_cond_put(mpol);
 
-		/* To provide entry to swap_read_folio() */
-		folio->swap = entry;
+	if (page_allocated)
 		swap_read_folio(folio, true, NULL);
-		folio->private = NULL;
-	}
 
 	return folio;
 }
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 4dd894395a0f..ae8d3aa05df7 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -3389,19 +3389,6 @@ int swapcache_prepare(swp_entry_t entry)
 	return __swap_duplicate(entry, SWAP_HAS_CACHE);
 }
 
-void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry)
-{
-	struct swap_cluster_info *ci;
-	unsigned long offset = swp_offset(entry);
-	unsigned char usage;
-
-	ci = lock_cluster_or_swap_info(si, offset);
-	usage = __swap_entry_free_locked(si, offset, SWAP_HAS_CACHE);
-	unlock_cluster_or_swap_info(si, ci);
-	if (!usage)
-		free_swap_slot(entry);
-}
-
 struct swap_info_struct *swp_swap_info(swp_entry_t entry)
 {
 	return swap_type_to_swap_info(swp_type(entry));
-- 
2.43.0


  parent reply	other threads:[~2024-03-26 19:04 UTC|newest]

Thread overview: 28+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-03-26 18:50 [RFC PATCH 00/10] mm/swap: always use swap cache for synchronization Kairui Song
2024-03-26 18:50 ` [RFC PATCH 01/10] mm/filemap: split filemap storing logic into a standalone helper Kairui Song
2024-03-26 18:50 ` [RFC PATCH 02/10] mm/swap: move no readahead swapin code to a stand-alone helper Kairui Song
2024-03-26 18:50 ` [RFC PATCH 03/10] mm/swap: convert swapin_readahead to return a folio Kairui Song
2024-03-26 20:03   ` Matthew Wilcox
2024-03-26 18:50 ` Kairui Song [this message]
2024-03-27  6:30   ` [RFC PATCH 04/10] mm/swap: remove cache bypass swapin Huang, Ying
2024-03-27  6:55     ` Kairui Song
2024-03-27  7:29       ` Huang, Ying
2024-03-26 18:50 ` [RFC PATCH 05/10] mm/swap: clean shadow only in unmap path Kairui Song
2024-03-26 18:50 ` [RFC PATCH 06/10] mm/swap: switch to use multi index entries Kairui Song
2024-03-26 18:50 ` [RFC PATCH 07/10] mm/swap: rename __read_swap_cache_async to swap_cache_alloc_or_get Kairui Song
2024-03-26 18:50 ` [RFC PATCH 08/10] mm/swap: use swap cache as a synchronization layer Kairui Song
2024-03-26 18:50 ` [RFC PATCH 09/10] mm/swap: delay the swap cache lookup for swapin Kairui Song
2024-03-26 18:50 ` [RFC PATCH 10/10] mm/swap: optimize synchronous swapin Kairui Song
2024-03-27  6:22   ` Huang, Ying
2024-03-27  6:37     ` Kairui Song
2024-03-27  6:47       ` Huang, Ying
2024-03-27  7:14         ` Kairui Song
2024-03-27  8:16           ` Huang, Ying
2024-03-27  8:08   ` Barry Song
2024-03-27  8:44     ` Kairui Song
2024-03-27  2:52 ` [RFC PATCH 00/10] mm/swap: always use swap cache for synchronization Huang, Ying
2024-03-27  3:01   ` Kairui Song
2024-03-27  8:27     ` Ryan Roberts
2024-03-27  8:32       ` Huang, Ying
2024-03-27  9:39         ` Ryan Roberts
2024-03-27 11:04       ` Kairui Song

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20240326185032.72159-5-ryncsn@gmail.com \
    --to=ryncsn@gmail.com \
    --cc=akpm@linux-foundation.org \
    --cc=chrisl@kernel.org \
    --cc=david@redhat.com \
    --cc=hannes@cmpxchg.org \
    --cc=kasong@tencent.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=minchan@kernel.org \
    --cc=nphamcs@gmail.com \
    --cc=ryan.roberts@arm.com \
    --cc=sj@kernel.org \
    --cc=v-songbaohua@oppo.com \
    --cc=willy@infradead.org \
    --cc=ying.huang@intel.com \
    --cc=yosryahmed@google.com \
    --cc=yuzhao@google.com \
    --cc=zhouchengming@bytedance.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.