From: Kairui Song <ryncsn@gmail.com>
To: linux-mm@kvack.org
Cc: "Huang, Ying" <ying.huang@intel.com>,
	Chris Li <chrisl@kernel.org>, Minchan Kim <minchan@kernel.org>,
	Barry Song <v-songbaohua@oppo.com>,
	Ryan Roberts <ryan.roberts@arm.com>, Yu Zhao <yuzhao@google.com>,
	SeongJae Park <sj@kernel.org>,
	David Hildenbrand <david@redhat.com>,
	Yosry Ahmed <yosryahmed@google.com>,
	Johannes Weiner <hannes@cmpxchg.org>,
	Matthew Wilcox <willy@infradead.org>,
	Nhat Pham <nphamcs@gmail.com>,
	Chengming Zhou <zhouchengming@bytedance.com>,
	Andrew Morton <akpm@linux-foundation.org>,
	linux-kernel@vger.kernel.org, Kairui Song <kasong@tencent.com>
Subject: [RFC PATCH 06/10] mm/swap: switch to use multi index entries
Date: Wed, 27 Mar 2024 02:50:28 +0800	[thread overview]
Message-ID: <20240326185032.72159-7-ryncsn@gmail.com> (raw)
In-Reply-To: <20240326185032.72159-1-ryncsn@gmail.com>

From: Kairui Song <kasong@tencent.com>

Now that all explicit shadow clearing is gone and every swapin /
swapout path goes through the swap cache, switch the swap cache to
multi index entries, so swapping out THP is faster and uses less
memory.

Test results of sequential swapin/swapout of 30G of zero pages on ZRAM:

               Before (us)        After (us)
Swapout:       33648529           33713283
Swapin:        40667696           40954646
Swapout (THP): 7658664            6921176  (+9.7%)
Swapin (THP) : 40602278           40891953

And after swapping out 30G with THP, radix tree node usage dropped
dramatically:

Before: radix_tree_node 73728K
After:  radix_tree_node  7056K (-94%)
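
Back-of-the-envelope (assuming the default 64-slot xarray nodes): a 2M
THP stored as 512 order-0 entries fills 8 leaf nodes on its own, while
a single order-9 multi index entry is held in the parent level as one
entry plus its sibling slots, so nearly all leaf nodes for swapped-out
THPs go away, which roughly matches the drop above.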

Signed-off-by: Kairui Song <kasong@tencent.com>
---
 mm/filemap.c     | 27 +++++++++++++++++
 mm/huge_memory.c | 77 +++++++++++++++++++-----------------------------
 mm/internal.h    |  2 ++
 mm/swap_state.c  | 54 ++++++++++-----------------------
 4 files changed, 75 insertions(+), 85 deletions(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index 0ccdc9e92764..5e8e3fd26b8d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -919,6 +919,33 @@ static int __filemap_lock_store(struct xa_state *xas, struct folio *folio,
 	return xas_error(xas);
 }
 
+int __filemap_add_swapcache(struct address_space *mapping, struct folio *folio,
+			    pgoff_t index, gfp_t gfp, void **shadowp)
+{
+	XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio));
+	long nr;
+	int ret;
+
+	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+	VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio);
+	mapping_set_update(&xas, mapping);
+
+	nr = folio_nr_pages(folio);
+	folio_ref_add(folio, nr);
+
+	ret = __filemap_lock_store(&xas, folio, index, gfp, shadowp);
+	if (likely(!ret)) {
+		mapping->nrpages += nr;
+		__node_stat_mod_folio(folio, NR_FILE_PAGES, nr);
+		__lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr);
+		xas_unlock_irq(&xas);
+	} else {
+		folio_put_refs(folio, nr);
+	}
+
+	return ret;
+}
+
 noinline int __filemap_add_folio(struct address_space *mapping,
 		struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp)
 {
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9859aa4f7553..4fd2f74b94a9 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2886,14 +2886,12 @@ static void __split_huge_page_tail(struct folio *folio, int tail,
 	lru_add_page_tail(head, page_tail, lruvec, list);
 }
 
-static void __split_huge_page(struct page *page, struct list_head *list,
-		pgoff_t end, unsigned int new_order)
+static void __split_huge_page(struct address_space *mapping, struct page *page,
+			      struct list_head *list, pgoff_t end, unsigned int new_order)
 {
 	struct folio *folio = page_folio(page);
 	struct page *head = &folio->page;
 	struct lruvec *lruvec;
-	struct address_space *swap_cache = NULL;
-	unsigned long offset = 0;
 	int i, nr_dropped = 0;
 	unsigned int new_nr = 1 << new_order;
 	int order = folio_order(folio);
@@ -2902,12 +2900,6 @@ static void __split_huge_page(struct page *page, struct list_head *list,
 	/* complete memcg works before add pages to LRU */
 	split_page_memcg(head, order, new_order);
 
-	if (folio_test_anon(folio) && folio_test_swapcache(folio)) {
-		offset = swp_offset(folio->swap);
-		swap_cache = swap_address_space(folio->swap);
-		xa_lock(&swap_cache->i_pages);
-	}
-
 	/* lock lru list/PageCompound, ref frozen by page_ref_freeze */
 	lruvec = folio_lruvec_lock(folio);
 
@@ -2919,18 +2911,18 @@ static void __split_huge_page(struct page *page, struct list_head *list,
 		if (head[i].index >= end) {
 			struct folio *tail = page_folio(head + i);
 
-			if (shmem_mapping(folio->mapping))
+			if (shmem_mapping(mapping))
 				nr_dropped++;
 			else if (folio_test_clear_dirty(tail))
 				folio_account_cleaned(tail,
-					inode_to_wb(folio->mapping->host));
+					inode_to_wb(mapping->host));
 			__filemap_remove_folio(tail, NULL);
 			folio_put(tail);
 		} else if (!PageAnon(page)) {
-			__xa_store(&folio->mapping->i_pages, head[i].index,
+			__xa_store(&mapping->i_pages, head[i].index,
 					head + i, 0);
-		} else if (swap_cache) {
-			__xa_store(&swap_cache->i_pages, offset + i,
+		} else if (folio_test_swapcache(folio)) {
+			__xa_store(&mapping->i_pages, swp_offset(folio->swap) + i,
 					head + i, 0);
 		}
 	}
@@ -2948,23 +2940,17 @@ static void __split_huge_page(struct page *page, struct list_head *list,
 	split_page_owner(head, order, new_order);
 
 	/* See comment in __split_huge_page_tail() */
-	if (folio_test_anon(folio)) {
+	if (mapping) {
 		/* Additional pin to swap cache */
-		if (folio_test_swapcache(folio)) {
-			folio_ref_add(folio, 1 + new_nr);
-			xa_unlock(&swap_cache->i_pages);
-		} else {
-			folio_ref_inc(folio);
-		}
-	} else {
-		/* Additional pin to page cache */
 		folio_ref_add(folio, 1 + new_nr);
-		xa_unlock(&folio->mapping->i_pages);
+		xa_unlock(&mapping->i_pages);
+	} else {
+		folio_ref_inc(folio);
 	}
 	local_irq_enable();
 
 	if (nr_dropped)
-		shmem_uncharge(folio->mapping->host, nr_dropped);
+		shmem_uncharge(mapping->host, nr_dropped);
 	remap_page(folio, nr);
 
 	if (folio_test_swapcache(folio))
@@ -3043,11 +3029,12 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
 	struct deferred_split *ds_queue = get_deferred_split_queue(folio);
 	/* reset xarray order to new order after split */
 	XA_STATE_ORDER(xas, &folio->mapping->i_pages, folio->index, new_order);
+	struct address_space *mapping = folio_mapping(folio);
 	struct anon_vma *anon_vma = NULL;
-	struct address_space *mapping = NULL;
 	int extra_pins, ret;
 	pgoff_t end;
 	bool is_hzp;
+	gfp_t gfp;
 
 	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
 	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
@@ -3079,7 +3066,6 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
 		}
 	}
 
-
 	is_hzp = is_huge_zero_page(&folio->page);
 	if (is_hzp) {
 		pr_warn_ratelimited("Called split_huge_page for huge zero page\n");
@@ -3089,6 +3075,17 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
 	if (folio_test_writeback(folio))
 		return -EBUSY;
 
+	if (mapping) {
+		gfp = current_gfp_context(mapping_gfp_mask(mapping) &
+					  GFP_RECLAIM_MASK);
+
+		xas_split_alloc(&xas, folio, folio_order(folio), gfp);
+		if (xas_error(&xas)) {
+			ret = xas_error(&xas);
+			goto out;
+		}
+	}
+
 	if (folio_test_anon(folio)) {
 		/*
 		 * The caller does not necessarily hold an mmap_lock that would
@@ -3104,33 +3101,19 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
 			goto out;
 		}
 		end = -1;
-		mapping = NULL;
 		anon_vma_lock_write(anon_vma);
 	} else {
-		gfp_t gfp;
-
-		mapping = folio->mapping;
-
 		/* Truncated ? */
 		if (!mapping) {
 			ret = -EBUSY;
 			goto out;
 		}
 
-		gfp = current_gfp_context(mapping_gfp_mask(mapping) &
-							GFP_RECLAIM_MASK);
-
 		if (!filemap_release_folio(folio, gfp)) {
 			ret = -EBUSY;
 			goto out;
 		}
 
-		xas_split_alloc(&xas, folio, folio_order(folio), gfp);
-		if (xas_error(&xas)) {
-			ret = xas_error(&xas);
-			goto out;
-		}
-
 		anon_vma = NULL;
 		i_mmap_lock_read(mapping);
 
@@ -3189,7 +3172,9 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
 			int nr = folio_nr_pages(folio);
 
 			xas_split(&xas, folio, folio_order(folio));
-			if (folio_test_pmd_mappable(folio) &&
+
+			if (!folio_test_anon(folio) &&
+			    folio_test_pmd_mappable(folio) &&
 			    new_order < HPAGE_PMD_ORDER) {
 				if (folio_test_swapbacked(folio)) {
 					__lruvec_stat_mod_folio(folio,
@@ -3202,7 +3187,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
 			}
 		}
 
-		__split_huge_page(page, list, end, new_order);
+		__split_huge_page(mapping, page, list, end, new_order);
 		ret = 0;
 	} else {
 		spin_unlock(&ds_queue->split_queue_lock);
@@ -3218,9 +3203,9 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
 	if (anon_vma) {
 		anon_vma_unlock_write(anon_vma);
 		put_anon_vma(anon_vma);
-	}
-	if (mapping)
+	} else {
 		i_mmap_unlock_read(mapping);
+	}
 out:
 	xas_destroy(&xas);
 	count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
diff --git a/mm/internal.h b/mm/internal.h
index 7e486f2c502c..b2bbfd3c2b50 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1059,6 +1059,8 @@ struct migration_target_control {
  */
 size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
 			      struct folio *folio, loff_t fpos, size_t size);
+int __filemap_add_swapcache(struct address_space *mapping, struct folio *folio,
+			    pgoff_t index, gfp_t gfp, void **shadowp);
 
 /*
  * mm/vmalloc.c
diff --git a/mm/swap_state.c b/mm/swap_state.c
index b84e7b0ea4a5..caf69696f47c 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -90,48 +90,22 @@ int add_to_swap_cache(struct folio *folio, swp_entry_t entry,
 {
 	struct address_space *address_space = swap_address_space(entry);
 	pgoff_t idx = swp_offset(entry);
-	XA_STATE_ORDER(xas, &address_space->i_pages, idx, folio_order(folio));
-	unsigned long i, nr = folio_nr_pages(folio);
-	void *old;
-
-	xas_set_update(&xas, workingset_update_node);
+	int ret;
 
 	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
 	VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio);
 	VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio);
 
-	folio_ref_add(folio, nr);
 	folio_set_swapcache(folio);
 	folio->swap = entry;
 
-	do {
-		xas_lock_irq(&xas);
-		xas_create_range(&xas);
-		if (xas_error(&xas))
-			goto unlock;
-		for (i = 0; i < nr; i++) {
-			VM_BUG_ON_FOLIO(xas.xa_index != idx + i, folio);
-			if (shadowp) {
-				old = xas_load(&xas);
-				if (xa_is_value(old))
-					*shadowp = old;
-			}
-			xas_store(&xas, folio);
-			xas_next(&xas);
-		}
-		address_space->nrpages += nr;
-		__node_stat_mod_folio(folio, NR_FILE_PAGES, nr);
-		__lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr);
-unlock:
-		xas_unlock_irq(&xas);
-	} while (xas_nomem(&xas, gfp));
-
-	if (!xas_error(&xas))
-		return 0;
+	ret = __filemap_add_swapcache(address_space, folio, idx, gfp, shadowp);
+	if (ret) {
+		folio_clear_swapcache(folio);
+		folio->swap.val = 0;
+	}
 
-	folio_clear_swapcache(folio);
-	folio_ref_sub(folio, nr);
-	return xas_error(&xas);
+	return ret;
 }
 
 /*
@@ -142,7 +116,6 @@ void __delete_from_swap_cache(struct folio *folio,
 			swp_entry_t entry, void *shadow)
 {
 	struct address_space *address_space = swap_address_space(entry);
-	int i;
 	long nr = folio_nr_pages(folio);
 	pgoff_t idx = swp_offset(entry);
 	XA_STATE(xas, &address_space->i_pages, idx);
@@ -153,11 +126,9 @@ void __delete_from_swap_cache(struct folio *folio,
 	VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio);
 	VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio);
 
-	for (i = 0; i < nr; i++) {
-		void *entry = xas_store(&xas, shadow);
-		VM_BUG_ON_PAGE(entry != folio, entry);
-		xas_next(&xas);
-	}
+	xas_set_order(&xas, idx, folio_order(folio));
+	xas_store(&xas, shadow);
+
 	folio->swap.val = 0;
 	folio_clear_swapcache(folio);
 	address_space->nrpages -= nr;
@@ -252,6 +223,11 @@ void clear_shadow_from_swap_cache(swp_entry_t entry)
 
 	xas_set_update(&xas, workingset_update_node);
 
+	/*
+	 * On unmap, this may delete a shadow entry of a larger order. That
+	 * is mostly fine, since folios that are not entirely mapped get
+	 * split on swap out and leave behind order 0 shadows.
+	 */
 	xa_lock_irq(&address_space->i_pages);
 	if (xa_is_value(xas_load(&xas)))
 		xas_store(&xas, NULL);
-- 
2.43.0


Thread overview: 28+ messages
2024-03-26 18:50 [RFC PATCH 00/10] mm/swap: always use swap cache for synchronization Kairui Song
2024-03-26 18:50 ` [RFC PATCH 01/10] mm/filemap: split filemap storing logic into a standalone helper Kairui Song
2024-03-26 18:50 ` [RFC PATCH 02/10] mm/swap: move no readahead swapin code to a stand-alone helper Kairui Song
2024-03-26 18:50 ` [RFC PATCH 03/10] mm/swap: convert swapin_readahead to return a folio Kairui Song
2024-03-26 20:03   ` Matthew Wilcox
2024-03-26 18:50 ` [RFC PATCH 04/10] mm/swap: remove cache bypass swapin Kairui Song
2024-03-27  6:30   ` Huang, Ying
2024-03-27  6:55     ` Kairui Song
2024-03-27  7:29       ` Huang, Ying
2024-03-26 18:50 ` [RFC PATCH 05/10] mm/swap: clean shadow only in unmap path Kairui Song
2024-03-26 18:50 ` Kairui Song [this message]
2024-03-26 18:50 ` [RFC PATCH 07/10] mm/swap: rename __read_swap_cache_async to swap_cache_alloc_or_get Kairui Song
2024-03-26 18:50 ` [RFC PATCH 08/10] mm/swap: use swap cache as a synchronization layer Kairui Song
2024-03-26 18:50 ` [RFC PATCH 09/10] mm/swap: delay the swap cache lookup for swapin Kairui Song
2024-03-26 18:50 ` [RFC PATCH 10/10] mm/swap: optimize synchronous swapin Kairui Song
2024-03-27  6:22   ` Huang, Ying
2024-03-27  6:37     ` Kairui Song
2024-03-27  6:47       ` Huang, Ying
2024-03-27  7:14         ` Kairui Song
2024-03-27  8:16           ` Huang, Ying
2024-03-27  8:08   ` Barry Song
2024-03-27  8:44     ` Kairui Song
2024-03-27  2:52 ` [RFC PATCH 00/10] mm/swap: always use swap cache for synchronization Huang, Ying
2024-03-27  3:01   ` Kairui Song
2024-03-27  8:27     ` Ryan Roberts
2024-03-27  8:32       ` Huang, Ying
2024-03-27  9:39         ` Ryan Roberts
2024-03-27 11:04       ` Kairui Song
