From: Alex Shi <alex.shi@linux.alibaba.com>
To: vbabka@suse.cz, alex.shi@linux.alibaba.com
Cc: Konstantin Khlebnikov <koct9i@gmail.com>,
	Hugh Dickins <hughd@google.com>, Yu Zhao <yuzhao@google.com>,
	Michal Hocko <mhocko@suse.com>,
	"Matthew Wilcox (Oracle)" <willy@infradead.org>,
	Andrew Morton <akpm@linux-foundation.org>,
	linux-mm@kvack.org, linux-kernel@vger.kernel.org
Subject: [PATCH 1/3] mm/swap.c: pre-sort pages in pagevec for pagevec_lru_move_fn
Date: Tue,  1 Dec 2020 16:02:13 +0800
Message-ID: <1606809735-43300-1-git-send-email-alex.shi@linux.alibaba.com>
In-Reply-To: <20201126155553.GT4327@casper.infradead.org>

Pages in a pagevec may belong to different lruvecs, so pagevec_lru_move_fn()
has to relock as it walks the pagevec, and each relock may make the current
cpu wait a long time on the same lock because of spinlock fairness.

Before the per-memcg lru_lock, we had to bear this relocking, since the
spinlock was the only way to serialize a page's memcg/lruvec. Now
TestClearPageLRU can be used to isolate pages exclusively and stabilize a
page's lruvec/memcg. That gives us a chance to sort the pages by lruvec
before moving them in pagevec_lru_move_fn(), so each lruvec's lock is taken
once per run of pages and we no longer suffer the spinlock fairness wait.
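
For illustration, here is a minimal userspace sketch of the sort-then-batch-lock
idea. The names in it (toy_page, toy_lruvec, move_pages and the printf
"locking") are invented for the example only; the real code below works on
struct page / struct lruvec and takes lruvec->lru_lock with
spin_lock_irqsave().

#include <stdint.h>
#include <stdio.h>

struct toy_lruvec { int dummy; };	/* stand-in for a lruvec and its lru_lock */
struct toy_page { struct toy_lruvec *lruvec; int id; };

/* Pratt-style gap table, as in the patch below. */
static const int gaps[] = { 6, 4, 3, 2, 1, 0 };

/* Shell sort pages[] by lruvec address so pages of one lruvec become adjacent. */
static void sort_by_lruvec(struct toy_page **pages, int n)
{
	for (int g = 0; gaps[g] > 0; g++) {
		int gap = gaps[g];

		if (gap >= n)
			continue;	/* gap too large for this array */

		for (int i = gap; i < n; i++) {
			struct toy_page *tmp = pages[i];
			int j;

			for (j = i - gap;
			     j >= 0 &&
			     (uintptr_t)pages[j]->lruvec > (uintptr_t)tmp->lruvec;
			     j -= gap)
				pages[j + gap] = pages[j];
			pages[j + gap] = tmp;
		}
	}
}

static void move_pages(struct toy_page **pages, int n)
{
	struct toy_lruvec *locked = NULL;

	sort_by_lruvec(pages, n);

	for (int i = 0; i < n; i++) {
		/* after sorting, the "lock" changes only at lruvec boundaries */
		if (pages[i]->lruvec != locked) {
			if (locked)
				printf("unlock lruvec %p\n", (void *)locked);
			locked = pages[i]->lruvec;
			printf("lock   lruvec %p\n", (void *)locked);
		}
		printf("  move page %d\n", pages[i]->id);
	}
	if (locked)
		printf("unlock lruvec %p\n", (void *)locked);
}

int main(void)
{
	struct toy_lruvec a, b;
	struct toy_page p[4] = { { &a, 0 }, { &b, 1 }, { &a, 2 }, { &b, 3 } };
	struct toy_page *pv[] = { &p[0], &p[1], &p[2], &p[3] };

	move_pages(pv, 4);	/* two lock/unlock pairs instead of up to four */
	return 0;
}

Shell sort with a small fixed gap table fits the problem: a pagevec holds at
most PAGEVEC_SIZE (15) pages, so the sort needs no allocation or recursion and
stays cheap even when every page already belongs to the same lruvec.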

Signed-off-by: Alex Shi <alex.shi@linux.alibaba.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Yu Zhao <yuzhao@google.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: linux-mm@kvack.org
Cc: linux-kernel@vger.kernel.org
---
 mm/swap.c | 92 +++++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 79 insertions(+), 13 deletions(-)

diff --git a/mm/swap.c b/mm/swap.c
index 490553f3f9ef..17d8990e5ca7 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -201,29 +201,95 @@ int get_kernel_page(unsigned long start, int write, struct page **pages)
 }
 EXPORT_SYMBOL_GPL(get_kernel_page);
 
+/* Pratt's gaps for shell sort, https://en.wikipedia.org/wiki/Shellsort */
+static int gaps[] = { 6, 4, 3, 2, 1, 0};
+
+/* Shell sort pvec->pages[] by each page's lruvec. */
+static void shell_sort(struct pagevec *pvec, unsigned long *lvaddr)
+{
+	int g, i, j, n = pagevec_count(pvec);
+
+	for (g = 0; gaps[g] > 0 && gaps[g] <= n/2; g++) {
+		int gap = gaps[g];
+
+		for (i = gap; i < n; i++) {
+			unsigned long tmp = lvaddr[i];
+			struct page *page = pvec->pages[i];
+
+			for (j = i - gap; j >= 0 && lvaddr[j] > tmp; j -= gap) {
+				lvaddr[j + gap] = lvaddr[j];
+				pvec->pages[j + gap] = pvec->pages[j];
+			}
+			lvaddr[j + gap] = tmp;
+			pvec->pages[j + gap] = page;
+		}
+	}
+}
+
+/* Isolate pages whose lru bit we cleared and their lruvec; release the rest */
+void sort_isopv(struct pagevec *pvec, struct pagevec *isopv,
+		unsigned long *lvaddr)
+{
+	int i, j;
+	struct pagevec busypv;
+
+	pagevec_init(&busypv);
+
+	for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
+		struct page *page = pvec->pages[i];
+
+		pvec->pages[i] = NULL;
+
+		/* block memcg migration during page moving between lru */
+		if (!TestClearPageLRU(page)) {
+			pagevec_add(&busypv, page);
+			continue;
+		}
+		lvaddr[j++] = (unsigned long)
+				mem_cgroup_page_lruvec(page, page_pgdat(page));
+		pagevec_add(isopv, page);
+	}
+	pagevec_reinit(pvec);
+	if (pagevec_count(&busypv))
+		release_pages(busypv.pages, busypv.nr);
+
+	shell_sort(isopv, lvaddr);
+}
+
 static void pagevec_lru_move_fn(struct pagevec *pvec,
 	void (*move_fn)(struct page *page, struct lruvec *lruvec))
 {
-	int i;
+	int i, n;
 	struct lruvec *lruvec = NULL;
 	unsigned long flags = 0;
+	unsigned long lvaddr[PAGEVEC_SIZE];
+	struct pagevec isopv;
 
-	for (i = 0; i < pagevec_count(pvec); i++) {
-		struct page *page = pvec->pages[i];
+	pagevec_init(&isopv);
 
-		/* block memcg migration during page moving between lru */
-		if (!TestClearPageLRU(page))
-			continue;
+	sort_isopv(pvec, &isopv, lvaddr);
 
-		lruvec = relock_page_lruvec_irqsave(page, lruvec, &flags);
-		(*move_fn)(page, lruvec);
+	n = pagevec_count(&isopv);
+	if (!n)
+		return;
 
-		SetPageLRU(page);
+	lruvec = (struct lruvec *)lvaddr[0];
+	spin_lock_irqsave(&lruvec->lru_lock, flags);
+
+	for (i = 0; i < n; i++) {
+		/* pages are sorted by lruvec, so relock only at lruvec boundaries */
+		if (lruvec != (struct lruvec *)lvaddr[i]) {
+			spin_unlock_irqrestore(&lruvec->lru_lock, flags);
+			lruvec = (struct lruvec *)lvaddr[i];
+			spin_lock_irqsave(&lruvec->lru_lock, flags);
+		}
+
+		(*move_fn)(isopv.pages[i], lruvec);
+
+		SetPageLRU(isopv.pages[i]);
 	}
-	if (lruvec)
-		unlock_page_lruvec_irqrestore(lruvec, flags);
-	release_pages(pvec->pages, pvec->nr);
-	pagevec_reinit(pvec);
+	spin_unlock_irqrestore(&lruvec->lru_lock, flags);
+	release_pages(isopv.pages, isopv.nr);
 }
 
 static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec)
-- 
2.29.GIT



Thread overview: 24+ messages
2020-11-20  8:27 [PATCH next] mm/swap.c: reduce lock contention in lru_cache_add Alex Shi
2020-11-20  8:27 ` Alex Shi
2020-11-20 23:19 ` Andrew Morton
2020-11-23  4:46   ` Alex Shi
2020-11-25 15:38 ` Vlastimil Babka
2020-11-26  3:12   ` Alex Shi
2020-11-26 11:05     ` Vlastimil Babka
2020-11-26  4:52 ` Yu Zhao
2020-11-26  6:39   ` Alex Shi
2020-11-26  7:24     ` Yu Zhao
2020-11-26  8:09       ` Alex Shi
2020-11-26 11:22       ` Vlastimil Babka
2020-11-26 15:44         ` Vlastimil Babka
2020-11-26 15:55           ` Matthew Wilcox
2020-11-27  3:14             ` Alex Shi
2020-12-01  8:02             ` Alex Shi [this message]
2020-12-01  8:02               ` [PATCH 2/3] mm/swap.c: bail out early for no memcg and no numa Alex Shi
2020-12-01  8:02               ` [PATCH 3/3] mm/swap.c: extend the usage to pagevec_lru_add Alex Shi
2020-12-01  8:10               ` [PATCH 1/3] mm/swap.c: pre-sort pages in pagevec for pagevec_lru_move_fn Michal Hocko
2020-12-01  8:20                 ` Alex Shi
2020-12-25  9:59             ` [RFC PATCH 0/4] pre sort pages on lruvec in pagevec Alex Shi
2020-12-25  9:59               ` [RFC PATCH 1/4] mm/swap.c: pre-sort pages in pagevec for pagevec_lru_move_fn Alex Shi
2020-12-25  9:59               ` [RFC PATCH 2/4] mm/swap.c: bail out early for no memcg and no numa Alex Shi
2020-12-25  9:59               ` [RFC PATCH 3/4] mm/swap.c: extend the usage to pagevec_lru_add Alex Shi
