From: Johannes Weiner <hannes@cmpxchg.org>
To: linux-mm@kvack.org
Cc: Rik van Riel <riel@surriel.com>,
	Minchan Kim <minchan.kim@gmail.com>,
	Michal Hocko <mhocko@suse.com>,
	Andrew Morton <akpm@linux-foundation.org>,
	Joonsoo Kim <iamjoonsoo.kim@lge.com>,
	linux-kernel@vger.kernel.org, kernel-team@fb.com
Subject: [PATCH 12/14] mm: vmscan: determine anon/file pressure balance at the reclaim root
Date: Wed, 20 May 2020 19:25:23 -0400
Message-ID: <20200520232525.798933-13-hannes@cmpxchg.org>
In-Reply-To: <20200520232525.798933-1-hannes@cmpxchg.org>

We split the LRU lists into anon and file, and we rebalance the scan
pressure between them when one of them begins thrashing: if the file
cache experiences workingset refaults, we increase the pressure on
anonymous pages; if the workload is stalled on swapins, we increase
the pressure on the file cache instead.
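
As a rough illustration of how the cost-based balance works (the
formula is the one used in get_scan_count() below; the cost numbers
here are made up purely for illustration): with swappiness=60 we get
anon_prio=60 and file_prio=140. If the file cache has been thrashing,
say anon_cost=30 and file_cost=970 (totalcost=1000), then

  ap = 60  * (1000 + 1) / (30 + 1)  ~= 1937
  fp = 140 * (1000 + 1) / (970 + 1) ~=  144

and roughly 93% of the scan pressure is aimed at the anon LRUs.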

With cgroups and their nested LRU lists, we currently don't do this
correctly. While recursive cgroup reclaim establishes a relative LRU
order among the pages of all involved cgroups, LRU pressure balancing
is done on an individual cgroup LRU level. As a result, when one
cgroup is thrashing on the filesystem cache while a sibling has
cold anonymous pages, pressure doesn't get equalized between them.

This patch moves the LRU balancing decision to the root of reclaim -
the same level where the LRU order is established.

It does this by tracking LRU cost recursively, so that every level of
the cgroup tree knows the aggregate LRU cost of all memory within its
domain. When the page scanner calculates the scan balance for any
given individual cgroup's LRU list, it uses the values from the
ancestor cgroup that initiated the reclaim cycle.
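
In simplified pseudo-code (the actual changes are in the
lru_note_cost() and shrink_node() hunks below; record_cost() is just
a stand-in for the anon_cost++/file_cost++ bookkeeping):

	/* charge every cost event to the page's cgroup and all its ancestors */
	lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
	do {
		record_cost(lruvec, page_is_file_lru(page));
	} while ((lruvec = parent_lruvec(lruvec)));

	/*
	 * Reclaim then balances anon vs. file using the costs of the
	 * cgroup that initiated the reclaim cycle.
	 */
	sc->anon_cost = target_lruvec->anon_cost;
	sc->file_cost = target_lruvec->file_cost;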

If one sibling is then thrashing on the cache, it will tip the
pressure balance inside its ancestors, and the next hierarchical
reclaim iteration will go more after the anon pages in the tree.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
---
 include/linux/memcontrol.h | 13 ++++++++++++
 mm/swap.c                  | 32 ++++++++++++++++++++++++-----
 mm/vmscan.c                | 41 ++++++++++++++++----------------------
 3 files changed, 57 insertions(+), 29 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 32a0b4d47540..d982c80da157 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1303,6 +1303,19 @@ static inline void dec_lruvec_page_state(struct page *page,
 	mod_lruvec_page_state(page, idx, -1);
 }
 
+static inline struct lruvec *parent_lruvec(struct lruvec *lruvec)
+{
+	struct mem_cgroup *memcg;
+
+	memcg = lruvec_memcg(lruvec);
+	if (!memcg)
+		return NULL;
+	memcg = parent_mem_cgroup(memcg);
+	if (!memcg)
+		return NULL;
+	return mem_cgroup_lruvec(memcg, lruvec_pgdat(lruvec));
+}
+
 #ifdef CONFIG_CGROUP_WRITEBACK
 
 struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb);
diff --git a/mm/swap.c b/mm/swap.c
index 2ff91656dea2..3d8aa46c47ff 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -266,11 +266,33 @@ void lru_note_cost(struct page *page)
 {
 	struct lruvec *lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
 
-	/* Record new data point */
-	if (page_is_file_lru(page))
-		lruvec->file_cost++;
-	else
-		lruvec->anon_cost++;
+	do {
+		unsigned long lrusize;
+
+		/* Record cost event */
+		if (page_is_file_lru(page))
+			lruvec->file_cost++;
+		else
+			lruvec->anon_cost++;
+
+		/*
+		 * Decay previous events
+		 *
+		 * Because workloads change over time (and to avoid
+		 * overflow) we keep these statistics as a floating
+		 * average, which ends up weighing recent refaults
+		 * more than old ones.
+		 */
+		lrusize = lruvec_page_state(lruvec, NR_INACTIVE_ANON) +
+			  lruvec_page_state(lruvec, NR_ACTIVE_ANON) +
+			  lruvec_page_state(lruvec, NR_INACTIVE_FILE) +
+			  lruvec_page_state(lruvec, NR_ACTIVE_FILE);
+
+		if (lruvec->file_cost + lruvec->anon_cost > lrusize / 4) {
+			lruvec->file_cost /= 2;
+			lruvec->anon_cost /= 2;
+		}
+	} while ((lruvec = parent_lruvec(lruvec)));
 }
 
 static void __activate_page(struct page *page, struct lruvec *lruvec,
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e7e6868bcbf7..1487ff5d4698 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -79,6 +79,12 @@ struct scan_control {
 	 */
 	struct mem_cgroup *target_mem_cgroup;
 
+	/*
+	 * Scan pressure balancing between anon and file LRUs
+	 */
+	unsigned long	anon_cost;
+	unsigned long	file_cost;
+
 	/* Can active pages be deactivated as part of reclaim? */
 #define DEACTIVATE_ANON 1
 #define DEACTIVATE_FILE 2
@@ -2231,10 +2237,8 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 	int swappiness = mem_cgroup_swappiness(memcg);
 	u64 fraction[2];
 	u64 denominator = 0;	/* gcc */
-	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
 	unsigned long anon_prio, file_prio;
 	enum scan_balance scan_balance;
-	unsigned long anon, file;
 	unsigned long totalcost;
 	unsigned long ap, fp;
 	enum lru_list lru;
@@ -2285,7 +2289,6 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 	}
 
 	scan_balance = SCAN_FRACT;
-
 	/*
 	 * Calculate the pressure balance between anon and file pages.
 	 *
@@ -2300,30 +2303,12 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 	anon_prio = swappiness;
 	file_prio = 200 - anon_prio;
 
-	/*
-	 * Because workloads change over time (and to avoid overflow)
-	 * we keep these statistics as a floating average, which ends
-	 * up weighing recent refaults more than old ones.
-	 */
-
-	anon  = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) +
-		lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES);
-	file  = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES) +
-		lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES);
-
-	spin_lock_irq(&pgdat->lru_lock);
-	totalcost = lruvec->anon_cost + lruvec->file_cost;
-	if (unlikely(totalcost > (anon + file) / 4)) {
-		lruvec->anon_cost /= 2;
-		lruvec->file_cost /= 2;
-		totalcost /= 2;
-	}
+	totalcost = sc->anon_cost + sc->file_cost;
 	ap = anon_prio * (totalcost + 1);
-	ap /= lruvec->anon_cost + 1;
+	ap /= sc->anon_cost + 1;
 
 	fp = file_prio * (totalcost + 1);
-	fp /= lruvec->file_cost + 1;
-	spin_unlock_irq(&pgdat->lru_lock);
+	fp /= sc->file_cost + 1;
 
 	fraction[0] = ap;
 	fraction[1] = fp;
@@ -2679,6 +2664,14 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 	nr_reclaimed = sc->nr_reclaimed;
 	nr_scanned = sc->nr_scanned;
 
+	/*
+	 * Determine the scan balance between anon and file LRUs.
+	 */
+	spin_lock_irq(&pgdat->lru_lock);
+	sc->anon_cost = target_lruvec->anon_cost;
+	sc->file_cost = target_lruvec->file_cost;
+	spin_unlock_irq(&pgdat->lru_lock);
+
 	/*
 	 * Target desirable inactive:active list ratios for the anon
 	 * and file LRU lists.
-- 
2.26.2

