linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Tim Chen <tim.c.chen@linux.intel.com>
To: Michal Hocko <mhocko@suse.cz>
Cc: Tim Chen <tim.c.chen@linux.intel.com>,
	Johannes Weiner <hannes@cmpxchg.org>,
	Andrew Morton <akpm@linux-foundation.org>,
	Dave Hansen <dave.hansen@intel.com>,
	Ying Huang <ying.huang@intel.com>,
	Dan Williams <dan.j.williams@intel.com>,
	David Rientjes <rientjes@google.com>,
	Shakeel Butt <shakeelb@google.com>,
	linux-mm@kvack.org, cgroups@vger.kernel.org,
	linux-kernel@vger.kernel.org
Subject: [RFC PATCH v1 09/11] mm: Use kswapd to demote pages when toptier memory is tight
Date: Mon,  5 Apr 2021 10:08:33 -0700	[thread overview]
Message-ID: <83c06bf70e38360358c84daab399f18f57e7eba4.1617642417.git.tim.c.chen@linux.intel.com> (raw)
In-Reply-To: <cover.1617642417.git.tim.c.chen@linux.intel.com>

Demote pages from memory cgroups that have excess
toptier memory usage when top tier memory is tight.

When free top tier memory falls below the fraction
"toptier_scale_factor/10000" of overall toptier memory in a node, kswapd
reclaims top tier memory from those mem cgroups that exceeded their
toptier memory soft limit by demoting the top tier pages to
a lower memory tier.

Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
---
 Documentation/admin-guide/sysctl/vm.rst | 12 +++++
 include/linux/mmzone.h                  |  2 +
 mm/page_alloc.c                         | 14 +++++
 mm/vmscan.c                             | 69 ++++++++++++++++++++++++-
 4 files changed, 96 insertions(+), 1 deletion(-)

diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
index 9de3847c3469..6b49e2e90953 100644
--- a/Documentation/admin-guide/sysctl/vm.rst
+++ b/Documentation/admin-guide/sysctl/vm.rst
@@ -74,6 +74,7 @@ Currently, these files are in /proc/sys/vm:
 - vfs_cache_pressure
 - watermark_boost_factor
 - watermark_scale_factor
+- toptier_scale_factor
 - zone_reclaim_mode
 
 
@@ -962,6 +963,17 @@ too small for the allocation bursts occurring in the system. This knob
 can then be used to tune kswapd aggressiveness accordingly.
 
 
+toptier_scale_factor
+====================
+
+This factor controls when kswapd wakes up to demote pages of those
+cgroups that have exceeded their memory soft limit.
+
+The unit is in fractions of 10,000. The default value of 2000 means that
+if less than 20% of the top tier memory in the node/system is free,
+we will start to demote pages of those memory cgroups that have
+exceeded their soft memory limit.
+
 zone_reclaim_mode
 =================
 
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index bbe649c4fdee..4ee0073d255f 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -332,12 +332,14 @@ enum zone_watermarks {
 	WMARK_MIN,
 	WMARK_LOW,
 	WMARK_HIGH,
+	WMARK_TOPTIER,
 	NR_WMARK
 };
 
 #define min_wmark_pages(z) (z->_watermark[WMARK_MIN] + z->watermark_boost)
 #define low_wmark_pages(z) (z->_watermark[WMARK_LOW] + z->watermark_boost)
 #define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost)
+#define toptier_wmark_pages(z) (z->_watermark[WMARK_TOPTIER] + z->watermark_boost)
 #define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost)
 
 struct per_cpu_pages {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 471a2c342c4f..20f3caee60f3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -7964,6 +7964,20 @@ static void __setup_per_zone_wmarks(void)
 		zone->_watermark[WMARK_LOW]  = min_wmark_pages(zone) + tmp;
 		zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
 
+		tmp = mult_frac(zone_managed_pages(zone),
+				toptier_scale_factor, 10000);
+		/*
+		 * Clamp toptier watermark between twice high watermark
+		 * and max managed pages.
+		 */
+		if (tmp < 2 * zone->_watermark[WMARK_HIGH])
+			tmp = 2 * zone->_watermark[WMARK_HIGH];
+		if (tmp > zone_managed_pages(zone))
+			tmp = zone_managed_pages(zone);
+		zone->_watermark[WMARK_TOPTIER] = tmp;
+
+		zone->watermark_boost = 0;
+
 		spin_unlock_irqrestore(&zone->lock, flags);
 	}
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 11bb0c6fa524..270880c8baef 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -185,6 +185,7 @@ static void set_task_reclaim_state(struct task_struct *task,
 
 static LIST_HEAD(shrinker_list);
 static DECLARE_RWSEM(shrinker_rwsem);
+int toptier_scale_factor = 2000;
 
 #ifdef CONFIG_MEMCG
 /*
@@ -3624,6 +3625,34 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
 	return false;
 }
 
+/*
+ * Report whether @pgdat needs no top tier demotion. Returns true when the
+ * node is not in N_TOPTIER, next_demotion_node() reports -1, the request is
+ * high-order or below ZONE_NORMAL, ZONE_NORMAL is unmanaged, or its free
+ * pages are at or above the toptier watermark; false otherwise.
+ */
+static bool pgdat_toptier_balanced(pg_data_t *pgdat, int order, int classzone_idx)
+{
+	unsigned long mark;
+	struct zone *zone;
+
+	if (!node_state(pgdat->node_id, N_TOPTIER) ||
+	    next_demotion_node(pgdat->node_id) == -1 ||
+	    order > 0 || classzone_idx < ZONE_NORMAL)
+		return true;
+
+	/* Only ZONE_NORMAL is compared against the toptier watermark. */
+	zone = pgdat->node_zones + ZONE_NORMAL;
+	if (!managed_zone(zone))
+		return true;
+
+	/* The watermark is capped at the zone's managed page count. */
+	mark = min(toptier_wmark_pages(zone),
+		   zone_managed_pages(zone));
+
+	return zone_page_state(zone, NR_FREE_PAGES) >= mark;
+}
+
 /* Clear pgdat state for congested, dirty or under writeback. */
 static void clear_pgdat_congested(pg_data_t *pgdat)
 {
@@ -4049,6 +4078,39 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
 	finish_wait(&pgdat->kswapd_wait, &wait);
 }
 
+/*
+ * Run toptier memcg soft limit reclaim on @pgdat when
+ * pgdat_toptier_balanced() reports the node is short of free top tier
+ * memory. Returns true when prepare_kswapd_sleep() passes and the thread
+ * is not being stopped; false otherwise, including for non-toptier nodes.
+ */
+static bool toptier_soft_reclaim(pg_data_t *pgdat,
+			      unsigned int reclaim_order,
+			      unsigned int classzone_idx)
+{
+	unsigned long nr_soft_scanned = 0;
+	struct scan_control sc = {
+		.gfp_mask = GFP_KERNEL,
+		.order = reclaim_order,
+		.may_unmap = 1,
+	};
+
+	if (!node_state(pgdat->node_id, N_TOPTIER) || kthread_should_stop())
+		return false;
+
+	set_task_reclaim_state(current, &sc.reclaim_state);
+
+	/* The reclaimed page count is not used by this function. */
+	if (!pgdat_toptier_balanced(pgdat, 0, classzone_idx))
+		mem_cgroup_soft_limit_reclaim(pgdat, 0, GFP_KERNEL,
+					      &nr_soft_scanned, N_TOPTIER);
+
+	set_task_reclaim_state(current, NULL);
+
+	return prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx) &&
+	       !kthread_should_stop();
+}
+
 /*
  * The background pageout daemon, started as a kernel thread
  * from the init process.
@@ -4108,6 +4170,10 @@ static int kswapd(void *p)
 		WRITE_ONCE(pgdat->kswapd_order, 0);
 		WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
 
+		if (toptier_soft_reclaim(pgdat, 0,
+					highest_zoneidx))
+			goto kswapd_try_sleep;
+
 		ret = try_to_freeze();
 		if (kthread_should_stop())
 			break;
@@ -4173,7 +4239,8 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
 
 	/* Hopeless node, leave it to direct reclaim if possible */
 	if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
-	    (pgdat_balanced(pgdat, order, highest_zoneidx) &&
+	    (pgdat_toptier_balanced(pgdat, 0, highest_zoneidx) &&
+	     pgdat_balanced(pgdat, order, highest_zoneidx) &&
 	     !pgdat_watermark_boosted(pgdat, highest_zoneidx))) {
 		/*
 		 * There may be plenty of free memory available, but it's too
-- 
2.20.1


  parent reply	other threads:[~2021-04-05 18:09 UTC|newest]

Thread overview: 34+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-04-05 17:08 [RFC PATCH v1 00/11] Manage the top tier memory in a tiered memory Tim Chen
2021-04-05 17:08 ` [RFC PATCH v1 01/11] mm: Define top tier memory node mask Tim Chen
2021-04-05 17:08 ` [RFC PATCH v1 02/11] mm: Add soft memory limit for mem cgroup Tim Chen
2021-04-05 17:08 ` [RFC PATCH v1 03/11] mm: Account the top tier memory usage per cgroup Tim Chen
2021-04-05 17:08 ` [RFC PATCH v1 04/11] mm: Report top tier memory usage in sysfs Tim Chen
2021-04-05 17:08 ` [RFC PATCH v1 05/11] mm: Add soft_limit_top_tier tree for mem cgroup Tim Chen
2021-04-05 17:08 ` [RFC PATCH v1 06/11] mm: Handle top tier memory in cgroup soft limit memory tree utilities Tim Chen
2021-04-05 17:08 ` [RFC PATCH v1 07/11] mm: Account the total top tier memory in use Tim Chen
2021-04-05 17:08 ` [RFC PATCH v1 08/11] mm: Add toptier option for mem_cgroup_soft_limit_reclaim() Tim Chen
2021-04-05 17:08 ` Tim Chen [this message]
2021-04-05 17:08 ` [RFC PATCH v1 10/11] mm: Set toptier_scale_factor via sysctl Tim Chen
2021-04-05 17:08 ` [RFC PATCH v1 11/11] mm: Wakeup kswapd if toptier memory need soft reclaim Tim Chen
2021-04-06  9:08 ` [RFC PATCH v1 00/11] Manage the top tier memory in a tiered memory Michal Hocko
2021-04-07 22:33   ` Tim Chen
2021-04-08 11:52     ` Michal Hocko
2021-04-09 23:26       ` Tim Chen
2021-04-12 19:20         ` Shakeel Butt
2021-04-14  8:59           ` Jonathan Cameron
2021-04-15  0:42           ` Tim Chen
2021-04-13  2:15         ` Huang, Ying
2021-04-13  8:33         ` Michal Hocko
2021-04-12 14:03       ` Shakeel Butt
2021-04-08 17:18 ` Shakeel Butt
2021-04-08 18:00   ` Yang Shi
2021-04-08 20:29     ` Shakeel Butt
2021-04-08 20:50       ` Yang Shi
2021-04-12 14:03         ` Shakeel Butt
2021-04-09  7:24       ` Michal Hocko
2021-04-15 22:31         ` Tim Chen
2021-04-16  6:38           ` Michal Hocko
2021-04-14 23:22       ` Tim Chen
2021-04-09  2:58     ` Huang, Ying
2021-04-09 20:50       ` Yang Shi
2021-04-15 22:25   ` Tim Chen

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=83c06bf70e38360358c84daab399f18f57e7eba4.1617642417.git.tim.c.chen@linux.intel.com \
    --to=tim.c.chen@linux.intel.com \
    --cc=akpm@linux-foundation.org \
    --cc=cgroups@vger.kernel.org \
    --cc=dan.j.williams@intel.com \
    --cc=dave.hansen@intel.com \
    --cc=hannes@cmpxchg.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mhocko@suse.cz \
    --cc=rientjes@google.com \
    --cc=shakeelb@google.com \
    --cc=ying.huang@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).