linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Hillf Danton <hdanton@sina.com>
To: Michal Hocko <mhocko@suse.cz>, Johannes Weiner <hannes@cmpxchg.org>
Cc: Andrew Morton <akpm@linux-foundation.org>,
	linux-kernel@vger.kernel.org, linux-mm@kvack.org,
	Shakeel Butt <shakeelb@google.com>, Roman Gushchin <guro@fb.com>,
	Matthew Wilcox <willy@infradead.org>,
	Hillf Danton <hdanton@sina.com>
Subject: [RFC] mm: memcg: add priority for soft limit reclaiming
Date: Thu, 19 Sep 2019 21:13:32 +0800	[thread overview]
Message-ID: <20190919131332.4180-1-hdanton@sina.com> (raw)


Currently the memory controller is playing an increasingly important role
in how memory is used and how pages are reclaimed on memory pressure.

In daily work a memcg is often created for critical tasks, and their
pre-configured memory usage is supposed to be met even on memory pressure.
Administrators want it to be configurable that the pages consumed by
memcg-B can be reclaimed by page allocations invoked not by memcg-A but
by memcg-C.

That configurability is addressed by adding a priority for soft limit
reclaiming, to make sure that no pages will be reclaimed from a memcg of
higher priority in favor of a memcg of lower priority.

Pages are reclaimed with no priority being taken into account by default
unless user turns it on, and then they are responsible for their smart
activities almost the same way as they play realtime FIFO/RR games.

Priority is available only in the direct reclaiming context in order to
avoid churning in the complex kswapd behavior.

Cc: Shakeel Butt <shakeelb@google.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Hillf Danton <hdanton@sina.com>
---

--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -230,6 +230,21 @@ struct mem_cgroup {
 	int		under_oom;
 
 	int	swappiness;
+	/*
+	 * slrp, soft limit reclaiming priority
+	 *
+	 * 0, by default, no slrp considered on soft reclaiming.
+	 *
+	 * 1-32, user configurable in ascending order,
+	 * 	no page will be reclaimed from memcg of higher slrp in
+	 * 	favor of memcg of lower slrp.
+	 *
+	 * only in direct reclaiming context now.
+	 */
+	int	slrp;
+#define MEMCG_SLRP_MIN 1
+#define MEMCG_SLRP_MAX 32
+
 	/* OOM-Killer disable */
 	int		oom_kill_disable;
 
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -647,7 +647,8 @@ static void mem_cgroup_remove_from_trees
 }
 
 static struct mem_cgroup_per_node *
-__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
+__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz,
+					int slrp)
 {
 	struct mem_cgroup_per_node *mz;
 
@@ -664,7 +665,7 @@ retry:
 	 * position in the tree.
 	 */
 	__mem_cgroup_remove_exceeded(mz, mctz);
-	if (!soft_limit_excess(mz->memcg) ||
+	if (!soft_limit_excess(mz->memcg) || mz->memcg->slrp > slrp ||
 	    !css_tryget_online(&mz->memcg->css))
 		goto retry;
 done:
@@ -672,12 +673,13 @@ done:
 }
 
 static struct mem_cgroup_per_node *
-mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
+mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz,
+					int slrp)
 {
 	struct mem_cgroup_per_node *mz;
 
 	spin_lock_irq(&mctz->lock);
-	mz = __mem_cgroup_largest_soft_limit_node(mctz);
+	mz = __mem_cgroup_largest_soft_limit_node(mctz, slrp);
 	spin_unlock_irq(&mctz->lock);
 	return mz;
 }
@@ -2972,6 +2974,31 @@ static int mem_cgroup_resize_max(struct
 	return ret;
 }
 
+static int mem_cgroup_get_slrp(void)
+{
+	int slrp;
+
+	if (current->flags & PF_KTHREAD) {
+		/*
+		 * now slrp does not churn in background reclaiming to
+		 * make life simple
+		 */
+		slrp = 0;
+	} else {
+		struct mem_cgroup *memcg;
+
+		rcu_read_lock();
+		memcg = mem_cgroup_from_task(current);
+		if (!memcg || memcg == root_mem_cgroup)
+			slrp = 0;
+		else
+			slrp = memcg->slrp;
+		rcu_read_unlock();
+	}
+
+	return slrp;
+}
+
 unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
 					    gfp_t gfp_mask,
 					    unsigned long *total_scanned)
@@ -2980,6 +3007,7 @@ unsigned long mem_cgroup_soft_limit_recl
 	struct mem_cgroup_per_node *mz, *next_mz = NULL;
 	unsigned long reclaimed;
 	int loop = 0;
+	int slrp;
 	struct mem_cgroup_tree_per_node *mctz;
 	unsigned long excess;
 	unsigned long nr_scanned;
@@ -2997,6 +3025,7 @@ unsigned long mem_cgroup_soft_limit_recl
 	if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root))
 		return 0;
 
+	slrp = mem_cgroup_get_slrp();
 	/*
 	 * This loop can run a while, specially if mem_cgroup's continuously
 	 * keep exceeding their soft limit and putting the system under
@@ -3006,7 +3035,7 @@ unsigned long mem_cgroup_soft_limit_recl
 		if (next_mz)
 			mz = next_mz;
 		else
-			mz = mem_cgroup_largest_soft_limit_node(mctz);
+			mz = mem_cgroup_largest_soft_limit_node(mctz, slrp);
 		if (!mz)
 			break;
 
@@ -3024,8 +3053,8 @@ unsigned long mem_cgroup_soft_limit_recl
 		 */
 		next_mz = NULL;
 		if (!reclaimed)
-			next_mz = __mem_cgroup_largest_soft_limit_node(mctz);
-
+			next_mz = __mem_cgroup_largest_soft_limit_node(mctz,
+							slrp);
 		excess = soft_limit_excess(mz->memcg);
 		/*
 		 * One school of thought says that we should not add
@@ -5817,6 +5846,37 @@ static ssize_t memory_oom_group_write(st
 	return nbytes;
 }
 
+static int memory_slrp_show(struct seq_file *m, void *v)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+
+	seq_printf(m, "%d\n", memcg->slrp);
+
+	return 0;
+}
+
+static ssize_t memory_slrp_write(struct kernfs_open_file *of,
+				char *buf, size_t nbytes, loff_t off)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+	int ret, slrp;
+
+	buf = strstrip(buf);
+	if (!buf)
+		return -EINVAL;
+
+	ret = kstrtoint(buf, 0, &slrp);
+	if (ret)
+		return ret;
+
+	if (slrp < MEMCG_SLRP_MIN || MEMCG_SLRP_MAX < slrp)
+		return -EINVAL;
+
+	memcg->slrp = slrp;
+
+	return nbytes;
+}
+
 static struct cftype memory_files[] = {
 	{
 		.name = "current",
@@ -5870,6 +5930,12 @@ static struct cftype memory_files[] = {
 		.seq_show = memory_oom_group_show,
 		.write = memory_oom_group_write,
 	},
+	{
+		.name = "slrp",
+		.flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
+		.seq_show = memory_slrp_show,
+		.write = memory_slrp_write,
+	},
 	{ }	/* terminate */
 };
 



             reply	other threads:[~2019-09-19 13:13 UTC|newest]

Thread overview: 9+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-09-19 13:13 Hillf Danton [this message]
2019-09-19 13:32 ` [RFC] mm: memcg: add priority for soft limit reclaiming Michal Hocko
2019-09-23 13:04   ` Hillf Danton
2019-09-23 13:28     ` Michal Hocko
2019-09-24  7:36 Hillf Danton
2019-09-24 13:30 ` Michal Hocko
2019-09-24 17:23   ` Roman Gushchin
2019-09-25  2:35   ` Hillf Danton
2019-09-25  6:52     ` Michal Hocko

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20190919131332.4180-1-hdanton@sina.com \
    --to=hdanton@sina.com \
    --cc=akpm@linux-foundation.org \
    --cc=guro@fb.com \
    --cc=hannes@cmpxchg.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mhocko@suse.cz \
    --cc=shakeelb@google.com \
    --cc=willy@infradead.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).