linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Jianlin Lv <iecedge@gmail.com>
To: tj@kernel.org, lizefan.x@bytedance.com, hannes@cmpxchg.org,
	corbet@lwn.net, mhocko@kernel.org, roman.gushchin@linux.dev,
	shakeelb@google.com, muchun.song@linux.dev,
	akpm@linux-foundation.org, yosryahmed@google.com,
	willy@infradead.org, linmiaohe@huawei.com,
	wangkefeng.wang@huawei.com, laoar.shao@gmail.com,
	yuzhao@google.com, wuyun.abel@bytedance.com, david@redhat.com,
	ying.huang@intel.com, peterx@redhat.com, vishal.moola@gmail.com,
	hughd@google.com
Cc: cgroups@vger.kernel.org, linux-doc@vger.kernel.org,
	linux-kernel@vger.kernel.org, linux-mm@kvack.org,
	jianlv@ebay.com, iecedge@gmail.com
Subject: [PATCH] memcg: add interface to force disable swap
Date: Sat,  7 Oct 2023 21:09:05 +0800	[thread overview]
Message-ID: <20231007130905.78554-1-jianlv@ebay.com> (raw)

From: Jianlin Lv <iecedge@gmail.com>

Global reclaim will swap even if swappiness is set to 0. In some
cases, users want to be able to completely disable swap for specific
processes. One scenario is the JVM: if its memory pages are swapped out,
performance drops noticeably and GC pauses tend to increase
to levels not tolerable by most applications.
If swap-out can be disabled for specific processes only, the JVM GC
pause issue is addressed while, at the same time, memory reclaim
pressure remains manageable.

This patch adds a "memory.swap_force_disable" control file to support disabling
swap for non-root cgroups. When a process is associated with a cgroup,
'echo 1 > memory.swap_force_disable' forbids anon pages from being swapped out.
This patch also adds the read and write handlers for the control file.

Signed-off-by: Jianlin Lv <iecedge@gmail.com>
---
 .../admin-guide/cgroup-v1/memory.rst          | 15 ++++++++++
 include/linux/memcontrol.h                    |  1 +
 include/linux/swap.h                          | 15 ++++++++++
 mm/memcontrol.c                               | 28 +++++++++++++++++++
 mm/vmscan.c                                   |  3 +-
 5 files changed, 61 insertions(+), 1 deletion(-)

diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst
index ff456871bf4b..be84b98bc6fe 100644
--- a/Documentation/admin-guide/cgroup-v1/memory.rst
+++ b/Documentation/admin-guide/cgroup-v1/memory.rst
@@ -86,6 +86,7 @@ Brief summary of control files.
  memory.pressure_level		     set memory pressure notifications
  memory.swappiness		     set/show swappiness parameter of vmscan
 				     (See sysctl's vm.swappiness)
+ memory.swap_force_disable           set/show force disable swap
  memory.move_charge_at_immigrate     set/show controls of moving charges
                                      This knob is deprecated and shouldn't be
                                      used.
@@ -615,6 +616,20 @@ enforces that 0 swappiness really prevents from any swapping even if
 there is a swap storage available. This might lead to memcg OOM killer
 if there are no file pages to reclaim.
 
+swap_force_disable allows a control group to disable swap even if swap
+storage is available. This feature is disabled by default. If you want to
+disable swap for specific processes, swap_force_disable can be set up with
+the following commands::
+
+	# cd /sys/fs/cgroup/memory/
+	# mkdir test
+	# cd test
+	# echo 1 > memory.swap_force_disable
+	# echo <PID> > cgroup.procs
+
+.. note::
+	swap_force_disable only takes effect for non-root cgroups.
+
 5.4 failcnt
 -----------
 
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index e4e24da16d2c..b26dcb0756c0 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -246,6 +246,7 @@ struct mem_cgroup {
 	int		under_oom;
 
 	int	swappiness;
+	int	swap_force_disable;
 	/* OOM-Killer disable */
 	int		oom_kill_disable;
 
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 493487ed7c38..b202de576984 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -624,6 +624,21 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *mem)
 }
 #endif
 
+#ifdef CONFIG_MEMCG
+static inline int mem_cgroup_swap_force_disable(struct mem_cgroup *memcg)
+{
+	if (mem_cgroup_disabled() || mem_cgroup_is_root(memcg))
+		return 0;
+
+	return memcg->swap_force_disable;
+}
+#else
+static inline int mem_cgroup_swap_force_disable(struct mem_cgroup *memcg)
+{
+	return 0;
+}
+#endif
+
 #if defined(CONFIG_SWAP) && defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
 void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp);
 static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5b009b233ab8..024750444c79 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4196,6 +4196,28 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
 	return 0;
 }
 
+static u64 mem_cgroup_swap_force_disable_read(struct cgroup_subsys_state *css,
+					struct cftype *cft)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+	return mem_cgroup_swap_force_disable(memcg);
+}
+
+static int mem_cgroup_swap_force_disable_write(struct cgroup_subsys_state *css,
+					struct cftype *cft, u64 val)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+	/* cannot set to root cgroup and only 0 and 1 are allowed */
+	if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1)))
+		return -EINVAL;
+
+	memcg->swap_force_disable = val;
+
+	return 0;
+}
+
 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
 {
 	struct mem_cgroup_threshold_ary *t;
@@ -5064,6 +5086,11 @@ static struct cftype mem_cgroup_legacy_files[] = {
 		.read_u64 = mem_cgroup_swappiness_read,
 		.write_u64 = mem_cgroup_swappiness_write,
 	},
+	{
+		.name = "swap_force_disable",
+		.read_u64 = mem_cgroup_swap_force_disable_read,
+		.write_u64 = mem_cgroup_swap_force_disable_write,
+	},
 	{
 		.name = "move_charge_at_immigrate",
 		.read_u64 = mem_cgroup_move_charge_read,
@@ -5367,6 +5394,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 	page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
 	if (parent) {
 		WRITE_ONCE(memcg->swappiness, mem_cgroup_swappiness(parent));
+		WRITE_ONCE(memcg->swap_force_disable, mem_cgroup_swap_force_disable(parent));
 		WRITE_ONCE(memcg->oom_kill_disable, READ_ONCE(parent->oom_kill_disable));
 
 		page_counter_init(&memcg->memory, &parent->memory);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 6f13394b112e..5fdb4ac07007 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3029,6 +3029,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
 	unsigned long anon_cost, file_cost, total_cost;
 	int swappiness = mem_cgroup_swappiness(memcg);
+	int swap_force_disable = mem_cgroup_swap_force_disable(memcg);
 	u64 fraction[ANON_AND_FILE];
 	u64 denominator = 0;	/* gcc */
 	enum scan_balance scan_balance;
@@ -3036,7 +3037,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 	enum lru_list lru;
 
 	/* If we have no swap space, do not bother scanning anon folios. */
-	if (!sc->may_swap || !can_reclaim_anon_pages(memcg, pgdat->node_id, sc)) {
+	if (!sc->may_swap || !can_reclaim_anon_pages(memcg, pgdat->node_id, sc) || swap_force_disable) {
 		scan_balance = SCAN_FILE;
 		goto out;
 	}
-- 
2.34.1



             reply	other threads:[~2023-10-07 13:09 UTC|newest]

Thread overview: 7+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-10-07 13:09 Jianlin Lv [this message]
2023-10-08  1:14 ` [PATCH] memcg: add interface to force disable swap Huang, Ying
2023-10-08  7:52   ` Jianlin Lv
2023-10-08  8:24     ` Huang, Ying
2023-10-08  9:34       ` Jianlin Lv
2023-10-09  5:58         ` Huang, Ying
2023-10-09  7:34           ` Michal Hocko

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20231007130905.78554-1-jianlv@ebay.com \
    --to=iecedge@gmail.com \
    --cc=akpm@linux-foundation.org \
    --cc=cgroups@vger.kernel.org \
    --cc=corbet@lwn.net \
    --cc=david@redhat.com \
    --cc=hannes@cmpxchg.org \
    --cc=hughd@google.com \
    --cc=jianlv@ebay.com \
    --cc=laoar.shao@gmail.com \
    --cc=linmiaohe@huawei.com \
    --cc=linux-doc@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=lizefan.x@bytedance.com \
    --cc=mhocko@kernel.org \
    --cc=muchun.song@linux.dev \
    --cc=peterx@redhat.com \
    --cc=roman.gushchin@linux.dev \
    --cc=shakeelb@google.com \
    --cc=tj@kernel.org \
    --cc=vishal.moola@gmail.com \
    --cc=wangkefeng.wang@huawei.com \
    --cc=willy@infradead.org \
    --cc=wuyun.abel@bytedance.com \
    --cc=ying.huang@intel.com \
    --cc=yosryahmed@google.com \
    --cc=yuzhao@google.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).