All of lore.kernel.org
 help / color / mirror / Atom feed
From: Yosry Ahmed <yosryahmed@google.com>
To: "Tejun Heo" <tj@kernel.org>, "Josef Bacik" <josef@toxicpanda.com>,
	"Jens Axboe" <axboe@kernel.dk>,
	"Zefan Li" <lizefan.x@bytedance.com>,
	"Johannes Weiner" <hannes@cmpxchg.org>,
	"Michal Hocko" <mhocko@kernel.org>,
	"Roman Gushchin" <roman.gushchin@linux.dev>,
	"Shakeel Butt" <shakeelb@google.com>,
	"Muchun Song" <muchun.song@linux.dev>,
	"Andrew Morton" <akpm@linux-foundation.org>,
	"Michal Koutný" <mkoutny@suse.com>
Cc: Vasily Averin <vasily.averin@linux.dev>,
	cgroups@vger.kernel.org, linux-block@vger.kernel.org,
	linux-kernel@vger.kernel.org, linux-mm@kvack.org,
	bpf@vger.kernel.org, Yosry Ahmed <yosryahmed@google.com>
Subject: [PATCH v1 6/9] memcg: sleep during flushing stats in safe contexts
Date: Tue, 28 Mar 2023 06:16:35 +0000	[thread overview]
Message-ID: <20230328061638.203420-7-yosryahmed@google.com> (raw)
In-Reply-To: <20230328061638.203420-1-yosryahmed@google.com>

Currently, every context that flushes memcg stats does so atomically,
i.e. without being allowed to sleep. Some of these contexts are perfectly
safe to sleep in, such as reads of cgroup files from userspace or the
background periodic flusher.

Refactor the code to make mem_cgroup_flush_stats() non-atomic (aka
sleepable), and provide a separate atomic version. The atomic version is
used in reclaim, refault, writeback, and in mem_cgroup_usage(). All
other code paths are left to use the non-atomic version. This includes
callbacks for userspace reads and the periodic flusher.

Since refault is the only caller of mem_cgroup_flush_stats_ratelimited(),
this function is changed to call the atomic version of
mem_cgroup_flush_stats(). Reclaim and refault code paths are modified
to do non-atomic flushing in separate later patches -- so
mem_cgroup_flush_stats_ratelimited() will eventually become non-atomic.

Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
---
 include/linux/memcontrol.h |  5 ++++
 mm/memcontrol.c            | 58 ++++++++++++++++++++++++++++++++------
 mm/vmscan.c                |  2 +-
 3 files changed, 55 insertions(+), 10 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index ac3f3b3a45e2..a4bc3910a2eb 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1037,6 +1037,7 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
 }
 
 void mem_cgroup_flush_stats(void);
+void mem_cgroup_flush_stats_atomic(void);
 void mem_cgroup_flush_stats_ratelimited(void);
 
 void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
@@ -1535,6 +1536,10 @@ static inline void mem_cgroup_flush_stats(void)
 {
 }
 
+static inline void mem_cgroup_flush_stats_atomic(void)
+{
+}
+
 static inline void mem_cgroup_flush_stats_ratelimited(void)
 {
 }
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 64ff33e02c96..57e8cbf701f3 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -634,7 +634,7 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
 	}
 }
 
-static void __mem_cgroup_flush_stats(void)
+static bool mem_cgroup_pre_stats_flush(void)
 {
 	/*
 	 * We always flush the entire tree, so concurrent flushers can just
@@ -642,24 +642,57 @@ static void __mem_cgroup_flush_stats(void)
 	 * from memcg flushers (e.g. reclaim, refault, etc).
 	 */
 	if (atomic_xchg(&stats_flush_ongoing, 1))
-		return;
+		return false;
 
 	WRITE_ONCE(flush_next_time, jiffies_64 + 2*FLUSH_TIME);
-	cgroup_rstat_flush_atomic(root_mem_cgroup->css.cgroup);
+	return true;
+}
+
+static void mem_cgroup_post_stats_flush(void)
+{
 	atomic_set(&stats_flush_threshold, 0);
 	atomic_set(&stats_flush_ongoing, 0);
 }
 
-void mem_cgroup_flush_stats(void)
+static bool mem_cgroup_should_flush_stats(void)
 {
-	if (atomic_read(&stats_flush_threshold) > num_online_cpus())
-		__mem_cgroup_flush_stats();
+	return atomic_read(&stats_flush_threshold) > num_online_cpus();
+}
+
+/* atomic functions, safe to call from any context */
+static void __mem_cgroup_flush_stats_atomic(void)
+{
+	if (mem_cgroup_pre_stats_flush()) {
+		cgroup_rstat_flush_atomic(root_mem_cgroup->css.cgroup);
+		mem_cgroup_post_stats_flush();
+	}
+}
+
+void mem_cgroup_flush_stats_atomic(void)
+{
+	if (mem_cgroup_should_flush_stats())
+		__mem_cgroup_flush_stats_atomic();
 }
 
 void mem_cgroup_flush_stats_ratelimited(void)
 {
 	if (time_after64(jiffies_64, READ_ONCE(flush_next_time)))
-		mem_cgroup_flush_stats();
+		mem_cgroup_flush_stats_atomic();
+}
+
+/* non-atomic functions, only safe from sleepable contexts */
+static void __mem_cgroup_flush_stats(void)
+{
+	if (mem_cgroup_pre_stats_flush()) {
+		cgroup_rstat_flush(root_mem_cgroup->css.cgroup);
+		mem_cgroup_post_stats_flush();
+	}
+}
+
+void mem_cgroup_flush_stats(void)
+{
+	if (mem_cgroup_should_flush_stats())
+		__mem_cgroup_flush_stats();
 }
 
 static void flush_memcg_stats_dwork(struct work_struct *w)
@@ -3684,9 +3717,12 @@ static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
 		 * done from irq context; use stale stats in this case.
 		 * Arguably, usage threshold events are not reliable on the root
 		 * memcg anyway since its usage is ill-defined.
+		 *
+		 * Additionally, other call paths through memcg_check_events()
+		 * disable irqs, so make sure we are flushing stats atomically.
 		 */
 		if (in_task())
-			mem_cgroup_flush_stats();
+			mem_cgroup_flush_stats_atomic();
 		val = memcg_page_state(memcg, NR_FILE_PAGES) +
 			memcg_page_state(memcg, NR_ANON_MAPPED);
 		if (swap)
@@ -4609,7 +4645,11 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
 	struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
 	struct mem_cgroup *parent;
 
-	mem_cgroup_flush_stats();
+	/*
+	 * wb_writeback() takes a spinlock and calls
+	 * wb_over_bg_thresh()->mem_cgroup_wb_stats(). Do not sleep.
+	 */
+	mem_cgroup_flush_stats_atomic();
 
 	*pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
 	*pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9c1c5e8b24b8..a9511ccb936f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2845,7 +2845,7 @@ static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc)
 	 * Flush the memory cgroup stats, so that we read accurate per-memcg
 	 * lruvec stats for heuristics.
 	 */
-	mem_cgroup_flush_stats();
+	mem_cgroup_flush_stats_atomic();
 
 	/*
 	 * Determine the scan balance between anon and file LRUs.
-- 
2.40.0.348.gf938b09366-goog


  parent reply	other threads:[~2023-03-28  6:17 UTC|newest]

Thread overview: 68+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-03-28  6:16 [PATCH v1 0/9] memcg: make rstat flushing irq and sleep friendly Yosry Ahmed
2023-03-28  6:16 ` Yosry Ahmed
2023-03-28  6:16 ` [PATCH v1 1/9] cgroup: rename cgroup_rstat_flush_"irqsafe" to "atomic" Yosry Ahmed
2023-03-28  6:16   ` Yosry Ahmed
2023-03-28 13:24   ` Shakeel Butt
2023-03-28 13:24     ` Shakeel Butt
2023-03-28 17:42   ` Johannes Weiner
2023-03-28 17:42     ` Johannes Weiner
2023-03-28  6:16 ` [PATCH v1 2/9] memcg: rename mem_cgroup_flush_stats_"delayed" to "ratelimited" Yosry Ahmed
2023-03-28 13:25   ` Shakeel Butt
2023-03-28 13:25     ` Shakeel Butt
2023-03-28 17:42   ` Johannes Weiner
2023-03-28 17:42     ` Johannes Weiner
2023-03-28  6:16 ` [PATCH v1 3/9] memcg: do not flush stats in irq context Yosry Ahmed
2023-03-28 13:26   ` Shakeel Butt
2023-03-28 13:26     ` Shakeel Butt
2023-03-28 17:43   ` Johannes Weiner
2023-03-28  6:16 ` [PATCH v1 4/9] cgroup: rstat: add WARN_ON_ONCE() if flushing outside task context Yosry Ahmed
2023-03-28 14:59   ` Shakeel Butt
2023-03-28 14:59     ` Shakeel Butt
2023-03-28 17:49   ` Johannes Weiner
2023-03-28 18:59     ` Yosry Ahmed
2023-03-28 18:59       ` Yosry Ahmed
2023-03-28 22:18       ` Yosry Ahmed
2023-03-28 22:18         ` Yosry Ahmed
2023-03-28  6:16 ` [PATCH v1 5/9] memcg: replace stats_flush_lock with an atomic Yosry Ahmed
2023-03-28 14:15   ` Shakeel Butt
2023-03-28 14:15     ` Shakeel Butt
2023-03-28 18:52     ` Yosry Ahmed
2023-03-28 18:52       ` Yosry Ahmed
2023-03-28 19:28       ` Shakeel Butt
2023-03-28 19:28         ` Shakeel Butt
2023-03-28 19:34         ` Yosry Ahmed
2023-03-28 19:34           ` Yosry Ahmed
2023-03-28 19:42           ` Yosry Ahmed
2023-03-28 17:53   ` Johannes Weiner
2023-03-28 17:53     ` Johannes Weiner
2023-03-28  6:16 ` Yosry Ahmed [this message]
2023-03-28 15:09   ` [PATCH v1 6/9] memcg: sleep during flushing stats in safe contexts Shakeel Butt
2023-03-28 15:09     ` Shakeel Butt
2023-03-28 18:35   ` Johannes Weiner
2023-03-28 18:45     ` Yosry Ahmed
2023-03-28 18:45       ` Yosry Ahmed
2023-03-28 19:06       ` Johannes Weiner
2023-03-28 19:06         ` Johannes Weiner
2023-03-28 19:26         ` Yosry Ahmed
2023-03-28 19:26           ` Yosry Ahmed
2023-03-28  6:16 ` [PATCH v1 7/9] workingset: memcg: sleep when flushing stats in workingset_refault() Yosry Ahmed
2023-03-28 15:18   ` Shakeel Butt
2023-03-28 15:18     ` Shakeel Butt
2023-03-28 18:47     ` Johannes Weiner
2023-03-28 18:47       ` Johannes Weiner
2023-03-28 19:25     ` Yosry Ahmed
2023-03-28 18:43   ` Johannes Weiner
2023-03-28 18:43     ` Johannes Weiner
2023-03-28  6:16 ` [PATCH v1 8/9] vmscan: memcg: sleep when flushing stats during reclaim Yosry Ahmed
2023-03-28  6:16   ` Yosry Ahmed
2023-03-28 15:19   ` Shakeel Butt
2023-03-28 15:19     ` Shakeel Butt
2023-03-28 19:01     ` Yosry Ahmed
2023-03-28 19:01       ` Yosry Ahmed
2023-03-28 19:29       ` Shakeel Butt
2023-03-28 19:29         ` Shakeel Butt
2023-03-28 18:49   ` Johannes Weiner
2023-03-28  6:16 ` [PATCH v1 9/9] memcg: do not modify rstat tree for zero updates Yosry Ahmed
2023-03-28 15:20   ` Shakeel Butt
2023-03-28 15:20     ` Shakeel Butt
2023-03-28 18:50   ` Johannes Weiner

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230328061638.203420-7-yosryahmed@google.com \
    --to=yosryahmed@google.com \
    --cc=akpm@linux-foundation.org \
    --cc=axboe@kernel.dk \
    --cc=bpf@vger.kernel.org \
    --cc=cgroups@vger.kernel.org \
    --cc=hannes@cmpxchg.org \
    --cc=josef@toxicpanda.com \
    --cc=linux-block@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=lizefan.x@bytedance.com \
    --cc=mhocko@kernel.org \
    --cc=mkoutny@suse.com \
    --cc=muchun.song@linux.dev \
    --cc=roman.gushchin@linux.dev \
    --cc=shakeelb@google.com \
    --cc=tj@kernel.org \
    --cc=vasily.averin@linux.dev \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.