Re: [PATCH v2 3/5] mm: memcg: make stats flushing threshold per-memcg

From: Yosry Ahmed <yosryahmed@google.com>
To: Oliver Sang <oliver.sang@intel.com>
Cc: "Feng Tang" <feng.tang@intel.com>,
	"Shakeel Butt" <shakeelb@google.com>,
	"oe-lkp@lists.linux.dev" <oe-lkp@lists.linux.dev>,
	lkp <lkp@intel.com>,
	"cgroups@vger.kernel.org" <cgroups@vger.kernel.org>,
	"linux-mm@kvack.org" <linux-mm@kvack.org>,
	"Huang, Ying" <ying.huang@intel.com>,
	"Yin, Fengwei" <fengwei.yin@intel.com>,
	"Andrew Morton" <akpm@linux-foundation.org>,
	"Johannes Weiner" <hannes@cmpxchg.org>,
	"Michal Hocko" <mhocko@kernel.org>,
	"Roman Gushchin" <roman.gushchin@linux.dev>,
	"Muchun Song" <muchun.song@linux.dev>,
	"Ivan Babrou" <ivan@cloudflare.com>, "Tejun Heo" <tj@kernel.org>,
	"Michal Koutný" <mkoutny@suse.com>,
	"Waiman Long" <longman@redhat.com>,
	"kernel-team@cloudflare.com" <kernel-team@cloudflare.com>,
	"Wei Xu" <weixugc@google.com>, "Greg Thelen" <gthelen@google.com>,
	"linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>
Subject: Re: [PATCH v2 3/5] mm: memcg: make stats flushing threshold per-memcg
Date: Tue, 24 Oct 2023 00:14:42 -0700	[thread overview]
Message-ID: <CAJD7tka7hmOD6KPmJBJa+TscbYEMmTjS+Jh2utPfTbKkfvwD9A@mail.gmail.com> (raw)
In-Reply-To: <ZTdqpcDFVHhFwWMc@xsang-OptiPlex-9020>

[-- Attachment #1: Type: text/plain, Size: 1891 bytes --]

On Mon, Oct 23, 2023 at 11:56 PM Oliver Sang <oliver.sang@intel.com> wrote:
>
> hi, Yosry Ahmed,
>
> On Mon, Oct 23, 2023 at 07:13:50PM -0700, Yosry Ahmed wrote:
>
> ...
>
> >
> > I still could not run the benchmark, but I used a version of
> > fallocate1.c that does 1 million iterations. I ran 100 in parallel.
> > This showed ~13% regression with the patch, so not the same as the
> > will-it-scale version, but it could be an indicator.
> >
> > With that, I did not see any improvement with the fixlet above or
> > ___cacheline_aligned_in_smp. So you can scratch that.
> >
> > I did, however, see some improvement with reducing the indirection
> > layers by moving stats_updates directly into struct mem_cgroup. The
> > regression in my manual testing went down to 9%. Still not great, but
> > I am wondering how this reflects on the benchmark. If you're able to
> > test it that would be great, the diff is below. Meanwhile I am still
> > looking for other improvements that can be made.
>
> we applied previous patch-set as below:
>
> c5f50d8b23c79 (linux-review/Yosry-Ahmed/mm-memcg-change-flush_next_time-to-flush_last_time/20231010-112257) mm: memcg: restore subtree stats flushing
> ac8a48ba9e1ca mm: workingset: move the stats flush into workingset_test_recent()
> 51d74c18a9c61 mm: memcg: make stats flushing threshold per-memcg
> 130617edc1cd1 mm: memcg: move vmstats structs definition above flushing code
> 26d0ee342efc6 mm: memcg: change flush_next_time to flush_last_time
> 25478183883e6 Merge branch 'mm-nonmm-unstable' into mm-everything   <---- the base our tool picked for the patch set
>
> I tried to apply below patch to either 51d74c18a9c61 or c5f50d8b23c79,
> but failed. could you guide how to apply this patch?
> Thanks
>

Thanks for looking into this. I rebased the diff on top of
c5f50d8b23c79. Please find it attached.

[-- Attachment #2: 0001-memcg-move-stats_updates-to-struct-mem_cgroup.patch --]
[-- Type: application/octet-stream, Size: 4667 bytes --]

From 0b0dffdfe192382a3aacfa313beee68b33bf7d86 Mon Sep 17 00:00:00 2001
From: Yosry Ahmed <yosryahmed@google.com>
Date: Tue, 24 Oct 2023 07:02:02 +0000
Subject: [PATCH] memcg: move stats_updates to struct mem_cgroup

---
 include/linux/memcontrol.h |  4 ++++
 mm/memcontrol.c            | 33 ++++++++++++++++++---------------
 2 files changed, 22 insertions(+), 15 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index f64ac140083ee..b4dfcd8b9cc1c 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -270,6 +270,9 @@ struct mem_cgroup {
 
 	CACHELINE_PADDING(_pad1_);
 
+	/* Stats updates since the last flush */
+	atomic64_t		stats_updates;
+
 	/* memory.stat */
 	struct memcg_vmstats	*vmstats;
 
@@ -309,6 +312,7 @@ struct mem_cgroup {
 	atomic_t		moving_account;
 	struct task_struct	*move_lock_task;
 
+	unsigned int __percpu *stats_updates_percpu;
 	struct memcg_vmstats_percpu __percpu *vmstats_percpu;
 
 #ifdef CONFIG_CGROUP_WRITEBACK
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 182b4f215fc64..e5d2f3d4d8747 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -627,9 +627,6 @@ struct memcg_vmstats_percpu {
 	/* Cgroup1: threshold notifications & softlimit tree updates */
 	unsigned long		nr_page_events;
 	unsigned long		targets[MEM_CGROUP_NTARGETS];
-
-	/* Stats updates since the last flush */
-	unsigned int		stats_updates;
 };
 
 struct memcg_vmstats {
@@ -644,9 +641,6 @@ struct memcg_vmstats {
 	/* Pending child counts during tree propagation */
 	long			state_pending[MEMCG_NR_STAT];
 	unsigned long		events_pending[NR_MEMCG_EVENTS];
-
-	/* Stats updates since the last flush */
-	atomic64_t		stats_updates;
 };
 
 /*
@@ -695,14 +689,14 @@ static void memcg_stats_unlock(void)
 
 static bool memcg_should_flush_stats(struct mem_cgroup *memcg)
 {
-	return atomic64_read(&memcg->vmstats->stats_updates) >
+	return atomic64_read(&memcg->stats_updates) >
 		MEMCG_CHARGE_BATCH * num_online_cpus();
 }
 
 static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
 {
 	int cpu = smp_processor_id();
-	unsigned int x;
+	unsigned int *stats_updates_percpu;
 
 	if (!val)
 		return;
@@ -710,10 +704,10 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
 	cgroup_rstat_updated(memcg->css.cgroup, cpu);
 
 	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
-		x = __this_cpu_add_return(memcg->vmstats_percpu->stats_updates,
-					  abs(val));
+		stats_updates_percpu = this_cpu_ptr(memcg->stats_updates_percpu);
 
-		if (x < MEMCG_CHARGE_BATCH)
+		*stats_updates_percpu += abs(val);
+		if (*stats_updates_percpu < MEMCG_CHARGE_BATCH)
 			continue;
 
 		/*
@@ -721,8 +715,8 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
 		 * redundant. Avoid the overhead of the atomic update.
 		 */
 		if (!memcg_should_flush_stats(memcg))
-			atomic64_add(x, &memcg->vmstats->stats_updates);
-		__this_cpu_write(memcg->vmstats_percpu->stats_updates, 0);
+			atomic64_add(*stats_updates_percpu, &memcg->stats_updates);
+		*stats_updates_percpu = 0;
 	}
 }
 
@@ -5467,6 +5461,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 		free_mem_cgroup_per_node_info(memcg, node);
 	kfree(memcg->vmstats);
 	free_percpu(memcg->vmstats_percpu);
+	free_percpu(memcg->stats_updates_percpu);
 	kfree(memcg);
 }
 
@@ -5504,6 +5499,11 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
 	if (!memcg->vmstats_percpu)
 		goto fail;
 
+	memcg->stats_updates_percpu = alloc_percpu_gfp(unsigned int,
+						       GFP_KERNEL_ACCOUNT);
+	if (!memcg->stats_updates_percpu)
+		goto fail;
+
 	for_each_node(node)
 		if (alloc_mem_cgroup_per_node_info(memcg, node))
 			goto fail;
@@ -5735,10 +5735,12 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
 	struct mem_cgroup *parent = parent_mem_cgroup(memcg);
 	struct memcg_vmstats_percpu *statc;
+	int *stats_updates_percpu;
 	long delta, delta_cpu, v;
 	int i, nid;
 
 	statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
+	stats_updates_percpu = per_cpu_ptr(memcg->stats_updates_percpu, cpu);
 
 	for (i = 0; i < MEMCG_NR_STAT; i++) {
 		/*
@@ -5826,9 +5828,10 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
 			}
 		}
 	}
+	*stats_updates_percpu = 0;
 	/* We are in a per-cpu loop here, only do the atomic write once */
-	if (atomic64_read(&memcg->vmstats->stats_updates))
-		atomic64_set(&memcg->vmstats->stats_updates, 0);
+	if (atomic64_read(&memcg->stats_updates))
+		atomic64_set(&memcg->stats_updates, 0);
 }
 
 #ifdef CONFIG_MMU
-- 
2.42.0.758.gaed0368e0e-goog