[PATCH RFC 3/5] mm: memcg/percpu: per-memcg percpu memory statistics

From: Roman Gushchin <guro@fb.com>
To: Andrew Morton <akpm@linux-foundation.org>,
	Dennis Zhou <dennis@kernel.org>, Tejun Heo <tj@kernel.org>,
	Christoph Lameter <cl@linux.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>,
	Michal Hocko <mhocko@kernel.org>,
	Shakeel Butt <shakeelb@google.com>, <linux-mm@kvack.org>,
	<kernel-team@fb.com>, <linux-kernel@vger.kernel.org>,
	Roman Gushchin <guro@fb.com>
Subject: [PATCH RFC 3/5] mm: memcg/percpu: per-memcg percpu memory statistics
Date: Tue, 19 May 2020 13:18:04 -0700	[thread overview]
Message-ID: <20200519201806.2308480-4-guro@fb.com> (raw)
In-Reply-To: <20200519201806.2308480-1-guro@fb.com>

Percpu memory can represent a noticeable chunk of the total
memory consumption, especially on big machines with many CPUs.
Let's track percpu memory usage for each memcg and display
it in memory.stat.

A percpu allocation is usually scattered over multiple pages
(and nodes), and can be significantly smaller than a page.
So let's add a byte-sized counter on the memcg level:
MEMCG_PERCPU_B. The byte-sized vmstat infrastructure created for
slabs can be reused as-is for the percpu case.

Signed-off-by: Roman Gushchin <guro@fb.com>
---
 Documentation/admin-guide/cgroup-v2.rst |  4 ++++
 include/linux/memcontrol.h              |  8 ++++++++
 mm/memcontrol.c                         |  4 +++-
 mm/percpu.c                             | 10 ++++++++++
 4 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index fed4e1d2a343..aa8cb6dadadc 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1276,6 +1276,10 @@ PAGE_SIZE multiple when read back.
 		Amount of memory used for storing in-kernel data
 		structures.
 
+	  percpu
+		Amount of memory used for storing per-cpu kernel
+		data structures.
+
 	  sock
 		Amount of memory used in network transmission buffers
 
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 968b73cad428..a3086c832799 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -32,11 +32,19 @@ struct kmem_cache;
 enum memcg_stat_item {
 	MEMCG_SWAP = NR_VM_NODE_STAT_ITEMS,
 	MEMCG_SOCK,
+	MEMCG_PERCPU_B,
 	/* XXX: why are these zone and not node counters? */
 	MEMCG_KERNEL_STACK_KB,
 	MEMCG_NR_STAT,
 };
 
+static __always_inline bool memcg_stat_item_in_bytes(enum memcg_stat_item item)
+{
+	if (item == MEMCG_PERCPU_B)
+		return true;
+	return vmstat_item_in_bytes(item);
+}
+
 enum memcg_memory_event {
 	MEMCG_LOW,
 	MEMCG_HIGH,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 84a1fb848a85..b3cf8c5e7f71 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -783,7 +783,7 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
 	if (mem_cgroup_disabled())
 		return;
 
-	if (vmstat_item_in_bytes(idx))
+	if (memcg_stat_item_in_bytes(idx))
 		threshold <<= PAGE_SHIFT;
 
 	x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
@@ -1490,6 +1490,8 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
 	seq_buf_printf(&s, "slab %llu\n",
 		       (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) +
 			     memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B)));
+	seq_buf_printf(&s, "percpu %llu\n",
+		       (u64)memcg_page_state(memcg, MEMCG_PERCPU_B));
 	seq_buf_printf(&s, "sock %llu\n",
 		       (u64)memcg_page_state(memcg, MEMCG_SOCK) *
 		       PAGE_SIZE);
diff --git a/mm/percpu.c b/mm/percpu.c
index 4615a0a809b8..71de0bae96f0 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1603,6 +1603,11 @@ static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
 
 	if (chunk) {
 		chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = objcg;
+
+		rcu_read_lock();
+		mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B,
+				size * num_possible_cpus());
+		rcu_read_unlock();
 	} else {
 		obj_cgroup_uncharge(objcg, size * num_possible_cpus());
 		obj_cgroup_put(objcg);
@@ -1621,6 +1626,11 @@ static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
 
 	obj_cgroup_uncharge(objcg, size * num_possible_cpus());
 
+	rcu_read_lock();
+	mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B,
+			-(size * num_possible_cpus()));
+	rcu_read_unlock();
+
 	obj_cgroup_put(objcg);
 }
 
-- 
2.25.4