From: Tejun Heo <tj@kernel.org>
To: hannes@cmpxchg.org, mhocko@kernel.org, vdavydov.dev@gmail.com
Cc: guro@fb.com, riel@surriel.com, akpm@linux-foundation.org,
	linux-kernel@vger.kernel.org, kernel-team@fb.com,
	cgroups@vger.kernel.org, linux-mm@kvack.org,
	Tejun Heo <tj@kernel.org>
Subject: [PATCH 1/3] mm: memcontrol: Use cgroup_rstat for event accounting
Date: Sat, 24 Mar 2018 09:08:59 -0700
Message-ID: <20180324160901.512135-2-tj@kernel.org>
In-Reply-To: <20180324160901.512135-1-tj@kernel.org>

To fix a scalability issue, a983b5ebee57 ("mm: memcontrol: fix
excessive complexity in memory.stat reporting") made the per-cpu
counters batch-overflow into the global counters instead of being
summed up on reads.
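
For reference, the batched update path that commit introduced spills
the per-cpu count into a global atomic only once it crosses
MEMCG_CHARGE_BATCH, so a reader can lag behind by up to roughly
nr_cpus * MEMCG_CHARGE_BATCH events (simplified from the version of
__count_memcg_events() removed below):

	x = count + __this_cpu_read(memcg->stat_cpu->events[idx]);
	if (unlikely(x > MEMCG_CHARGE_BATCH)) {
		atomic_long_add(x, &memcg->events[idx]);
		x = 0;
	}
	__this_cpu_write(memcg->stat_cpu->events[idx], x);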

While that approach works for statistics which can tolerate small,
controlled errors, it doesn't work for events.  For example, when a
process in a cgroup is oom killed, a notification is generated on the
memory.events file.  A user reading the file after receiving the
notification must be able to see the increased oom_kill event counter,
but often can't due to the per-cpu batching.

The problem the original commit tried to fix was the excessively high
complexity on reads.  That can be solved using cgroup_rstat instead,
which has the following properties (a sketch of the resulting update
and read paths follows the list).

* Per-cpu stat updates with a small bookkeeping overhead.

* Lazy, accurate and on-demand hierarchical stat propagation with
  complexity O(number of cgroups which have been active in the
  subtree since the last read).
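
Concretely, with cgroup_rstat the update path becomes a plain per-cpu
add plus an rstat notification, and readers flush on demand around
their accesses; both snippets are taken from the hunks below, with the
per-cpu deltas folded in by the new mem_cgroup_css_rstat_flush()
callback:

	/* update (hot path): cheap per-cpu bookkeeping only */
	__this_cpu_add(memcg->stat_cpu->events[idx], count);
	cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());

	/* read (cold path): flush, read, release */
	cgroup_rstat_flush_hold(memcg->css.cgroup);
	seq_printf(sf, "oom_kill %llu\n", memcg->events[OOM_KILL]);
	cgroup_rstat_flush_release();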

This patch converts event accounting to use cgroup_rstat.

* mem_cgroup_stat_cpu->last_events[] and mem_cgroup->pending_events[]
  are added to track propagation.  As memcg makes use of both local
  and hierarchical stats, mem_cgroup->tree_events[] is added to track
  hierarchical numbers.

* The per-cpu counters are unsigned long and the collected counters
  are unsigned long long.  This keeps stat updates simple while
  avoiding overflows in the collected counters on 32bit machines.  The
  existing code used unsigned long long in some places but not
  consistently enough to avoid overflows (see the wrap-safety sketch
  after this list).

* memcg_sum_events() and tree_events() are replaced with direct
  accesses to mem_cgroup->events[] and ->tree_events[].  The accesses
  are wrapped between cgroup_rstat_flush_hold() and
  cgroup_rstat_flush_release().
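
The delta calculation in mem_cgroup_css_rstat_flush() below depends on
wrap-safe unsigned subtraction: as long as a cpu generates fewer than
ULONG_MAX events between two flushes, the delta comes out correct even
if the per-cpu unsigned long has wrapped on a 32bit machine, and the
unsigned long long accumulators never realistically overflow:

	v = READ_ONCE(statc->events[i]);
	delta = v - statc->last_events[i];	/* wrap-safe */
	statc->last_events[i] = v;
	memcg->events[i] += delta;		/* 64bit accumulator */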

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h |  19 ++++----
 mm/memcontrol.c            | 114 ++++++++++++++++++++++++---------------------
 2 files changed, 70 insertions(+), 63 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index c46016b..f1afbf6 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -91,6 +91,9 @@ struct mem_cgroup_stat_cpu {
 	unsigned long events[MEMCG_NR_EVENTS];
 	unsigned long nr_page_events;
 	unsigned long targets[MEM_CGROUP_NTARGETS];
+
+	/* for cgroup rstat delta calculation */
+	unsigned long last_events[MEMCG_NR_EVENTS];
 };
 
 struct mem_cgroup_reclaim_iter {
@@ -233,7 +236,11 @@ struct mem_cgroup {
 
 	struct mem_cgroup_stat_cpu __percpu *stat_cpu;
 	atomic_long_t		stat[MEMCG_NR_STAT];
-	atomic_long_t		events[MEMCG_NR_EVENTS];
+
+	/* events is managed by cgroup rstat */
+	unsigned long long	events[MEMCG_NR_EVENTS];	/* local */
+	unsigned long long	tree_events[MEMCG_NR_EVENTS];	/* subtree */
+	unsigned long long	pending_events[MEMCG_NR_EVENTS];/* propagation */
 
 	unsigned long		socket_pressure;
 
@@ -649,17 +656,11 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
 static inline void __count_memcg_events(struct mem_cgroup *memcg,
 					int idx, unsigned long count)
 {
-	unsigned long x;
-
 	if (mem_cgroup_disabled())
 		return;
 
-	x = count + __this_cpu_read(memcg->stat_cpu->events[idx]);
-	if (unlikely(x > MEMCG_CHARGE_BATCH)) {
-		atomic_long_add(x, &memcg->events[idx]);
-		x = 0;
-	}
-	__this_cpu_write(memcg->stat_cpu->events[idx], x);
+	__this_cpu_add(memcg->stat_cpu->events[idx], count);
+	cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
 }
 
 static inline void count_memcg_events(struct mem_cgroup *memcg,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 670e99b..82cb532 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -542,12 +542,6 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
 	return mz;
 }
 
-static unsigned long memcg_sum_events(struct mem_cgroup *memcg,
-				      int event)
-{
-	return atomic_long_read(&memcg->events[event]);
-}
-
 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
 					 struct page *page,
 					 bool compound, int nr_pages)
@@ -1838,14 +1832,6 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu)
 					atomic_long_add(x, &pn->lruvec_stat[i]);
 			}
 		}
-
-		for (i = 0; i < MEMCG_NR_EVENTS; i++) {
-			long x;
-
-			x = this_cpu_xchg(memcg->stat_cpu->events[i], 0);
-			if (x)
-				atomic_long_add(x, &memcg->events[i]);
-		}
 	}
 
 	return 0;
@@ -2683,19 +2669,6 @@ static void tree_stat(struct mem_cgroup *memcg, unsigned long *stat)
 	}
 }
 
-static void tree_events(struct mem_cgroup *memcg, unsigned long *events)
-{
-	struct mem_cgroup *iter;
-	int i;
-
-	memset(events, 0, sizeof(*events) * MEMCG_NR_EVENTS);
-
-	for_each_mem_cgroup_tree(iter, memcg) {
-		for (i = 0; i < MEMCG_NR_EVENTS; i++)
-			events[i] += memcg_sum_events(iter, i);
-	}
-}
-
 static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
 {
 	unsigned long val = 0;
@@ -3107,6 +3080,8 @@ static int memcg_stat_show(struct seq_file *m, void *v)
 	BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
 	BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
 
+	cgroup_rstat_flush_hold(memcg->css.cgroup);
+
 	for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
 		if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
 			continue;
@@ -3116,8 +3091,8 @@ static int memcg_stat_show(struct seq_file *m, void *v)
 	}
 
 	for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
-		seq_printf(m, "%s %lu\n", memcg1_event_names[i],
-			   memcg_sum_events(memcg, memcg1_events[i]));
+		seq_printf(m, "%s %llu\n", memcg1_event_names[i],
+			   memcg->events[memcg1_events[i]]);
 
 	for (i = 0; i < NR_LRU_LISTS; i++)
 		seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
@@ -3146,13 +3121,9 @@ static int memcg_stat_show(struct seq_file *m, void *v)
 		seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], val);
 	}
 
-	for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) {
-		unsigned long long val = 0;
-
-		for_each_mem_cgroup_tree(mi, memcg)
-			val += memcg_sum_events(mi, memcg1_events[i]);
-		seq_printf(m, "total_%s %llu\n", memcg1_event_names[i], val);
-	}
+	for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
+		seq_printf(m, "total_%s %llu\n", memcg1_event_names[i],
+			   memcg->tree_events[memcg1_events[i]]);
 
 	for (i = 0; i < NR_LRU_LISTS; i++) {
 		unsigned long long val = 0;
@@ -3185,7 +3156,7 @@ static int memcg_stat_show(struct seq_file *m, void *v)
 		seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);
 	}
 #endif
-
+	cgroup_rstat_flush_release();
 	return 0;
 }
 
@@ -3538,9 +3509,13 @@ static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
 
+	cgroup_rstat_flush_hold(memcg->css.cgroup);
+
 	seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
 	seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
-	seq_printf(sf, "oom_kill %lu\n", memcg_sum_events(memcg, OOM_KILL));
+	seq_printf(sf, "oom_kill %llu\n", memcg->events[OOM_KILL]);
+
+	cgroup_rstat_flush_release();
 	return 0;
 }
 
@@ -4327,6 +4302,32 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
 	memcg_wb_domain_size_changed(memcg);
 }
 
+static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+	struct mem_cgroup *parent = parent_mem_cgroup(memcg);
+	struct mem_cgroup_stat_cpu *statc = per_cpu_ptr(memcg->stat_cpu, cpu);
+	unsigned long v, delta;
+	int i;
+
+	for (i = 0; i < MEMCG_NR_EVENTS; i++) {
+		/* calculate the delta to propagate and add to local stat */
+		v = READ_ONCE(statc->events[i]);
+		delta = v - statc->last_events[i];
+		statc->last_events[i] = v;
+		memcg->events[i] += delta;
+
+		/* transfer the pending stat into delta */
+		delta += memcg->pending_events[i];
+		memcg->pending_events[i] = 0;
+
+		/* propagate delta into tree stat and parent's pending */
+		memcg->tree_events[i] += delta;
+		if (parent)
+			parent->pending_events[i] += delta;
+	}
+}
+
 #ifdef CONFIG_MMU
 /* Handlers for move charge at task migration. */
 static int mem_cgroup_do_precharge(unsigned long count)
@@ -5191,12 +5192,15 @@ static int memory_events_show(struct seq_file *m, void *v)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
 
-	seq_printf(m, "low %lu\n", memcg_sum_events(memcg, MEMCG_LOW));
-	seq_printf(m, "high %lu\n", memcg_sum_events(memcg, MEMCG_HIGH));
-	seq_printf(m, "max %lu\n", memcg_sum_events(memcg, MEMCG_MAX));
-	seq_printf(m, "oom %lu\n", memcg_sum_events(memcg, MEMCG_OOM));
-	seq_printf(m, "oom_kill %lu\n", memcg_sum_events(memcg, OOM_KILL));
+	cgroup_rstat_flush_hold(memcg->css.cgroup);
+
+	seq_printf(m, "low %llu\n", memcg->events[MEMCG_LOW]);
+	seq_printf(m, "high %llu\n", memcg->events[MEMCG_HIGH]);
+	seq_printf(m, "max %llu\n", memcg->events[MEMCG_MAX]);
+	seq_printf(m, "oom %llu\n", memcg->events[MEMCG_OOM]);
+	seq_printf(m, "oom_kill %llu\n", memcg->events[OOM_KILL]);
 
+	cgroup_rstat_flush_release();
 	return 0;
 }
 
@@ -5204,7 +5208,7 @@ static int memory_stat_show(struct seq_file *m, void *v)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
 	unsigned long stat[MEMCG_NR_STAT];
-	unsigned long events[MEMCG_NR_EVENTS];
+	unsigned long long *events = memcg->tree_events;
 	int i;
 
 	/*
@@ -5219,7 +5223,7 @@ static int memory_stat_show(struct seq_file *m, void *v)
 	 */
 
 	tree_stat(memcg, stat);
-	tree_events(memcg, events);
+	cgroup_rstat_flush_hold(memcg->css.cgroup);
 
 	seq_printf(m, "anon %llu\n",
 		   (u64)stat[MEMCG_RSS] * PAGE_SIZE);
@@ -5259,18 +5263,18 @@ static int memory_stat_show(struct seq_file *m, void *v)
 
 	/* Accumulated memory events */
 
-	seq_printf(m, "pgfault %lu\n", events[PGFAULT]);
-	seq_printf(m, "pgmajfault %lu\n", events[PGMAJFAULT]);
+	seq_printf(m, "pgfault %llu\n", events[PGFAULT]);
+	seq_printf(m, "pgmajfault %llu\n", events[PGMAJFAULT]);
 
-	seq_printf(m, "pgrefill %lu\n", events[PGREFILL]);
-	seq_printf(m, "pgscan %lu\n", events[PGSCAN_KSWAPD] +
+	seq_printf(m, "pgrefill %llu\n", events[PGREFILL]);
+	seq_printf(m, "pgscan %llu\n", events[PGSCAN_KSWAPD] +
 		   events[PGSCAN_DIRECT]);
-	seq_printf(m, "pgsteal %lu\n", events[PGSTEAL_KSWAPD] +
+	seq_printf(m, "pgsteal %llu\n", events[PGSTEAL_KSWAPD] +
 		   events[PGSTEAL_DIRECT]);
-	seq_printf(m, "pgactivate %lu\n", events[PGACTIVATE]);
-	seq_printf(m, "pgdeactivate %lu\n", events[PGDEACTIVATE]);
-	seq_printf(m, "pglazyfree %lu\n", events[PGLAZYFREE]);
-	seq_printf(m, "pglazyfreed %lu\n", events[PGLAZYFREED]);
+	seq_printf(m, "pgactivate %llu\n", events[PGACTIVATE]);
+	seq_printf(m, "pgdeactivate %llu\n", events[PGDEACTIVATE]);
+	seq_printf(m, "pglazyfree %llu\n", events[PGLAZYFREE]);
+	seq_printf(m, "pglazyfreed %llu\n", events[PGLAZYFREED]);
 
 	seq_printf(m, "workingset_refault %lu\n",
 		   stat[WORKINGSET_REFAULT]);
@@ -5279,6 +5283,7 @@ static int memory_stat_show(struct seq_file *m, void *v)
 	seq_printf(m, "workingset_nodereclaim %lu\n",
 		   stat[WORKINGSET_NODERECLAIM]);
 
+	cgroup_rstat_flush_release();
 	return 0;
 }
 
@@ -5327,6 +5332,7 @@ struct cgroup_subsys memory_cgrp_subsys = {
 	.css_released = mem_cgroup_css_released,
 	.css_free = mem_cgroup_css_free,
 	.css_reset = mem_cgroup_css_reset,
+	.css_rstat_flush = mem_cgroup_css_rstat_flush,
 	.can_attach = mem_cgroup_can_attach,
 	.cancel_attach = mem_cgroup_cancel_attach,
 	.post_attach = mem_cgroup_move_task,
-- 
2.9.5

