All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] mm: memcg: update memcg OOM messages on cgroup2
@ 2018-08-03 17:57 Tejun Heo
  2018-08-03 20:30 ` Roman Gushchin
                   ` (2 more replies)
  0 siblings, 3 replies; 14+ messages in thread
From: Tejun Heo @ 2018-08-03 17:57 UTC (permalink / raw)
  To: Andrew Morton, Roman Gushchin, Johannes Weiner, Michal Hocko,
	Vladimir Davydov
  Cc: cgroups, linux-mm, kernel-team

mem_cgroup_print_oom_info() currently prints the same info for cgroup1
and cgroup2 OOMs.  It doesn't make much sense on cgroup2, which
doesn't use memsw or separate kmem accounting - the information
reported is both superflous and insufficient.  This patch updates the
memcg OOM messages on cgroup2 so that

* It prints memory and swap usages and limits used on cgroup2.

* It shows the same information as memory.stat.

I took out the recursive printing for cgroup2 because the amount of
output could be a lot and the benefits aren't clear.

What do you guys think?

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 mm/memcontrol.c |  165 ++++++++++++++++++++++++++++++++------------------------
 1 file changed, 96 insertions(+), 69 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8c0280b3143e..86133e50a0b2 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -177,6 +177,7 @@ struct mem_cgroup_event {
 
 static void mem_cgroup_threshold(struct mem_cgroup *memcg);
 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
+static void __memory_stat_show(struct seq_file *m, struct mem_cgroup *memcg);
 
 /* Stuffs for move charges at task migration. */
 /*
@@ -1146,33 +1147,49 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
 
 	rcu_read_unlock();
 
-	pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
-		K((u64)page_counter_read(&memcg->memory)),
-		K((u64)memcg->memory.max), memcg->memory.failcnt);
-	pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
-		K((u64)page_counter_read(&memcg->memsw)),
-		K((u64)memcg->memsw.max), memcg->memsw.failcnt);
-	pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
-		K((u64)page_counter_read(&memcg->kmem)),
-		K((u64)memcg->kmem.max), memcg->kmem.failcnt);
+	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
+		pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
+			K((u64)page_counter_read(&memcg->memory)),
+			K((u64)memcg->memory.max), memcg->memory.failcnt);
+		pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
+			K((u64)page_counter_read(&memcg->memsw)),
+			K((u64)memcg->memsw.max), memcg->memsw.failcnt);
+		pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
+			K((u64)page_counter_read(&memcg->kmem)),
+			K((u64)memcg->kmem.max), memcg->kmem.failcnt);
 
-	for_each_mem_cgroup_tree(iter, memcg) {
-		pr_info("Memory cgroup stats for ");
-		pr_cont_cgroup_path(iter->css.cgroup);
-		pr_cont(":");
+		for_each_mem_cgroup_tree(iter, memcg) {
+			pr_info("Memory cgroup stats for ");
+			pr_cont_cgroup_path(iter->css.cgroup);
+			pr_cont(":");
+
+			for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
+				if (memcg1_stats[i] == MEMCG_SWAP && !do_swap_account)
+					continue;
+				pr_cont(" %s:%luKB", memcg1_stat_names[i],
+					K(memcg_page_state(iter, memcg1_stats[i])));
+			}
 
-		for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
-			if (memcg1_stats[i] == MEMCG_SWAP && !do_swap_account)
-				continue;
-			pr_cont(" %s:%luKB", memcg1_stat_names[i],
-				K(memcg_page_state(iter, memcg1_stats[i])));
+			for (i = 0; i < NR_LRU_LISTS; i++)
+				pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
+					K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
+
+			pr_cont("\n");
 		}
+	} else {
+		pr_info("memory %llu (max %llu)\n",
+			(u64)page_counter_read(&memcg->memory) * PAGE_SIZE,
+			(u64)memcg->memory.max * PAGE_SIZE);
 
-		for (i = 0; i < NR_LRU_LISTS; i++)
-			pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
-				K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
+		if (memcg->swap.max == PAGE_COUNTER_MAX)
+			pr_info("swap %llu\n",
+				(u64)page_counter_read(&memcg->swap) * PAGE_SIZE);
+		else
+			pr_info("swap %llu (max %llu)\n",
+				(u64)page_counter_read(&memcg->swap) * PAGE_SIZE,
+				(u64)memcg->swap.max * PAGE_SIZE);
 
-		pr_cont("\n");
+		__memory_stat_show(NULL, memcg);
 	}
 }
 
@@ -5246,9 +5263,15 @@ static int memory_events_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int memory_stat_show(struct seq_file *m, void *v)
+#define seq_pr_info(m, fmt, ...) do {					\
+	if ((m))							\
+		seq_printf(m, fmt, ##__VA_ARGS__);			\
+	else								\
+		printk(KERN_INFO fmt, ##__VA_ARGS__);			\
+} while (0)
+
+static void __memory_stat_show(struct seq_file *m, struct mem_cgroup *memcg)
 {
-	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
 	unsigned long stat[MEMCG_NR_STAT];
 	unsigned long events[NR_VM_EVENT_ITEMS];
 	int i;
@@ -5267,26 +5290,26 @@ static int memory_stat_show(struct seq_file *m, void *v)
 	tree_stat(memcg, stat);
 	tree_events(memcg, events);
 
-	seq_printf(m, "anon %llu\n",
-		   (u64)stat[MEMCG_RSS] * PAGE_SIZE);
-	seq_printf(m, "file %llu\n",
-		   (u64)stat[MEMCG_CACHE] * PAGE_SIZE);
-	seq_printf(m, "kernel_stack %llu\n",
-		   (u64)stat[MEMCG_KERNEL_STACK_KB] * 1024);
-	seq_printf(m, "slab %llu\n",
-		   (u64)(stat[NR_SLAB_RECLAIMABLE] +
-			 stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
-	seq_printf(m, "sock %llu\n",
-		   (u64)stat[MEMCG_SOCK] * PAGE_SIZE);
-
-	seq_printf(m, "shmem %llu\n",
-		   (u64)stat[NR_SHMEM] * PAGE_SIZE);
-	seq_printf(m, "file_mapped %llu\n",
-		   (u64)stat[NR_FILE_MAPPED] * PAGE_SIZE);
-	seq_printf(m, "file_dirty %llu\n",
-		   (u64)stat[NR_FILE_DIRTY] * PAGE_SIZE);
-	seq_printf(m, "file_writeback %llu\n",
-		   (u64)stat[NR_WRITEBACK] * PAGE_SIZE);
+	seq_pr_info(m, "anon %llu\n",
+		    (u64)stat[MEMCG_RSS] * PAGE_SIZE);
+	seq_pr_info(m, "file %llu\n",
+		    (u64)stat[MEMCG_CACHE] * PAGE_SIZE);
+	seq_pr_info(m, "kernel_stack %llu\n",
+		    (u64)stat[MEMCG_KERNEL_STACK_KB] * 1024);
+	seq_pr_info(m, "slab %llu\n",
+		    (u64)(stat[NR_SLAB_RECLAIMABLE] +
+			  stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
+	seq_pr_info(m, "sock %llu\n",
+		    (u64)stat[MEMCG_SOCK] * PAGE_SIZE);
+
+	seq_pr_info(m, "shmem %llu\n",
+		    (u64)stat[NR_SHMEM] * PAGE_SIZE);
+	seq_pr_info(m, "file_mapped %llu\n",
+		    (u64)stat[NR_FILE_MAPPED] * PAGE_SIZE);
+	seq_pr_info(m, "file_dirty %llu\n",
+		    (u64)stat[NR_FILE_DIRTY] * PAGE_SIZE);
+	seq_pr_info(m, "file_writeback %llu\n",
+		    (u64)stat[NR_WRITEBACK] * PAGE_SIZE);
 
 	for (i = 0; i < NR_LRU_LISTS; i++) {
 		struct mem_cgroup *mi;
@@ -5294,37 +5317,41 @@ static int memory_stat_show(struct seq_file *m, void *v)
 
 		for_each_mem_cgroup_tree(mi, memcg)
 			val += mem_cgroup_nr_lru_pages(mi, BIT(i));
-		seq_printf(m, "%s %llu\n",
-			   mem_cgroup_lru_names[i], (u64)val * PAGE_SIZE);
+		seq_pr_info(m, "%s %llu\n",
+			    mem_cgroup_lru_names[i], (u64)val * PAGE_SIZE);
 	}
 
-	seq_printf(m, "slab_reclaimable %llu\n",
-		   (u64)stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE);
-	seq_printf(m, "slab_unreclaimable %llu\n",
-		   (u64)stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
+	seq_pr_info(m, "slab_reclaimable %llu\n",
+		    (u64)stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE);
+	seq_pr_info(m, "slab_unreclaimable %llu\n",
+		    (u64)stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
 
 	/* Accumulated memory events */
 
-	seq_printf(m, "pgfault %lu\n", events[PGFAULT]);
-	seq_printf(m, "pgmajfault %lu\n", events[PGMAJFAULT]);
-
-	seq_printf(m, "pgrefill %lu\n", events[PGREFILL]);
-	seq_printf(m, "pgscan %lu\n", events[PGSCAN_KSWAPD] +
-		   events[PGSCAN_DIRECT]);
-	seq_printf(m, "pgsteal %lu\n", events[PGSTEAL_KSWAPD] +
-		   events[PGSTEAL_DIRECT]);
-	seq_printf(m, "pgactivate %lu\n", events[PGACTIVATE]);
-	seq_printf(m, "pgdeactivate %lu\n", events[PGDEACTIVATE]);
-	seq_printf(m, "pglazyfree %lu\n", events[PGLAZYFREE]);
-	seq_printf(m, "pglazyfreed %lu\n", events[PGLAZYFREED]);
-
-	seq_printf(m, "workingset_refault %lu\n",
-		   stat[WORKINGSET_REFAULT]);
-	seq_printf(m, "workingset_activate %lu\n",
-		   stat[WORKINGSET_ACTIVATE]);
-	seq_printf(m, "workingset_nodereclaim %lu\n",
-		   stat[WORKINGSET_NODERECLAIM]);
+	seq_pr_info(m, "pgfault %lu\n", events[PGFAULT]);
+	seq_pr_info(m, "pgmajfault %lu\n", events[PGMAJFAULT]);
+
+	seq_pr_info(m, "pgrefill %lu\n", events[PGREFILL]);
+	seq_pr_info(m, "pgscan %lu\n", events[PGSCAN_KSWAPD] +
+		    events[PGSCAN_DIRECT]);
+	seq_pr_info(m, "pgsteal %lu\n", events[PGSTEAL_KSWAPD] +
+		    events[PGSTEAL_DIRECT]);
+	seq_pr_info(m, "pgactivate %lu\n", events[PGACTIVATE]);
+	seq_pr_info(m, "pgdeactivate %lu\n", events[PGDEACTIVATE]);
+	seq_pr_info(m, "pglazyfree %lu\n", events[PGLAZYFREE]);
+	seq_pr_info(m, "pglazyfreed %lu\n", events[PGLAZYFREED]);
 
+	seq_pr_info(m, "workingset_refault %lu\n",
+		    stat[WORKINGSET_REFAULT]);
+	seq_pr_info(m, "workingset_activate %lu\n",
+		    stat[WORKINGSET_ACTIVATE]);
+	seq_pr_info(m, "workingset_nodereclaim %lu\n",
+		    stat[WORKINGSET_NODERECLAIM]);
+}
+
+static int memory_stat_show(struct seq_file *m, void *v)
+{
+	__memory_stat_show(m, mem_cgroup_from_css(seq_css(m)));
 	return 0;
 }
 

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH] mm: memcg: update memcg OOM messages on cgroup2
  2018-08-03 17:57 [PATCH] mm: memcg: update memcg OOM messages on cgroup2 Tejun Heo
@ 2018-08-03 20:30 ` Roman Gushchin
  2018-08-06 15:55   ` Johannes Weiner
  2018-08-06 15:48 ` Johannes Weiner
  2018-08-06 16:15 ` [PATCH v2] " Tejun Heo
  2 siblings, 1 reply; 14+ messages in thread
From: Roman Gushchin @ 2018-08-03 20:30 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Andrew Morton, Johannes Weiner, Michal Hocko, Vladimir Davydov,
	cgroups, linux-mm, kernel-team

On Fri, Aug 03, 2018 at 10:57:43AM -0700, Tejun Heo wrote:
> mem_cgroup_print_oom_info() currently prints the same info for cgroup1
> and cgroup2 OOMs.  It doesn't make much sense on cgroup2, which
> doesn't use memsw or separate kmem accounting - the information
> reported is both superflous and insufficient.  This patch updates the
> memcg OOM messages on cgroup2 so that
> 
> * It prints memory and swap usages and limits used on cgroup2.
> 
> * It shows the same information as memory.stat.
> 
> I took out the recursive printing for cgroup2 because the amount of
> output could be a lot and the benefits aren't clear.
> 
> What do you guys think?

It makes total sense for me.
I'd only add an example of output into the commit message,
as well as an explicit statement that output format is changing
for v2 only.

Small nit below.

> 
> Signed-off-by: Tejun Heo <tj@kernel.org>
> ---
>  mm/memcontrol.c |  165 ++++++++++++++++++++++++++++++++------------------------
>  1 file changed, 96 insertions(+), 69 deletions(-)
> 
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 8c0280b3143e..86133e50a0b2 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -177,6 +177,7 @@ struct mem_cgroup_event {
>  
>  static void mem_cgroup_threshold(struct mem_cgroup *memcg);
>  static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
> +static void __memory_stat_show(struct seq_file *m, struct mem_cgroup *memcg);
>  
>  /* Stuffs for move charges at task migration. */
>  /*
> @@ -1146,33 +1147,49 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
>  
>  	rcu_read_unlock();
>  
> -	pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
> -		K((u64)page_counter_read(&memcg->memory)),
> -		K((u64)memcg->memory.max), memcg->memory.failcnt);
> -	pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
> -		K((u64)page_counter_read(&memcg->memsw)),
> -		K((u64)memcg->memsw.max), memcg->memsw.failcnt);
> -	pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
> -		K((u64)page_counter_read(&memcg->kmem)),
> -		K((u64)memcg->kmem.max), memcg->kmem.failcnt);
> +	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
> +		pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
> +			K((u64)page_counter_read(&memcg->memory)),
> +			K((u64)memcg->memory.max), memcg->memory.failcnt);
> +		pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
> +			K((u64)page_counter_read(&memcg->memsw)),
> +			K((u64)memcg->memsw.max), memcg->memsw.failcnt);
> +		pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
> +			K((u64)page_counter_read(&memcg->kmem)),
> +			K((u64)memcg->kmem.max), memcg->kmem.failcnt);
>  
> -	for_each_mem_cgroup_tree(iter, memcg) {
> -		pr_info("Memory cgroup stats for ");
> -		pr_cont_cgroup_path(iter->css.cgroup);
> -		pr_cont(":");
> +		for_each_mem_cgroup_tree(iter, memcg) {
> +			pr_info("Memory cgroup stats for ");
> +			pr_cont_cgroup_path(iter->css.cgroup);
> +			pr_cont(":");
> +
> +			for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
> +				if (memcg1_stats[i] == MEMCG_SWAP && !do_swap_account)
> +					continue;
> +				pr_cont(" %s:%luKB", memcg1_stat_names[i],
> +					K(memcg_page_state(iter, memcg1_stats[i])));
> +			}
>  
> -		for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
> -			if (memcg1_stats[i] == MEMCG_SWAP && !do_swap_account)
> -				continue;
> -			pr_cont(" %s:%luKB", memcg1_stat_names[i],
> -				K(memcg_page_state(iter, memcg1_stats[i])));
> +			for (i = 0; i < NR_LRU_LISTS; i++)
> +				pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
> +					K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
> +
> +			pr_cont("\n");
>  		}
> +	} else {
> +		pr_info("memory %llu (max %llu)\n",
> +			(u64)page_counter_read(&memcg->memory) * PAGE_SIZE,
> +			(u64)memcg->memory.max * PAGE_SIZE);
>  
> -		for (i = 0; i < NR_LRU_LISTS; i++)
> -			pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
> -				K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
> +		if (memcg->swap.max == PAGE_COUNTER_MAX)
> +			pr_info("swap %llu\n",
> +				(u64)page_counter_read(&memcg->swap) * PAGE_SIZE);
> +		else
> +			pr_info("swap %llu (max %llu)\n",
> +				(u64)page_counter_read(&memcg->swap) * PAGE_SIZE,
> +				(u64)memcg->swap.max * PAGE_SIZE);
>  
> -		pr_cont("\n");
> +		__memory_stat_show(NULL, memcg);
>  	}
>  }
>  
> @@ -5246,9 +5263,15 @@ static int memory_events_show(struct seq_file *m, void *v)
>  	return 0;
>  }
>  
> -static int memory_stat_show(struct seq_file *m, void *v)
> +#define seq_pr_info(m, fmt, ...) do {					\
> +	if ((m))							\
> +		seq_printf(m, fmt, ##__VA_ARGS__);			\
> +	else								\
> +		printk(KERN_INFO fmt, ##__VA_ARGS__);			\
> +} while (0)
> +
> +static void __memory_stat_show(struct seq_file *m, struct mem_cgroup *memcg)
>  {
> -	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
>  	unsigned long stat[MEMCG_NR_STAT];
>  	unsigned long events[NR_VM_EVENT_ITEMS];
>  	int i;
> @@ -5267,26 +5290,26 @@ static int memory_stat_show(struct seq_file *m, void *v)
>  	tree_stat(memcg, stat);
>  	tree_events(memcg, events);
>  
> -	seq_printf(m, "anon %llu\n",
> -		   (u64)stat[MEMCG_RSS] * PAGE_SIZE);
> -	seq_printf(m, "file %llu\n",
> -		   (u64)stat[MEMCG_CACHE] * PAGE_SIZE);
> -	seq_printf(m, "kernel_stack %llu\n",
> -		   (u64)stat[MEMCG_KERNEL_STACK_KB] * 1024);
> -	seq_printf(m, "slab %llu\n",
> -		   (u64)(stat[NR_SLAB_RECLAIMABLE] +
> -			 stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
> -	seq_printf(m, "sock %llu\n",
> -		   (u64)stat[MEMCG_SOCK] * PAGE_SIZE);
> -
> -	seq_printf(m, "shmem %llu\n",
> -		   (u64)stat[NR_SHMEM] * PAGE_SIZE);
> -	seq_printf(m, "file_mapped %llu\n",
> -		   (u64)stat[NR_FILE_MAPPED] * PAGE_SIZE);
> -	seq_printf(m, "file_dirty %llu\n",
> -		   (u64)stat[NR_FILE_DIRTY] * PAGE_SIZE);
> -	seq_printf(m, "file_writeback %llu\n",
> -		   (u64)stat[NR_WRITEBACK] * PAGE_SIZE);
> +	seq_pr_info(m, "anon %llu\n",
> +		    (u64)stat[MEMCG_RSS] * PAGE_SIZE);
> +	seq_pr_info(m, "file %llu\n",
> +		    (u64)stat[MEMCG_CACHE] * PAGE_SIZE);
> +	seq_pr_info(m, "kernel_stack %llu\n",
> +		    (u64)stat[MEMCG_KERNEL_STACK_KB] * 1024);
> +	seq_pr_info(m, "slab %llu\n",
> +		    (u64)(stat[NR_SLAB_RECLAIMABLE] +
> +			  stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
> +	seq_pr_info(m, "sock %llu\n",
> +		    (u64)stat[MEMCG_SOCK] * PAGE_SIZE);
> +
> +	seq_pr_info(m, "shmem %llu\n",
> +		    (u64)stat[NR_SHMEM] * PAGE_SIZE);
> +	seq_pr_info(m, "file_mapped %llu\n",
> +		    (u64)stat[NR_FILE_MAPPED] * PAGE_SIZE);
> +	seq_pr_info(m, "file_dirty %llu\n",
> +		    (u64)stat[NR_FILE_DIRTY] * PAGE_SIZE);
> +	seq_pr_info(m, "file_writeback %llu\n",
> +		    (u64)stat[NR_WRITEBACK] * PAGE_SIZE);
>  
>  	for (i = 0; i < NR_LRU_LISTS; i++) {
>  		struct mem_cgroup *mi;
> @@ -5294,37 +5317,41 @@ static int memory_stat_show(struct seq_file *m, void *v)
>  
>  		for_each_mem_cgroup_tree(mi, memcg)
>  			val += mem_cgroup_nr_lru_pages(mi, BIT(i));
> -		seq_printf(m, "%s %llu\n",
> -			   mem_cgroup_lru_names[i], (u64)val * PAGE_SIZE);
> +		seq_pr_info(m, "%s %llu\n",
> +			    mem_cgroup_lru_names[i], (u64)val * PAGE_SIZE);
>  	}
>  
> -	seq_printf(m, "slab_reclaimable %llu\n",
> -		   (u64)stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE);
> -	seq_printf(m, "slab_unreclaimable %llu\n",
> -		   (u64)stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
> +	seq_pr_info(m, "slab_reclaimable %llu\n",
> +		    (u64)stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE);
> +	seq_pr_info(m, "slab_unreclaimable %llu\n",
> +		    (u64)stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
>  
>  	/* Accumulated memory events */
>  
> -	seq_printf(m, "pgfault %lu\n", events[PGFAULT]);
> -	seq_printf(m, "pgmajfault %lu\n", events[PGMAJFAULT]);
> -
> -	seq_printf(m, "pgrefill %lu\n", events[PGREFILL]);
> -	seq_printf(m, "pgscan %lu\n", events[PGSCAN_KSWAPD] +
> -		   events[PGSCAN_DIRECT]);
> -	seq_printf(m, "pgsteal %lu\n", events[PGSTEAL_KSWAPD] +
> -		   events[PGSTEAL_DIRECT]);
> -	seq_printf(m, "pgactivate %lu\n", events[PGACTIVATE]);
> -	seq_printf(m, "pgdeactivate %lu\n", events[PGDEACTIVATE]);
> -	seq_printf(m, "pglazyfree %lu\n", events[PGLAZYFREE]);
> -	seq_printf(m, "pglazyfreed %lu\n", events[PGLAZYFREED]);
> -
> -	seq_printf(m, "workingset_refault %lu\n",
> -		   stat[WORKINGSET_REFAULT]);
> -	seq_printf(m, "workingset_activate %lu\n",
> -		   stat[WORKINGSET_ACTIVATE]);
> -	seq_printf(m, "workingset_nodereclaim %lu\n",
> -		   stat[WORKINGSET_NODERECLAIM]);
> +	seq_pr_info(m, "pgfault %lu\n", events[PGFAULT]);
> +	seq_pr_info(m, "pgmajfault %lu\n", events[PGMAJFAULT]);
> +
> +	seq_pr_info(m, "pgrefill %lu\n", events[PGREFILL]);
> +	seq_pr_info(m, "pgscan %lu\n", events[PGSCAN_KSWAPD] +
> +		    events[PGSCAN_DIRECT]);
> +	seq_pr_info(m, "pgsteal %lu\n", events[PGSTEAL_KSWAPD] +
> +		    events[PGSTEAL_DIRECT]);
> +	seq_pr_info(m, "pgactivate %lu\n", events[PGACTIVATE]);
> +	seq_pr_info(m, "pgdeactivate %lu\n", events[PGDEACTIVATE]);
> +	seq_pr_info(m, "pglazyfree %lu\n", events[PGLAZYFREE]);
> +	seq_pr_info(m, "pglazyfreed %lu\n", events[PGLAZYFREED]);
>  
> +	seq_pr_info(m, "workingset_refault %lu\n",
> +		    stat[WORKINGSET_REFAULT]);
> +	seq_pr_info(m, "workingset_activate %lu\n",
> +		    stat[WORKINGSET_ACTIVATE]);
> +	seq_pr_info(m, "workingset_nodereclaim %lu\n",
> +		    stat[WORKINGSET_NODERECLAIM]);

I'm not sure we need all theses stats in the oom report.
I'd drop the events part.

Thanks!

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] mm: memcg: update memcg OOM messages on cgroup2
  2018-08-03 17:57 [PATCH] mm: memcg: update memcg OOM messages on cgroup2 Tejun Heo
  2018-08-03 20:30 ` Roman Gushchin
@ 2018-08-06 15:48 ` Johannes Weiner
  2018-08-06 16:15 ` [PATCH v2] " Tejun Heo
  2 siblings, 0 replies; 14+ messages in thread
From: Johannes Weiner @ 2018-08-06 15:48 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Andrew Morton, Roman Gushchin, Michal Hocko, Vladimir Davydov,
	cgroups, linux-mm, kernel-team

On Fri, Aug 03, 2018 at 10:57:43AM -0700, Tejun Heo wrote:
> mem_cgroup_print_oom_info() currently prints the same info for cgroup1
> and cgroup2 OOMs.  It doesn't make much sense on cgroup2, which
> doesn't use memsw or separate kmem accounting - the information
> reported is both superflous and insufficient.  This patch updates the
> memcg OOM messages on cgroup2 so that
> 
> * It prints memory and swap usages and limits used on cgroup2.
> 
> * It shows the same information as memory.stat.

That does look a lot more useful to me than the stuff we print now.

> I took out the recursive printing for cgroup2 because the amount of
> output could be a lot and the benefits aren't clear.

Agreed.

> What do you guys think?
> 
> Signed-off-by: Tejun Heo <tj@kernel.org>

Acked-by: Johannes Weiner <hannes@cmpxchg.org>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] mm: memcg: update memcg OOM messages on cgroup2
  2018-08-03 20:30 ` Roman Gushchin
@ 2018-08-06 15:55   ` Johannes Weiner
  0 siblings, 0 replies; 14+ messages in thread
From: Johannes Weiner @ 2018-08-06 15:55 UTC (permalink / raw)
  To: Roman Gushchin
  Cc: Tejun Heo, Andrew Morton, Michal Hocko, Vladimir Davydov,
	cgroups, linux-mm, kernel-team

On Fri, Aug 03, 2018 at 01:30:49PM -0700, Roman Gushchin wrote:
> On Fri, Aug 03, 2018 at 10:57:43AM -0700, Tejun Heo wrote:
> > +	seq_pr_info(m, "pgfault %lu\n", events[PGFAULT]);
> > +	seq_pr_info(m, "pgmajfault %lu\n", events[PGMAJFAULT]);
> > +
> > +	seq_pr_info(m, "pgrefill %lu\n", events[PGREFILL]);
> > +	seq_pr_info(m, "pgscan %lu\n", events[PGSCAN_KSWAPD] +
> > +		    events[PGSCAN_DIRECT]);
> > +	seq_pr_info(m, "pgsteal %lu\n", events[PGSTEAL_KSWAPD] +
> > +		    events[PGSTEAL_DIRECT]);
> > +	seq_pr_info(m, "pgactivate %lu\n", events[PGACTIVATE]);
> > +	seq_pr_info(m, "pgdeactivate %lu\n", events[PGDEACTIVATE]);
> > +	seq_pr_info(m, "pglazyfree %lu\n", events[PGLAZYFREE]);
> > +	seq_pr_info(m, "pglazyfreed %lu\n", events[PGLAZYFREED]);
> >  
> > +	seq_pr_info(m, "workingset_refault %lu\n",
> > +		    stat[WORKINGSET_REFAULT]);
> > +	seq_pr_info(m, "workingset_activate %lu\n",
> > +		    stat[WORKINGSET_ACTIVATE]);
> > +	seq_pr_info(m, "workingset_nodereclaim %lu\n",
> > +		    stat[WORKINGSET_NODERECLAIM]);
> 
> I'm not sure we need all theses stats in the oom report.
> I'd drop the events part.

This info dump usually races with ongoing reclaim and frees. The VM
state might have changed quite a bit by the time this is all written
out, which sometimes makes it hard to rootcause from just the state
snapshots. Knowing what the VM was doing before the OOM is helpful.

I'd prefer we keep those in.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH v2] mm: memcg: update memcg OOM messages on cgroup2
  2018-08-03 17:57 [PATCH] mm: memcg: update memcg OOM messages on cgroup2 Tejun Heo
  2018-08-03 20:30 ` Roman Gushchin
  2018-08-06 15:48 ` Johannes Weiner
@ 2018-08-06 16:15 ` Tejun Heo
  2018-08-06 16:34   ` Roman Gushchin
                     ` (2 more replies)
  2 siblings, 3 replies; 14+ messages in thread
From: Tejun Heo @ 2018-08-06 16:15 UTC (permalink / raw)
  To: Andrew Morton, Roman Gushchin, Johannes Weiner, Michal Hocko,
	Vladimir Davydov
  Cc: cgroups, linux-mm, kernel-team

mem_cgroup_print_oom_info() currently prints the same info for cgroup1
and cgroup2 OOMs.  It doesn't make much sense on cgroup2, which
doesn't use memsw or separate kmem accounting - the information
reported is both superflous and insufficient.  This patch updates the
memcg OOM messages on cgroup2 so that

* It prints memory and swap usages and limits used on cgroup2.

* It shows the same information as memory.stat.

I took out the recursive printing for cgroup2 because the amount of
output could be a lot and the benefits aren't clear.  An example dump
follows.

[   40.854197] stress invoked oom-killer: gfp_mask=0x6000c0(GFP_KERNEL), nodemask=(null), order=0, oo0
[   40.855239] stress cpuset=/ mems_allowed=0
[   40.855665] CPU: 6 PID: 1990 Comm: stress Not tainted 4.18.0-rc7-work+ #281
[   40.856260] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-2.el7 04/01/2014
[   40.857000] Call Trace:
[   40.857222]  dump_stack+0x5e/0x8b
[   40.857517]  dump_header+0x74/0x2fc
[   40.859106]  oom_kill_process+0x225/0x490
[   40.859449]  out_of_memory+0x111/0x530
[   40.859780]  mem_cgroup_out_of_memory+0x4b/0x80
[   40.860161]  mem_cgroup_oom_synchronize+0x3ff/0x450
[   40.861334]  pagefault_out_of_memory+0x2f/0x74
[   40.861718]  __do_page_fault+0x3de/0x460
[   40.862347]  page_fault+0x1e/0x30
[   40.862636] RIP: 0033:0x5566cd5aadd0
[   40.862940] Code: 0f 84 3c 02 00 00 8b 54 24 0c 31 c0 85 d2 0f 94 c0 89 04 24 41 83 fd 02 0f 8f f6
[   40.864558] RSP: 002b:00007ffd979ced40 EFLAGS: 00010206
[   40.865005] RAX: 0000000001f4f000 RBX: 00007f3a397d8010 RCX: 00007f3a397d8010
[   40.865615] RDX: 0000000000000000 RSI: 0000000004001000 RDI: 0000000000000000
[   40.866220] RBP: 00005566cd5abbb4 R08: 00000000ffffffff R09: 0000000000000000
[   40.866845] R10: 0000000000000022 R11: 0000000000000246 R12: ffffffffffffffff
[   40.867452] R13: 0000000000000002 R14: 0000000000001000 R15: 0000000004000000
[   40.868091] Task in /test-cgroup killed as a result of limit of /test-cgroup
[   40.868726] memory 33554432 (max 33554432)
[   40.869096] swap 0
[   40.869280] anon 32845824
[   40.869519] file 0
[   40.869730] kernel_stack 0
[   40.869966] slab 163840
[   40.870191] sock 0
[   40.870374] shmem 0
[   40.870566] file_mapped 0
[   40.870801] file_dirty 0
[   40.871039] file_writeback 0
[   40.871292] inactive_anon 0
[   40.871542] active_anon 32944128
[   40.871821] inactive_file 0
[   40.872077] active_file 0
[   40.872309] unevictable 0
[   40.872543] slab_reclaimable 0
[   40.872806] slab_unreclaimable 163840
[   40.873136] pgfault 8085
[   40.873358] pgmajfault 0
[   40.873589] pgrefill 0
[   40.873800] pgscan 0
[   40.873991] pgsteal 0
[   40.874202] pgactivate 0
[   40.874424] pgdeactivate 0
[   40.874663] pglazyfree 0
[   40.874881] pglazyfreed 0
[   40.875121] workingset_refault 0
[   40.875401] workingset_activate 0
[   40.875689] workingset_nodereclaim 0
[   40.875996] [ pid ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name
[   40.876789] [ 1969]     0  1969     5121      970    86016        0             0 bash
[   40.877546] [ 1989]     0  1989     1998      260    61440        0             0 stress
[   40.878256] [ 1990]     0  1990    18383     8055   126976        0             0 stress
[   40.878955] Memory cgroup out of memory: Kill process 1990 (stress) score 987 or sacrifice child
[   40.879803] Killed process 1990 (stress) total-vm:73532kB, anon-rss:32008kB, file-rss:212kB, shmemB

v2: Updated commit message to include an example dump as suggested by
    Roman.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
---
 mm/memcontrol.c |  165 ++++++++++++++++++++++++++++++++------------------------
 1 file changed, 96 insertions(+), 69 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8c0280b3143e..86133e50a0b2 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -177,6 +177,7 @@ struct mem_cgroup_event {
 
 static void mem_cgroup_threshold(struct mem_cgroup *memcg);
 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
+static void __memory_stat_show(struct seq_file *m, struct mem_cgroup *memcg);
 
 /* Stuffs for move charges at task migration. */
 /*
@@ -1146,33 +1147,49 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
 
 	rcu_read_unlock();
 
-	pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
-		K((u64)page_counter_read(&memcg->memory)),
-		K((u64)memcg->memory.max), memcg->memory.failcnt);
-	pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
-		K((u64)page_counter_read(&memcg->memsw)),
-		K((u64)memcg->memsw.max), memcg->memsw.failcnt);
-	pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
-		K((u64)page_counter_read(&memcg->kmem)),
-		K((u64)memcg->kmem.max), memcg->kmem.failcnt);
+	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
+		pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
+			K((u64)page_counter_read(&memcg->memory)),
+			K((u64)memcg->memory.max), memcg->memory.failcnt);
+		pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
+			K((u64)page_counter_read(&memcg->memsw)),
+			K((u64)memcg->memsw.max), memcg->memsw.failcnt);
+		pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
+			K((u64)page_counter_read(&memcg->kmem)),
+			K((u64)memcg->kmem.max), memcg->kmem.failcnt);
 
-	for_each_mem_cgroup_tree(iter, memcg) {
-		pr_info("Memory cgroup stats for ");
-		pr_cont_cgroup_path(iter->css.cgroup);
-		pr_cont(":");
+		for_each_mem_cgroup_tree(iter, memcg) {
+			pr_info("Memory cgroup stats for ");
+			pr_cont_cgroup_path(iter->css.cgroup);
+			pr_cont(":");
+
+			for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
+				if (memcg1_stats[i] == MEMCG_SWAP && !do_swap_account)
+					continue;
+				pr_cont(" %s:%luKB", memcg1_stat_names[i],
+					K(memcg_page_state(iter, memcg1_stats[i])));
+			}
 
-		for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
-			if (memcg1_stats[i] == MEMCG_SWAP && !do_swap_account)
-				continue;
-			pr_cont(" %s:%luKB", memcg1_stat_names[i],
-				K(memcg_page_state(iter, memcg1_stats[i])));
+			for (i = 0; i < NR_LRU_LISTS; i++)
+				pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
+					K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
+
+			pr_cont("\n");
 		}
+	} else {
+		pr_info("memory %llu (max %llu)\n",
+			(u64)page_counter_read(&memcg->memory) * PAGE_SIZE,
+			(u64)memcg->memory.max * PAGE_SIZE);
 
-		for (i = 0; i < NR_LRU_LISTS; i++)
-			pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
-				K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
+		if (memcg->swap.max == PAGE_COUNTER_MAX)
+			pr_info("swap %llu\n",
+				(u64)page_counter_read(&memcg->swap) * PAGE_SIZE);
+		else
+			pr_info("swap %llu (max %llu)\n",
+				(u64)page_counter_read(&memcg->swap) * PAGE_SIZE,
+				(u64)memcg->swap.max * PAGE_SIZE);
 
-		pr_cont("\n");
+		__memory_stat_show(NULL, memcg);
 	}
 }
 
@@ -5246,9 +5263,15 @@ static int memory_events_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int memory_stat_show(struct seq_file *m, void *v)
+#define seq_pr_info(m, fmt, ...) do {					\
+	if ((m))							\
+		seq_printf(m, fmt, ##__VA_ARGS__);			\
+	else								\
+		printk(KERN_INFO fmt, ##__VA_ARGS__);			\
+} while (0)
+
+static void __memory_stat_show(struct seq_file *m, struct mem_cgroup *memcg)
 {
-	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
 	unsigned long stat[MEMCG_NR_STAT];
 	unsigned long events[NR_VM_EVENT_ITEMS];
 	int i;
@@ -5267,26 +5290,26 @@ static int memory_stat_show(struct seq_file *m, void *v)
 	tree_stat(memcg, stat);
 	tree_events(memcg, events);
 
-	seq_printf(m, "anon %llu\n",
-		   (u64)stat[MEMCG_RSS] * PAGE_SIZE);
-	seq_printf(m, "file %llu\n",
-		   (u64)stat[MEMCG_CACHE] * PAGE_SIZE);
-	seq_printf(m, "kernel_stack %llu\n",
-		   (u64)stat[MEMCG_KERNEL_STACK_KB] * 1024);
-	seq_printf(m, "slab %llu\n",
-		   (u64)(stat[NR_SLAB_RECLAIMABLE] +
-			 stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
-	seq_printf(m, "sock %llu\n",
-		   (u64)stat[MEMCG_SOCK] * PAGE_SIZE);
-
-	seq_printf(m, "shmem %llu\n",
-		   (u64)stat[NR_SHMEM] * PAGE_SIZE);
-	seq_printf(m, "file_mapped %llu\n",
-		   (u64)stat[NR_FILE_MAPPED] * PAGE_SIZE);
-	seq_printf(m, "file_dirty %llu\n",
-		   (u64)stat[NR_FILE_DIRTY] * PAGE_SIZE);
-	seq_printf(m, "file_writeback %llu\n",
-		   (u64)stat[NR_WRITEBACK] * PAGE_SIZE);
+	seq_pr_info(m, "anon %llu\n",
+		    (u64)stat[MEMCG_RSS] * PAGE_SIZE);
+	seq_pr_info(m, "file %llu\n",
+		    (u64)stat[MEMCG_CACHE] * PAGE_SIZE);
+	seq_pr_info(m, "kernel_stack %llu\n",
+		    (u64)stat[MEMCG_KERNEL_STACK_KB] * 1024);
+	seq_pr_info(m, "slab %llu\n",
+		    (u64)(stat[NR_SLAB_RECLAIMABLE] +
+			  stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
+	seq_pr_info(m, "sock %llu\n",
+		    (u64)stat[MEMCG_SOCK] * PAGE_SIZE);
+
+	seq_pr_info(m, "shmem %llu\n",
+		    (u64)stat[NR_SHMEM] * PAGE_SIZE);
+	seq_pr_info(m, "file_mapped %llu\n",
+		    (u64)stat[NR_FILE_MAPPED] * PAGE_SIZE);
+	seq_pr_info(m, "file_dirty %llu\n",
+		    (u64)stat[NR_FILE_DIRTY] * PAGE_SIZE);
+	seq_pr_info(m, "file_writeback %llu\n",
+		    (u64)stat[NR_WRITEBACK] * PAGE_SIZE);
 
 	for (i = 0; i < NR_LRU_LISTS; i++) {
 		struct mem_cgroup *mi;
@@ -5294,37 +5317,41 @@ static int memory_stat_show(struct seq_file *m, void *v)
 
 		for_each_mem_cgroup_tree(mi, memcg)
 			val += mem_cgroup_nr_lru_pages(mi, BIT(i));
-		seq_printf(m, "%s %llu\n",
-			   mem_cgroup_lru_names[i], (u64)val * PAGE_SIZE);
+		seq_pr_info(m, "%s %llu\n",
+			    mem_cgroup_lru_names[i], (u64)val * PAGE_SIZE);
 	}
 
-	seq_printf(m, "slab_reclaimable %llu\n",
-		   (u64)stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE);
-	seq_printf(m, "slab_unreclaimable %llu\n",
-		   (u64)stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
+	seq_pr_info(m, "slab_reclaimable %llu\n",
+		    (u64)stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE);
+	seq_pr_info(m, "slab_unreclaimable %llu\n",
+		    (u64)stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
 
 	/* Accumulated memory events */
 
-	seq_printf(m, "pgfault %lu\n", events[PGFAULT]);
-	seq_printf(m, "pgmajfault %lu\n", events[PGMAJFAULT]);
-
-	seq_printf(m, "pgrefill %lu\n", events[PGREFILL]);
-	seq_printf(m, "pgscan %lu\n", events[PGSCAN_KSWAPD] +
-		   events[PGSCAN_DIRECT]);
-	seq_printf(m, "pgsteal %lu\n", events[PGSTEAL_KSWAPD] +
-		   events[PGSTEAL_DIRECT]);
-	seq_printf(m, "pgactivate %lu\n", events[PGACTIVATE]);
-	seq_printf(m, "pgdeactivate %lu\n", events[PGDEACTIVATE]);
-	seq_printf(m, "pglazyfree %lu\n", events[PGLAZYFREE]);
-	seq_printf(m, "pglazyfreed %lu\n", events[PGLAZYFREED]);
-
-	seq_printf(m, "workingset_refault %lu\n",
-		   stat[WORKINGSET_REFAULT]);
-	seq_printf(m, "workingset_activate %lu\n",
-		   stat[WORKINGSET_ACTIVATE]);
-	seq_printf(m, "workingset_nodereclaim %lu\n",
-		   stat[WORKINGSET_NODERECLAIM]);
+	seq_pr_info(m, "pgfault %lu\n", events[PGFAULT]);
+	seq_pr_info(m, "pgmajfault %lu\n", events[PGMAJFAULT]);
+
+	seq_pr_info(m, "pgrefill %lu\n", events[PGREFILL]);
+	seq_pr_info(m, "pgscan %lu\n", events[PGSCAN_KSWAPD] +
+		    events[PGSCAN_DIRECT]);
+	seq_pr_info(m, "pgsteal %lu\n", events[PGSTEAL_KSWAPD] +
+		    events[PGSTEAL_DIRECT]);
+	seq_pr_info(m, "pgactivate %lu\n", events[PGACTIVATE]);
+	seq_pr_info(m, "pgdeactivate %lu\n", events[PGDEACTIVATE]);
+	seq_pr_info(m, "pglazyfree %lu\n", events[PGLAZYFREE]);
+	seq_pr_info(m, "pglazyfreed %lu\n", events[PGLAZYFREED]);
 
+	seq_pr_info(m, "workingset_refault %lu\n",
+		    stat[WORKINGSET_REFAULT]);
+	seq_pr_info(m, "workingset_activate %lu\n",
+		    stat[WORKINGSET_ACTIVATE]);
+	seq_pr_info(m, "workingset_nodereclaim %lu\n",
+		    stat[WORKINGSET_NODERECLAIM]);
+}
+
+static int memory_stat_show(struct seq_file *m, void *v)
+{
+	__memory_stat_show(m, mem_cgroup_from_css(seq_css(m)));
 	return 0;
 }
 

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] mm: memcg: update memcg OOM messages on cgroup2
  2018-08-06 16:15 ` [PATCH v2] " Tejun Heo
@ 2018-08-06 16:34   ` Roman Gushchin
  2018-08-06 18:08   ` Andrew Morton
  2018-08-06 20:06   ` Michal Hocko
  2 siblings, 0 replies; 14+ messages in thread
From: Roman Gushchin @ 2018-08-06 16:34 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Andrew Morton, Johannes Weiner, Michal Hocko, Vladimir Davydov,
	cgroups, linux-mm, kernel-team

On Mon, Aug 06, 2018 at 09:15:29AM -0700, Tejun Heo wrote:
> mem_cgroup_print_oom_info() currently prints the same info for cgroup1
> and cgroup2 OOMs.  It doesn't make much sense on cgroup2, which
> doesn't use memsw or separate kmem accounting - the information
> reported is both superflous and insufficient.  This patch updates the
> memcg OOM messages on cgroup2 so that
> 
> * It prints memory and swap usages and limits used on cgroup2.
> 
> * It shows the same information as memory.stat.
> 
> I took out the recursive printing for cgroup2 because the amount of
> output could be a lot and the benefits aren't clear.  An example dump
> follows.
> 
> [   40.854197] stress invoked oom-killer: gfp_mask=0x6000c0(GFP_KERNEL), nodemask=(null), order=0, oo0
> [   40.855239] stress cpuset=/ mems_allowed=0
> [   40.855665] CPU: 6 PID: 1990 Comm: stress Not tainted 4.18.0-rc7-work+ #281
> [   40.856260] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-2.el7 04/01/2014
> [   40.857000] Call Trace:
> [   40.857222]  dump_stack+0x5e/0x8b
> [   40.857517]  dump_header+0x74/0x2fc
> [   40.859106]  oom_kill_process+0x225/0x490
> [   40.859449]  out_of_memory+0x111/0x530
> [   40.859780]  mem_cgroup_out_of_memory+0x4b/0x80
> [   40.860161]  mem_cgroup_oom_synchronize+0x3ff/0x450
> [   40.861334]  pagefault_out_of_memory+0x2f/0x74
> [   40.861718]  __do_page_fault+0x3de/0x460
> [   40.862347]  page_fault+0x1e/0x30
> [   40.862636] RIP: 0033:0x5566cd5aadd0
> [   40.862940] Code: 0f 84 3c 02 00 00 8b 54 24 0c 31 c0 85 d2 0f 94 c0 89 04 24 41 83 fd 02 0f 8f f6
> [   40.864558] RSP: 002b:00007ffd979ced40 EFLAGS: 00010206
> [   40.865005] RAX: 0000000001f4f000 RBX: 00007f3a397d8010 RCX: 00007f3a397d8010
> [   40.865615] RDX: 0000000000000000 RSI: 0000000004001000 RDI: 0000000000000000
> [   40.866220] RBP: 00005566cd5abbb4 R08: 00000000ffffffff R09: 0000000000000000
> [   40.866845] R10: 0000000000000022 R11: 0000000000000246 R12: ffffffffffffffff
> [   40.867452] R13: 0000000000000002 R14: 0000000000001000 R15: 0000000004000000
> [   40.868091] Task in /test-cgroup killed as a result of limit of /test-cgroup
> [   40.868726] memory 33554432 (max 33554432)
> [   40.869096] swap 0
> [   40.869280] anon 32845824
> [   40.869519] file 0
> [   40.869730] kernel_stack 0
> [   40.869966] slab 163840
> [   40.870191] sock 0
> [   40.870374] shmem 0
> [   40.870566] file_mapped 0
> [   40.870801] file_dirty 0
> [   40.871039] file_writeback 0
> [   40.871292] inactive_anon 0
> [   40.871542] active_anon 32944128
> [   40.871821] inactive_file 0
> [   40.872077] active_file 0
> [   40.872309] unevictable 0
> [   40.872543] slab_reclaimable 0
> [   40.872806] slab_unreclaimable 163840
> [   40.873136] pgfault 8085
> [   40.873358] pgmajfault 0
> [   40.873589] pgrefill 0
> [   40.873800] pgscan 0
> [   40.873991] pgsteal 0
> [   40.874202] pgactivate 0
> [   40.874424] pgdeactivate 0
> [   40.874663] pglazyfree 0
> [   40.874881] pglazyfreed 0
> [   40.875121] workingset_refault 0
> [   40.875401] workingset_activate 0
> [   40.875689] workingset_nodereclaim 0
> [   40.875996] [ pid ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name
> [   40.876789] [ 1969]     0  1969     5121      970    86016        0             0 bash
> [   40.877546] [ 1989]     0  1989     1998      260    61440        0             0 stress
> [   40.878256] [ 1990]     0  1990    18383     8055   126976        0             0 stress
> [   40.878955] Memory cgroup out of memory: Kill process 1990 (stress) score 987 or sacrifice child
> [   40.879803] Killed process 1990 (stress) total-vm:73532kB, anon-rss:32008kB, file-rss:212kB, shmemB
> 
> v2: Updated commit message to include an example dump as suggested by
>     Roman.

Looks good to me!
I'd still drop the mm events output, but I'm fine with it too.

Acked-by: Roman Gushchin <guro@fb.com>

Thank you!

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] mm: memcg: update memcg OOM messages on cgroup2
  2018-08-06 16:15 ` [PATCH v2] " Tejun Heo
  2018-08-06 16:34   ` Roman Gushchin
@ 2018-08-06 18:08   ` Andrew Morton
  2018-08-06 18:19     ` Tejun Heo
  2018-08-06 20:06   ` Michal Hocko
  2 siblings, 1 reply; 14+ messages in thread
From: Andrew Morton @ 2018-08-06 18:08 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Roman Gushchin, Johannes Weiner, Michal Hocko, Vladimir Davydov,
	cgroups, linux-mm, kernel-team

On Mon, 6 Aug 2018 09:15:29 -0700 Tejun Heo <tj@kernel.org> wrote:

> mem_cgroup_print_oom_info() currently prints the same info for cgroup1
> and cgroup2 OOMs.  It doesn't make much sense on cgroup2, which
> doesn't use memsw or separate kmem accounting - the information
> reported is both superflous and insufficient.  This patch updates the
> memcg OOM messages on cgroup2 so that
> 
> * It prints memory and swap usages and limits used on cgroup2.
> 
> * It shows the same information as memory.stat.
> 
> I took out the recursive printing for cgroup2 because the amount of
> output could be a lot and the benefits aren't clear.  An example dump
> follows.

This conflicts rather severely with Shakeel's "memcg: reduce memcg tree
traversals for stats collection".  Can we please park this until after
4.19-rc1?

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] mm: memcg: update memcg OOM messages on cgroup2
  2018-08-06 18:08   ` Andrew Morton
@ 2018-08-06 18:19     ` Tejun Heo
  2018-09-19 17:17       ` Johannes Weiner
  0 siblings, 1 reply; 14+ messages in thread
From: Tejun Heo @ 2018-08-06 18:19 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Roman Gushchin, Johannes Weiner, Michal Hocko, Vladimir Davydov,
	cgroups, linux-mm, kernel-team

On Mon, Aug 06, 2018 at 11:08:45AM -0700, Andrew Morton wrote:
> On Mon, 6 Aug 2018 09:15:29 -0700 Tejun Heo <tj@kernel.org> wrote:
> 
> > mem_cgroup_print_oom_info() currently prints the same info for cgroup1
> > and cgroup2 OOMs.  It doesn't make much sense on cgroup2, which
> > doesn't use memsw or separate kmem accounting - the information
> > reported is both superflous and insufficient.  This patch updates the
> > memcg OOM messages on cgroup2 so that
> > 
> > * It prints memory and swap usages and limits used on cgroup2.
> > 
> > * It shows the same information as memory.stat.
> > 
> > I took out the recursive printing for cgroup2 because the amount of
> > output could be a lot and the benefits aren't clear.  An example dump
> > follows.
> 
> This conflicts rather severely with Shakeel's "memcg: reduce memcg tree
> traversals for stats collection".  Can we please park this until after
> 4.19-rc1?

Sure, or I can refresh the patch on top of -mm too.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] mm: memcg: update memcg OOM messages on cgroup2
  2018-08-06 16:15 ` [PATCH v2] " Tejun Heo
  2018-08-06 16:34   ` Roman Gushchin
  2018-08-06 18:08   ` Andrew Morton
@ 2018-08-06 20:06   ` Michal Hocko
  2018-08-06 20:19     ` Tejun Heo
  2 siblings, 1 reply; 14+ messages in thread
From: Michal Hocko @ 2018-08-06 20:06 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Andrew Morton, Roman Gushchin, Johannes Weiner, Vladimir Davydov,
	cgroups, linux-mm, kernel-team

On Mon 06-08-18 09:15:29, Tejun Heo wrote:
> mem_cgroup_print_oom_info() currently prints the same info for cgroup1
> and cgroup2 OOMs.  It doesn't make much sense on cgroup2, which
> doesn't use memsw or separate kmem accounting - the information
> reported is both superflous and insufficient.  This patch updates the
> memcg OOM messages on cgroup2 so that
> 
> * It prints memory and swap usages and limits used on cgroup2.
> 
> * It shows the same information as memory.stat.
> 
> I took out the recursive printing for cgroup2 because the amount of
> output could be a lot and the benefits aren't clear.  An example dump
> follows.
> 
> [   40.854197] stress invoked oom-killer: gfp_mask=0x6000c0(GFP_KERNEL), nodemask=(null), order=0, oo0
> [   40.855239] stress cpuset=/ mems_allowed=0
> [   40.855665] CPU: 6 PID: 1990 Comm: stress Not tainted 4.18.0-rc7-work+ #281
> [   40.856260] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-2.el7 04/01/2014
> [   40.857000] Call Trace:
> [   40.857222]  dump_stack+0x5e/0x8b
> [   40.857517]  dump_header+0x74/0x2fc
> [   40.859106]  oom_kill_process+0x225/0x490
> [   40.859449]  out_of_memory+0x111/0x530
> [   40.859780]  mem_cgroup_out_of_memory+0x4b/0x80
> [   40.860161]  mem_cgroup_oom_synchronize+0x3ff/0x450
> [   40.861334]  pagefault_out_of_memory+0x2f/0x74
> [   40.861718]  __do_page_fault+0x3de/0x460
> [   40.862347]  page_fault+0x1e/0x30
> [   40.862636] RIP: 0033:0x5566cd5aadd0
> [   40.862940] Code: 0f 84 3c 02 00 00 8b 54 24 0c 31 c0 85 d2 0f 94 c0 89 04 24 41 83 fd 02 0f 8f f6
> [   40.864558] RSP: 002b:00007ffd979ced40 EFLAGS: 00010206
> [   40.865005] RAX: 0000000001f4f000 RBX: 00007f3a397d8010 RCX: 00007f3a397d8010
> [   40.865615] RDX: 0000000000000000 RSI: 0000000004001000 RDI: 0000000000000000
> [   40.866220] RBP: 00005566cd5abbb4 R08: 00000000ffffffff R09: 0000000000000000
> [   40.866845] R10: 0000000000000022 R11: 0000000000000246 R12: ffffffffffffffff
> [   40.867452] R13: 0000000000000002 R14: 0000000000001000 R15: 0000000004000000
> [   40.868091] Task in /test-cgroup killed as a result of limit of /test-cgroup
> [   40.868726] memory 33554432 (max 33554432)
> [   40.869096] swap 0
> [   40.869280] anon 32845824
> [   40.869519] file 0
> [   40.869730] kernel_stack 0
> [   40.869966] slab 163840
> [   40.870191] sock 0
> [   40.870374] shmem 0
> [   40.870566] file_mapped 0
> [   40.870801] file_dirty 0
> [   40.871039] file_writeback 0
> [   40.871292] inactive_anon 0
> [   40.871542] active_anon 32944128
> [   40.871821] inactive_file 0
> [   40.872077] active_file 0
> [   40.872309] unevictable 0
> [   40.872543] slab_reclaimable 0
> [   40.872806] slab_unreclaimable 163840
> [   40.873136] pgfault 8085
> [   40.873358] pgmajfault 0
> [   40.873589] pgrefill 0
> [   40.873800] pgscan 0
> [   40.873991] pgsteal 0
> [   40.874202] pgactivate 0
> [   40.874424] pgdeactivate 0
> [   40.874663] pglazyfree 0
> [   40.874881] pglazyfreed 0
> [   40.875121] workingset_refault 0
> [   40.875401] workingset_activate 0
> [   40.875689] workingset_nodereclaim 0

Is there really any reason to have each couner on a seprate line? This
is just too much of an output for a single oom report. I do get why you
are not really thrilled by the hierarchical numbers but can we keep
counters in a single line please?
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] mm: memcg: update memcg OOM messages on cgroup2
  2018-08-06 20:06   ` Michal Hocko
@ 2018-08-06 20:19     ` Tejun Heo
  2018-08-07  7:13       ` Michal Hocko
  0 siblings, 1 reply; 14+ messages in thread
From: Tejun Heo @ 2018-08-06 20:19 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Andrew Morton, Roman Gushchin, Johannes Weiner, Vladimir Davydov,
	cgroups, linux-mm, kernel-team

Hello, Michal.

On Mon, Aug 06, 2018 at 10:06:37PM +0200, Michal Hocko wrote:
> Is there really any reason to have each couner on a seprate line? This
> is just too much of an output for a single oom report. I do get why you
> are not really thrilled by the hierarchical numbers but can we keep
> counters in a single line please?

Hmm... maybe, but can you please consider the followings?

* It's the same information as memory.stat but would be in a different
  format and will likely be a bit of an eyeful.

* It can easily become a really long line.  Each kernel log can be ~1k
  in length and there can be other limits in the log pipeline
  (e.g. netcons).

* The information is already multi-line and cgroup oom kills don't
  take down the system, so there's no need to worry about scroll back
  that much.  Also, not printing recursive info means the output is
  well-bound.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] mm: memcg: update memcg OOM messages on cgroup2
  2018-08-06 20:19     ` Tejun Heo
@ 2018-08-07  7:13       ` Michal Hocko
  2018-08-07 15:09         ` Tejun Heo
  0 siblings, 1 reply; 14+ messages in thread
From: Michal Hocko @ 2018-08-07  7:13 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Andrew Morton, Roman Gushchin, Johannes Weiner, Vladimir Davydov,
	cgroups, linux-mm, kernel-team

On Mon 06-08-18 13:19:07, Tejun Heo wrote:
> Hello, Michal.
> 
> On Mon, Aug 06, 2018 at 10:06:37PM +0200, Michal Hocko wrote:
> > Is there really any reason to have each couner on a seprate line? This
> > is just too much of an output for a single oom report. I do get why you
> > are not really thrilled by the hierarchical numbers but can we keep
> > counters in a single line please?
> 
> Hmm... maybe, but can you please consider the followings?
> 
> * It's the same information as memory.stat but would be in a different
>   format and will likely be a bit of an eyeful.
>
> * It can easily become a really long line.  Each kernel log can be ~1k
>   in length and there can be other limits in the log pipeline
>   (e.g. netcons).

Are we getting close to those limits?

> * The information is already multi-line and cgroup oom kills don't
>   take down the system, so there's no need to worry about scroll back
>   that much.  Also, not printing recursive info means the output is
>   well-bound.

Well, on the other hand you can have a lot of memcgs under OOM and then
swamp the log a lot.
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] mm: memcg: update memcg OOM messages on cgroup2
  2018-08-07  7:13       ` Michal Hocko
@ 2018-08-07 15:09         ` Tejun Heo
  2018-08-07 17:54           ` Michal Hocko
  0 siblings, 1 reply; 14+ messages in thread
From: Tejun Heo @ 2018-08-07 15:09 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Andrew Morton, Roman Gushchin, Johannes Weiner, Vladimir Davydov,
	cgroups, linux-mm, kernel-team

On Tue, Aug 07, 2018 at 09:13:32AM +0200, Michal Hocko wrote:
> > * It's the same information as memory.stat but would be in a different
> >   format and will likely be a bit of an eyeful.
> >
> > * It can easily become a really long line.  Each kernel log can be ~1k
> >   in length and there can be other limits in the log pipeline
> >   (e.g. netcons).
> 
> Are we getting close to those limits?

Yeah, I think the stats we have can already go close to or over 500
bytes easily, which is already pushing the netcons udp packet size
limit.

> > * The information is already multi-line and cgroup oom kills don't
> >   take down the system, so there's no need to worry about scroll back
> >   that much.  Also, not printing recursive info means the output is
> >   well-bound.
> 
> Well, on the other hand you can have a lot of memcgs under OOM and then
> swamp the log a lot.

idk, the info dump is already multi-line.  If we have a lot of memcgs
under OOM, we're already kinda messed up (e.g. we can't tell which
line is for which oom).  This adds to that to a certain extent but not
by much.  In practice, this doesn't seem to be a signficant problem.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] mm: memcg: update memcg OOM messages on cgroup2
  2018-08-07 15:09         ` Tejun Heo
@ 2018-08-07 17:54           ` Michal Hocko
  0 siblings, 0 replies; 14+ messages in thread
From: Michal Hocko @ 2018-08-07 17:54 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Andrew Morton, Roman Gushchin, Johannes Weiner, Vladimir Davydov,
	cgroups, linux-mm, kernel-team

On Tue 07-08-18 08:09:44, Tejun Heo wrote:
> On Tue, Aug 07, 2018 at 09:13:32AM +0200, Michal Hocko wrote:
> > > * It's the same information as memory.stat but would be in a different
> > >   format and will likely be a bit of an eyeful.
> > >
> > > * It can easily become a really long line.  Each kernel log can be ~1k
> > >   in length and there can be other limits in the log pipeline
> > >   (e.g. netcons).
> > 
> > Are we getting close to those limits?
> 
> Yeah, I think the stats we have can already go close to or over 500
> bytes easily, which is already pushing the netcons udp packet size
> limit.
> 
> > > * The information is already multi-line and cgroup oom kills don't
> > >   take down the system, so there's no need to worry about scroll back
> > >   that much.  Also, not printing recursive info means the output is
> > >   well-bound.
> > 
> > Well, on the other hand you can have a lot of memcgs under OOM and then
> > swamp the log a lot.
> 
> idk, the info dump is already multi-line.  If we have a lot of memcgs
> under OOM, we're already kinda messed up (e.g. we can't tell which
> line is for which oom). 

Well, I am not really worried about interleaved oom reports because they
do use oom_lock so this shouldn't be a problem. I just meant to say that
a lot of memcg ooms will swamp the log and having more lines doesn't
really help.

That being said. I will not really push hard. If there is a general
consensus with this output I will not stand in the way. But I believe
that more compact oom report is both nicer and easier to read. At least
from my POV and I have processed countless number of those.
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] mm: memcg: update memcg OOM messages on cgroup2
  2018-08-06 18:19     ` Tejun Heo
@ 2018-09-19 17:17       ` Johannes Weiner
  0 siblings, 0 replies; 14+ messages in thread
From: Johannes Weiner @ 2018-09-19 17:17 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Andrew Morton, Roman Gushchin, Michal Hocko, Vladimir Davydov,
	cgroups, linux-mm, kernel-team

On Mon, Aug 06, 2018 at 11:19:26AM -0700, Tejun Heo wrote:
> On Mon, Aug 06, 2018 at 11:08:45AM -0700, Andrew Morton wrote:
> > On Mon, 6 Aug 2018 09:15:29 -0700 Tejun Heo <tj@kernel.org> wrote:
> > 
> > > mem_cgroup_print_oom_info() currently prints the same info for cgroup1
> > > and cgroup2 OOMs.  It doesn't make much sense on cgroup2, which
> > > doesn't use memsw or separate kmem accounting - the information
> > > reported is both superflous and insufficient.  This patch updates the
> > > memcg OOM messages on cgroup2 so that
> > > 
> > > * It prints memory and swap usages and limits used on cgroup2.
> > > 
> > > * It shows the same information as memory.stat.
> > > 
> > > I took out the recursive printing for cgroup2 because the amount of
> > > output could be a lot and the benefits aren't clear.  An example dump
> > > follows.
> > 
> > This conflicts rather severely with Shakeel's "memcg: reduce memcg tree
> > traversals for stats collection".  Can we please park this until after
> > 4.19-rc1?
> 
> Sure, or I can refresh the patch on top of -mm too.

Now that 4.19 is released, do you mind refreshing this for 4.20?

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2018-09-19 17:17 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-08-03 17:57 [PATCH] mm: memcg: update memcg OOM messages on cgroup2 Tejun Heo
2018-08-03 20:30 ` Roman Gushchin
2018-08-06 15:55   ` Johannes Weiner
2018-08-06 15:48 ` Johannes Weiner
2018-08-06 16:15 ` [PATCH v2] " Tejun Heo
2018-08-06 16:34   ` Roman Gushchin
2018-08-06 18:08   ` Andrew Morton
2018-08-06 18:19     ` Tejun Heo
2018-09-19 17:17       ` Johannes Weiner
2018-08-06 20:06   ` Michal Hocko
2018-08-06 20:19     ` Tejun Heo
2018-08-07  7:13       ` Michal Hocko
2018-08-07 15:09         ` Tejun Heo
2018-08-07 17:54           ` Michal Hocko

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.