linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
* Re: [nacked] mm-oom-avoid-printk-iteration-under-rcu.patch removed from -mm tree
       [not found] <20200131043324.wkArXTf6-%akpm@linux-foundation.org>
@ 2020-04-17 14:33 ` Tetsuo Handa
  2020-04-17 15:26   ` Michal Hocko
  0 siblings, 1 reply; 21+ messages in thread
From: Tetsuo Handa @ 2020-04-17 14:33 UTC (permalink / raw)
  To: akpm; +Cc: guro, mhocko, linux-mm, rientjes, shakeelb, Yafang Shao

On 2020/01/31 13:33, akpm@linux-foundation.org wrote:
> The patch titled
>      Subject: mm, oom: avoid printk() iteration under RCU
> has been removed from the -mm tree.  Its filename was
>      mm-oom-avoid-printk-iteration-under-rcu.patch
> 
> This patch was dropped because it was nacked

This patch was once nacked by Michal Hocko. But based on
https://lkml.kernel.org/r/CALOAHbArHa-QZ2hXnLOhEJ3Ti=C69-LPtjZB9zqPcuTKeHsV4g@mail.gmail.com
that dump_tasks() will become a real world problem with slow consoles (even if
printk() becomes asynchronous), I propose you to revive this patch as a first step for
implementing workable ratelimit for dump_tasks(). This patch can be applied as-is
on current linux.git and linux-next.git (only offset difference).

> 
> ------------------------------------------------------
> From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
> Subject: mm, oom: avoid printk() iteration under RCU
> 
> Currently dump_tasks() might call printk() for many thousands times under
> RCU, which might take many minutes for slow consoles.  Therefore, split
> dump_tasks() into three stages; take a snapshot of possible OOM victim
> candidates under RCU, dump the snapshot from reschedulable context, and
> destroy the snapshot.
> 
> In a future patch, the first stage would be moved to select_bad_process()
> and the third stage would be moved to after oom_kill_process(), and will
> simplify refcount handling.
> 
> [akpm@linux-foundation.org: make dump_tasks::list non-static]
> Link: http://lkml.kernel.org/r/1563360901-8277-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp
> Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
> Cc: Shakeel Butt <shakeelb@google.com>
> Cc: Michal Hocko <mhocko@suse.com>
> Cc: Roman Gushchin <guro@fb.com>
> Cc: David Rientjes <rientjes@google.com>
> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
> ---
> 
>  include/linux/sched.h |    1 
>  mm/oom_kill.c         |   67 +++++++++++++++++++---------------------
>  2 files changed, 34 insertions(+), 34 deletions(-)
> 
> --- a/include/linux/sched.h~mm-oom-avoid-printk-iteration-under-rcu
> +++ a/include/linux/sched.h
> @@ -1261,6 +1261,7 @@ struct task_struct {
>  #ifdef CONFIG_MMU
>  	struct task_struct		*oom_reaper_list;
>  #endif
> +	struct list_head		oom_victim_list;
>  #ifdef CONFIG_VMAP_STACK
>  	struct vm_struct		*stack_vm_area;
>  #endif
> --- a/mm/oom_kill.c~mm-oom-avoid-printk-iteration-under-rcu
> +++ a/mm/oom_kill.c
> @@ -377,36 +377,13 @@ static void select_bad_process(struct oo
>  	}
>  }
>  
> -static int dump_task(struct task_struct *p, void *arg)
> -{
> -	struct oom_control *oc = arg;
> -	struct task_struct *task;
> -
> -	if (oom_unkillable_task(p))
> -		return 0;
>  
> -	/* p may not have freeable memory in nodemask */
> -	if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc))
> -		return 0;
> -
> -	task = find_lock_task_mm(p);
> -	if (!task) {
> -		/*
> -		 * This is a kthread or all of p's threads have already
> -		 * detached their mm's.  There's no need to report
> -		 * them; they can't be oom killed anyway.
> -		 */
> -		return 0;
> +static int add_candidate_task(struct task_struct *p, void *arg)
> +{
> +	if (!oom_unkillable_task(p)) {
> +		get_task_struct(p);
> +		list_add_tail(&p->oom_victim_list, (struct list_head *) arg);
>  	}
> -
> -	pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu         %5hd %s\n",
> -		task->pid, from_kuid(&init_user_ns, task_uid(task)),
> -		task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
> -		mm_pgtables_bytes(task->mm),
> -		get_mm_counter(task->mm, MM_SWAPENTS),
> -		task->signal->oom_score_adj, task->comm);
> -	task_unlock(task);
> -
>  	return 0;
>  }
>  
> @@ -422,19 +399,41 @@ static int dump_task(struct task_struct
>   */
>  static void dump_tasks(struct oom_control *oc)
>  {
> -	pr_info("Tasks state (memory values in pages):\n");
> -	pr_info("[  pid  ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name\n");
> +	LIST_HEAD(list);
> +	struct task_struct *p;
> +	struct task_struct *t;
>  
>  	if (is_memcg_oom(oc))
> -		mem_cgroup_scan_tasks(oc->memcg, dump_task, oc);
> +		mem_cgroup_scan_tasks(oc->memcg, add_candidate_task, &list);
>  	else {
> -		struct task_struct *p;
> -
>  		rcu_read_lock();
>  		for_each_process(p)
> -			dump_task(p, oc);
> +			add_candidate_task(p, &list);
>  		rcu_read_unlock();
>  	}
> +	pr_info("Tasks state (memory values in pages):\n");
> +	pr_info("[  pid  ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name\n");
> +	list_for_each_entry(p, &list, oom_victim_list) {
> +		cond_resched();
> +		/* p may not have freeable memory in nodemask */
> +		if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc))
> +			continue;
> +		/* All of p's threads might have already detached their mm's. */
> +		t = find_lock_task_mm(p);
> +		if (!t)
> +			continue;
> +		pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu         %5hd %s\n",
> +			t->pid, from_kuid(&init_user_ns, task_uid(t)),
> +			t->tgid, t->mm->total_vm, get_mm_rss(t->mm),
> +			mm_pgtables_bytes(t->mm),
> +			get_mm_counter(t->mm, MM_SWAPENTS),
> +			t->signal->oom_score_adj, t->comm);
> +		task_unlock(t);
> +	}
> +	list_for_each_entry_safe(p, t, &list, oom_victim_list) {
> +		list_del(&p->oom_victim_list);
> +		put_task_struct(p);
> +	}
>  }
>  
>  static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim)
> _
> 
> Patches currently in -mm which might be from penguin-kernel@I-love.SAKURA.ne.jp are
> 
> 



^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [nacked] mm-oom-avoid-printk-iteration-under-rcu.patch removed from -mm tree
  2020-04-17 14:33 ` [nacked] mm-oom-avoid-printk-iteration-under-rcu.patch removed from -mm tree Tetsuo Handa
@ 2020-04-17 15:26   ` Michal Hocko
  2020-04-18 10:13     ` Tetsuo Handa
  0 siblings, 1 reply; 21+ messages in thread
From: Michal Hocko @ 2020-04-17 15:26 UTC (permalink / raw)
  To: Tetsuo Handa; +Cc: akpm, guro, linux-mm, rientjes, shakeelb, Yafang Shao

On Fri 17-04-20 23:33:41, Tetsuo Handa wrote:
> On 2020/01/31 13:33, akpm@linux-foundation.org wrote:
> > The patch titled
> >      Subject: mm, oom: avoid printk() iteration under RCU
> > has been removed from the -mm tree.  Its filename was
> >      mm-oom-avoid-printk-iteration-under-rcu.patch
> > 
> > This patch was dropped because it was nacked
> 
> This patch was once nacked by Michal Hocko. But based on
> https://lkml.kernel.org/r/CALOAHbArHa-QZ2hXnLOhEJ3Ti=C69-LPtjZB9zqPcuTKeHsV4g@mail.gmail.com
> that dump_tasks() will become a real world problem with slow consoles (even if
> printk() becomes asynchronous), I propose you to revive this patch as a first step for
> implementing workable ratelimit for dump_tasks(). This patch can be applied as-is
> on current linux.git and linux-next.git (only offset difference).

My nack still applies. Slow consoles are really a pain and they are
going to be addressed by the upcoming printk work from a large part.
Your patch doesn't address the problem it merely shift it around while
making code even more obscure and harder to maintain.

For the specific issue that you are pointing out, a simple and working
workaround is to reduce the loglevel or disable dump_tasks. There is no
real reason to add ugly kluges IMHO.

> > ------------------------------------------------------
> > From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
> > Subject: mm, oom: avoid printk() iteration under RCU
> > 
> > Currently dump_tasks() might call printk() for many thousands times under
> > RCU, which might take many minutes for slow consoles.  Therefore, split
> > dump_tasks() into three stages; take a snapshot of possible OOM victim
> > candidates under RCU, dump the snapshot from reschedulable context, and
> > destroy the snapshot.
> > 
> > In a future patch, the first stage would be moved to select_bad_process()
> > and the third stage would be moved to after oom_kill_process(), and will
> > simplify refcount handling.
> > 
> > [akpm@linux-foundation.org: make dump_tasks::list non-static]
> > Link: http://lkml.kernel.org/r/1563360901-8277-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp
> > Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
> > Cc: Shakeel Butt <shakeelb@google.com>
> > Cc: Michal Hocko <mhocko@suse.com>
> > Cc: Roman Gushchin <guro@fb.com>
> > Cc: David Rientjes <rientjes@google.com>
> > Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
> > ---
> > 
> >  include/linux/sched.h |    1 
> >  mm/oom_kill.c         |   67 +++++++++++++++++++---------------------
> >  2 files changed, 34 insertions(+), 34 deletions(-)
> > 
> > --- a/include/linux/sched.h~mm-oom-avoid-printk-iteration-under-rcu
> > +++ a/include/linux/sched.h
> > @@ -1261,6 +1261,7 @@ struct task_struct {
> >  #ifdef CONFIG_MMU
> >  	struct task_struct		*oom_reaper_list;
> >  #endif
> > +	struct list_head		oom_victim_list;
> >  #ifdef CONFIG_VMAP_STACK
> >  	struct vm_struct		*stack_vm_area;
> >  #endif
> > --- a/mm/oom_kill.c~mm-oom-avoid-printk-iteration-under-rcu
> > +++ a/mm/oom_kill.c
> > @@ -377,36 +377,13 @@ static void select_bad_process(struct oo
> >  	}
> >  }
> >  
> > -static int dump_task(struct task_struct *p, void *arg)
> > -{
> > -	struct oom_control *oc = arg;
> > -	struct task_struct *task;
> > -
> > -	if (oom_unkillable_task(p))
> > -		return 0;
> >  
> > -	/* p may not have freeable memory in nodemask */
> > -	if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc))
> > -		return 0;
> > -
> > -	task = find_lock_task_mm(p);
> > -	if (!task) {
> > -		/*
> > -		 * This is a kthread or all of p's threads have already
> > -		 * detached their mm's.  There's no need to report
> > -		 * them; they can't be oom killed anyway.
> > -		 */
> > -		return 0;
> > +static int add_candidate_task(struct task_struct *p, void *arg)
> > +{
> > +	if (!oom_unkillable_task(p)) {
> > +		get_task_struct(p);
> > +		list_add_tail(&p->oom_victim_list, (struct list_head *) arg);
> >  	}
> > -
> > -	pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu         %5hd %s\n",
> > -		task->pid, from_kuid(&init_user_ns, task_uid(task)),
> > -		task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
> > -		mm_pgtables_bytes(task->mm),
> > -		get_mm_counter(task->mm, MM_SWAPENTS),
> > -		task->signal->oom_score_adj, task->comm);
> > -	task_unlock(task);
> > -
> >  	return 0;
> >  }
> >  
> > @@ -422,19 +399,41 @@ static int dump_task(struct task_struct
> >   */
> >  static void dump_tasks(struct oom_control *oc)
> >  {
> > -	pr_info("Tasks state (memory values in pages):\n");
> > -	pr_info("[  pid  ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name\n");
> > +	LIST_HEAD(list);
> > +	struct task_struct *p;
> > +	struct task_struct *t;
> >  
> >  	if (is_memcg_oom(oc))
> > -		mem_cgroup_scan_tasks(oc->memcg, dump_task, oc);
> > +		mem_cgroup_scan_tasks(oc->memcg, add_candidate_task, &list);
> >  	else {
> > -		struct task_struct *p;
> > -
> >  		rcu_read_lock();
> >  		for_each_process(p)
> > -			dump_task(p, oc);
> > +			add_candidate_task(p, &list);
> >  		rcu_read_unlock();
> >  	}
> > +	pr_info("Tasks state (memory values in pages):\n");
> > +	pr_info("[  pid  ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name\n");
> > +	list_for_each_entry(p, &list, oom_victim_list) {
> > +		cond_resched();
> > +		/* p may not have freeable memory in nodemask */
> > +		if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc))
> > +			continue;
> > +		/* All of p's threads might have already detached their mm's. */
> > +		t = find_lock_task_mm(p);
> > +		if (!t)
> > +			continue;
> > +		pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu         %5hd %s\n",
> > +			t->pid, from_kuid(&init_user_ns, task_uid(t)),
> > +			t->tgid, t->mm->total_vm, get_mm_rss(t->mm),
> > +			mm_pgtables_bytes(t->mm),
> > +			get_mm_counter(t->mm, MM_SWAPENTS),
> > +			t->signal->oom_score_adj, t->comm);
> > +		task_unlock(t);
> > +	}
> > +	list_for_each_entry_safe(p, t, &list, oom_victim_list) {
> > +		list_del(&p->oom_victim_list);
> > +		put_task_struct(p);
> > +	}
> >  }
> >  
> >  static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim)
> > _
> > 
> > Patches currently in -mm which might be from penguin-kernel@I-love.SAKURA.ne.jp are
> > 
> > 
> 

-- 
Michal Hocko
SUSE Labs


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [nacked] mm-oom-avoid-printk-iteration-under-rcu.patch removed from -mm tree
  2020-04-17 15:26   ` Michal Hocko
@ 2020-04-18 10:13     ` Tetsuo Handa
  2020-04-20  7:33       ` Michal Hocko
  0 siblings, 1 reply; 21+ messages in thread
From: Tetsuo Handa @ 2020-04-18 10:13 UTC (permalink / raw)
  To: Michal Hocko, akpm; +Cc: guro, linux-mm, rientjes, shakeelb, Yafang Shao

On 2020/04/18 0:26, Michal Hocko wrote:
> On Fri 17-04-20 23:33:41, Tetsuo Handa wrote:
>> On 2020/01/31 13:33, akpm@linux-foundation.org wrote:
>>> The patch titled
>>>      Subject: mm, oom: avoid printk() iteration under RCU
>>> has been removed from the -mm tree.  Its filename was
>>>      mm-oom-avoid-printk-iteration-under-rcu.patch
>>>
>>> This patch was dropped because it was nacked
>>
>> This patch was once nacked by Michal Hocko. But based on
>> https://lkml.kernel.org/r/CALOAHbArHa-QZ2hXnLOhEJ3Ti=C69-LPtjZB9zqPcuTKeHsV4g@mail.gmail.com
>> that dump_tasks() will become a real world problem with slow consoles (even if
>> printk() becomes asynchronous), I propose you to revive this patch as a first step for
>> implementing workable ratelimit for dump_tasks(). This patch can be applied as-is
>> on current linux.git and linux-next.git (only offset difference).
> 
> My nack still applies. Slow consoles are really a pain and they are
> going to be addressed by the upcoming printk work from a large part.
> Your patch doesn't address the problem it merely shift it around while
> making code even more obscure and harder to maintain.

Your nack is wrong. The upcoming printk work will not address the problem.

> 
> For the specific issue that you are pointing out, a simple and working
> workaround is to reduce the loglevel or disable dump_tasks. There is no
> real reason to add ugly kluges IMHO.

I do need dump_tasks() output in order to understand memory usage as of
invocation of the OOM killer (rather than verifying whether the OOM killer
chose appropriate OOM victim). I really do not want to reduce the loglevel
or disable dump_tasks().

You are misunderstanding the problem. Asynchronous printk() cannot solve a problem
that printk() buffer is needlessly flooded with OOM related messages, which can
result in loss of non OOM-related messages if console is slow and can result in
bloating of log files if console is not slow. My proposal is to minimize duplicated
OOM-related messages by keeping dump_task() synchronous (even if printk() becomes
asynchronous) by offloading the printing to a workqueue context. This patch is the
first step for making it possible to offload the printing to a workqueue context.
Waiting for printk() to become asynchronous is completely irrelevant.


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [nacked] mm-oom-avoid-printk-iteration-under-rcu.patch removed from -mm tree
  2020-04-18 10:13     ` Tetsuo Handa
@ 2020-04-20  7:33       ` Michal Hocko
  2020-04-22  6:40         ` Tetsuo Handa
  0 siblings, 1 reply; 21+ messages in thread
From: Michal Hocko @ 2020-04-20  7:33 UTC (permalink / raw)
  To: Tetsuo Handa; +Cc: akpm, guro, linux-mm, rientjes, shakeelb, Yafang Shao

On Sat 18-04-20 19:13:57, Tetsuo Handa wrote:
[...]
> > For the specific issue that you are pointing out, a simple and working
> > workaround is to reduce the loglevel or disable dump_tasks. There is no
> > real reason to add ugly kluges IMHO.
> 
> I do need dump_tasks() output in order to understand memory usage as of
> invocation of the OOM killer (rather than verifying whether the OOM killer
> chose appropriate OOM victim). I really do not want to reduce the loglevel
> or disable dump_tasks().
> 
> You are misunderstanding the problem. Asynchronous printk() cannot solve a problem
> that printk() buffer is needlessly flooded with OOM related messages, which can
> result in loss of non OOM-related messages if console is slow and can result in
> bloating of log files if console is not slow.

I do agree with this statement. And that is the _primamry_ point why I
believe your patch doesn't solve _anything_. Why? Because it doesn't
reduce the amount of the output. You merely shift it to a different
context which adds complexity as I've mentioned already. The only thing
you really "fix" is that the potentially long taking printk is not done
from the locked oom context. This is what the async printk will address
AFAIU.

That being said, this is not the first time we are in this discussion
and I find repeating the same thing unproductive.
-- 
Michal Hocko
SUSE Labs


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [nacked] mm-oom-avoid-printk-iteration-under-rcu.patch removed from -mm tree
  2020-04-20  7:33       ` Michal Hocko
@ 2020-04-22  6:40         ` Tetsuo Handa
  2020-04-22  6:59           ` Michal Hocko
  0 siblings, 1 reply; 21+ messages in thread
From: Tetsuo Handa @ 2020-04-22  6:40 UTC (permalink / raw)
  To: Michal Hocko; +Cc: akpm, guro, linux-mm, rientjes, shakeelb, Yafang Shao

On 2020/04/20 16:33, Michal Hocko wrote:
> On Sat 18-04-20 19:13:57, Tetsuo Handa wrote:
> [...]
>>> For the specific issue that you are pointing out, a simple and working
>>> workaround is to reduce the loglevel or disable dump_tasks. There is no
>>> real reason to add ugly kluges IMHO.
>>
>> I do need dump_tasks() output in order to understand memory usage as of
>> invocation of the OOM killer (rather than verifying whether the OOM killer
>> chose appropriate OOM victim). I really do not want to reduce the loglevel
>> or disable dump_tasks().
>>
>> You are misunderstanding the problem. Asynchronous printk() cannot solve a problem
>> that printk() buffer is needlessly flooded with OOM related messages, which can
>> result in loss of non OOM-related messages if console is slow and can result in
>> bloating of log files if console is not slow.
> 
> I do agree with this statement. And that is the _primamry_ point why I
> believe your patch doesn't solve _anything_.

This patch does avoid RCU stall problem.

>                                              Why? Because it doesn't
> reduce the amount of the output. You merely shift it to a different
> context which adds complexity as I've mentioned already. The only thing
> you really "fix" is that the potentially long taking printk is not done
> from the locked oom context.

"The potentially long taking printk is not done from the locked oom context"
is an improvement (which we can do without waiting for async printk changes).

>                              This is what the async printk will address
> AFAIU.

No! You are completely wrong!! Async printk CANNOT reduce the amount of the
output. Rather, async printk might increase the amount of the output (because
it allows printk() users to think as if printk() is so fast).  Shifting to a
deferrable context (like a workqueue) allows reducing the amount of the output.
To shift to a deferrable context, this patch (which maintains state of
not-yet-reported OOM victim candidates) is the first step.

> 
> That being said, this is not the first time we are in this discussion
> and I find repeating the same thing unproductive.

Your repeating refusal is really unproductive.


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [nacked] mm-oom-avoid-printk-iteration-under-rcu.patch removed from -mm tree
  2020-04-22  6:40         ` Tetsuo Handa
@ 2020-04-22  6:59           ` Michal Hocko
  2020-04-23  5:34             ` Tetsuo Handa
  0 siblings, 1 reply; 21+ messages in thread
From: Michal Hocko @ 2020-04-22  6:59 UTC (permalink / raw)
  To: Tetsuo Handa; +Cc: akpm, guro, linux-mm, rientjes, shakeelb, Yafang Shao

On Wed 22-04-20 15:40:19, Tetsuo Handa wrote:
> On 2020/04/20 16:33, Michal Hocko wrote:
> > On Sat 18-04-20 19:13:57, Tetsuo Handa wrote:
> > [...]
> >>> For the specific issue that you are pointing out, a simple and working
> >>> workaround is to reduce the loglevel or disable dump_tasks. There is no
> >>> real reason to add ugly kluges IMHO.
> >>
> >> I do need dump_tasks() output in order to understand memory usage as of
> >> invocation of the OOM killer (rather than verifying whether the OOM killer
> >> chose appropriate OOM victim). I really do not want to reduce the loglevel
> >> or disable dump_tasks().
> >>
> >> You are misunderstanding the problem. Asynchronous printk() cannot solve a problem
> >> that printk() buffer is needlessly flooded with OOM related messages, which can
> >> result in loss of non OOM-related messages if console is slow and can result in
> >> bloating of log files if console is not slow.
> > 
> > I do agree with this statement. And that is the _primamry_ point why I
> > believe your patch doesn't solve _anything_.
> 
> This patch does avoid RCU stall problem.

Which has been observed only under artificial oom hammering which
doesn't resemble any real life workload AFAIR. So while I agree that RCU
stall is annoying it is not worth addressing with the invasive change
you are proposing. This is a simple matter of cost vs. benefit
evaluation.

> >                                              Why? Because it doesn't
> > reduce the amount of the output. You merely shift it to a different
> > context which adds complexity as I've mentioned already. The only thing
> > you really "fix" is that the potentially long taking printk is not done
> > from the locked oom context.
> 
> "The potentially long taking printk is not done from the locked oom context"
> is an improvement (which we can do without waiting for async printk changes).
> 
> >                              This is what the async printk will address
> > AFAIU.
> 
> No! You are completely wrong!! Async printk CANNOT reduce the amount of the
> output.

Which is exactly what I claim above. Async printk would, however, deal
with the problem of the locked context part of the problem because
the heavy lifting is not done from the caller context. Please read my
response carefully.

> Rather, async printk might increase the amount of the output (because
> it allows printk() users to think as if printk() is so fast).

You make no sense.

This is just waste of time (again). I am done with this thread.
-- 
Michal Hocko
SUSE Labs


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [nacked] mm-oom-avoid-printk-iteration-under-rcu.patch removed from -mm tree
  2020-04-22  6:59           ` Michal Hocko
@ 2020-04-23  5:34             ` Tetsuo Handa
  2020-04-23  6:35               ` Yafang Shao
  0 siblings, 1 reply; 21+ messages in thread
From: Tetsuo Handa @ 2020-04-23  5:34 UTC (permalink / raw)
  To: Michal Hocko; +Cc: akpm, guro, linux-mm, rientjes, shakeelb, Yafang Shao

On 2020/04/22 15:59, Michal Hocko wrote:
>>>                              This is what the async printk will address
>>> AFAIU.
>>
>> No! You are completely wrong!! Async printk CANNOT reduce the amount of the
>> output.
> 
> Which is exactly what I claim above. Async printk would, however, deal
> with the problem of the locked context part of the problem because
> the heavy lifting is not done from the caller context. Please read my
> response carefully.

No! My proposal (which offloads to a deferrable context) avoids doing the
heavy lifting (for printk buffer users) from the printk caller context. I'm
saying that "don't pile up too much backlog onto the printk buffer (by abusing
async printk) in order to make sure that kernel messages which are more
important than reporting OOM victim candidates will be processed immediately".

dump_tasks() remains definitely a printk() abuser which is capable of pushing
many thousands of printk() messages in one second if async printk were available.
Async printk CANNOT deal with the problem that too much backlog causes important
messages to be delayed for too long. Please read my explanation carefully.

Also, Sergey Senozhatsky (printk maintainer) said at
https://lkml.kernel.org/r/20200423015008.GA246741@jagdpanzerIV.localdomain
that it is UNKNOWN when async printk is merged. Async printk is not a
silver-bullet breakthrough. Async printk cannot work without cooperation from
printk() users. Please really stop letting the printk() do the heavy lifting.



^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [nacked] mm-oom-avoid-printk-iteration-under-rcu.patch removed from -mm tree
  2020-04-23  5:34             ` Tetsuo Handa
@ 2020-04-23  6:35               ` Yafang Shao
  2020-04-23  7:34                 ` Michal Hocko
  0 siblings, 1 reply; 21+ messages in thread
From: Yafang Shao @ 2020-04-23  6:35 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Michal Hocko, Andrew Morton, Roman Gushchin, linux-mm,
	David Rientjes, Shakeel Butt

On Thu, Apr 23, 2020 at 1:35 PM Tetsuo Handa
<penguin-kernel@i-love.sakura.ne.jp> wrote:
>
> On 2020/04/22 15:59, Michal Hocko wrote:
> >>>                              This is what the async printk will address
> >>> AFAIU.
> >>
> >> No! You are completely wrong!! Async printk CANNOT reduce the amount of the
> >> output.
> >
> > Which is exactly what I claim above. Async printk would, however, deal
> > with the problem of the locked context part of the problem because
> > the heavy lifting is not done from the caller context. Please read my
> > response carefully.
>
> No! My proposal (which offloads to a deferrable context) avoids doing the
> heavy lifting (for printk buffer users) from the printk caller context. I'm
> saying that "don't pile up too much backlog onto the printk buffer (by abusing
> async printk) in order to make sure that kernel messages which are more
> important than reporting OOM victim candidates will be processed immediately".
>
> dump_tasks() remains definitely a printk() abuser which is capable of pushing
> many thousands of printk() messages in one second if async printk were available.
> Async printk CANNOT deal with the problem that too much backlog causes important
> messages to be delayed for too long. Please read my explanation carefully.
>

Agreed. Too much oom reports still be a issue even if the printk() is asyn.
I think the aysnc printk() won't care about wheter the data is
improtant or not, so the user of printk() (even if it is asyn) should
have a good management of these data especially if these data may
consume all or most of the printk buffer.

> Also, Sergey Senozhatsky (printk maintainer) said at
> https://lkml.kernel.org/r/20200423015008.GA246741@jagdpanzerIV.localdomain
> that it is UNKNOWN when async printk is merged. Async printk is not a
> silver-bullet breakthrough. Async printk cannot work without cooperation from
> printk() users. Please really stop letting the printk() do the heavy lifting.
>


-- 
Thanks
Yafang


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [nacked] mm-oom-avoid-printk-iteration-under-rcu.patch removed from -mm tree
  2020-04-23  6:35               ` Yafang Shao
@ 2020-04-23  7:34                 ` Michal Hocko
  2020-04-23 10:22                   ` Yafang Shao
  0 siblings, 1 reply; 21+ messages in thread
From: Michal Hocko @ 2020-04-23  7:34 UTC (permalink / raw)
  To: Yafang Shao
  Cc: Tetsuo Handa, Andrew Morton, Roman Gushchin, linux-mm,
	David Rientjes, Shakeel Butt

On Thu 23-04-20 14:35:22, Yafang Shao wrote:
> On Thu, Apr 23, 2020 at 1:35 PM Tetsuo Handa
[...]
> > dump_tasks() remains definitely a printk() abuser which is capable of pushing
> > many thousands of printk() messages in one second if async printk were available.
> > Async printk CANNOT deal with the problem that too much backlog causes important
> > messages to be delayed for too long. Please read my explanation carefully.
> >
> 
> Agreed. Too much oom reports still be a issue even if the printk() is asyn.

I believe nobody is disputing this part. We are talking about two things
here and I believe that contributes to a confusion considerably
1) dump_tasks being a large noise generator to the kernel log buffer
2) a heavy printk load from the oom context

There is no good answer for 1). We simply print a lot of data that
scales with the number eligible tasks and that might be thousands.
We have done quite a lot of work to make the data collecting part of the
process as optimal as possible but having this feature enabled by
default is simply a package we have to carry with us. printk doesn't
cope with such a load really great currently. There might be some future
changes but the underline is that no matter how printk gets optimized
there is still the payload to be printed. No matter this happens
transparently async or explicitly done in a detached context.

2) is about the sync nature of the printk _right_now_ and that causes
delays in the allocator context while the system is OOM. There are locks
held both by the OOM context and in the call chain to the allocator
potentially. The longer the oom context is going to take the longer is
the agony going to take. Here is where the async printing might help
because it would push out the heavy lifting to a different context.

There is a clear agreement in this part.  The whole discussion in this
thread is about how to achieve that. There are two ways. Develop a code
to do that for this very specific case (aka push out to a worker) or
rely on printk doing that for us and potentially many other places in a
similar situation. I am definitely for the later option because a) it
adds less code we have to maintain and b) it is a more generic solution.

For the current or older kernels there are two ways to workaround for
the problem and floods of oom killer events doesn't seem to be be a
regular production system state (I would even dare to claim that
something is terribly wrong if yes) so no quick&dirty hacks are due.
Either tune the log level or simply disable dump_tasks. It is an useful
tool in some cases but not really necessary in the vast majority of
cases.

> I think the aysnc printk() won't care about wheter the data is
> improtant or not, so the user of printk() (even if it is asyn) should
> have a good management of these data especially if these data may
> consume all or most of the printk buffer.

Not sure what you mean here. We do have an option to tune the ring
buffer (both size and log levels) and dump_tasks specifically.

-- 
Michal Hocko
SUSE Labs


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [nacked] mm-oom-avoid-printk-iteration-under-rcu.patch removed from -mm tree
  2020-04-23  7:34                 ` Michal Hocko
@ 2020-04-23 10:22                   ` Yafang Shao
  2020-04-23 11:07                     ` Michal Hocko
  0 siblings, 1 reply; 21+ messages in thread
From: Yafang Shao @ 2020-04-23 10:22 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Tetsuo Handa, Andrew Morton, Roman Gushchin, linux-mm,
	David Rientjes, Shakeel Butt

On Thu, Apr 23, 2020 at 3:34 PM Michal Hocko <mhocko@suse.com> wrote:
>
> On Thu 23-04-20 14:35:22, Yafang Shao wrote:
> > On Thu, Apr 23, 2020 at 1:35 PM Tetsuo Handa
> [...]
> > > dump_tasks() remains definitely a printk() abuser which is capable of pushing
> > > many thousands of printk() messages in one second if async printk were available.
> > > Async printk CANNOT deal with the problem that too much backlog causes important
> > > messages to be delayed for too long. Please read my explanation carefully.
> > >
> >
> > Agreed. Too much oom reports still be a issue even if the printk() is asyn.
>
> I believe nobody is disputing this part. We are talking about two things
> here and I believe that contributes to a confusion considerably
> 1) dump_tasks being a large noise generator to the kernel log buffer
> 2) a heavy printk load from the oom context
>
> There is no good answer for 1). We simply print a lot of data that
> scales with the number eligible tasks and that might be thousands.
> We have done quite a lot of work to make the data collecting part of the
> process as optimal as possible but having this feature enabled by
> default is simply a package we have to carry with us. printk doesn't
> cope with such a load really great currently. There might be some future
> changes but the underline is that no matter how printk gets optimized
> there is still the payload to be printed. No matter this happens
> transparently async or explicitly done in a detached context.
>
> 2) is about the sync nature of the printk _right_now_ and that causes
> delays in the allocator context while the system is OOM. There are locks
> held both by the OOM context and in the call chain to the allocator
> potentially. The longer the oom context is going to take the longer is
> the agony going to take. Here is where the async printing might help
> because it would push out the heavy lifting to a different context.
>
> There is a clear agreement in this part.  The whole discussion in this
> thread is about how to achieve that. There are two ways. Develop a code
> to do that for this very specific case (aka push out to a worker) or
> rely on printk doing that for us and potentially many other places in a
> similar situation. I am definitely for the later option because a) it
> adds less code we have to maintain and b) it is a more generic solution.
>
> For the current or older kernels there are two ways to workaround for
> the problem and floods of oom killer events doesn't seem to be be a
> regular production system state (I would even dare to claim that
> something is terribly wrong if yes) so no quick&dirty hacks are due.
> Either tune the log level or simply disable dump_tasks. It is an useful
> tool in some cases but not really necessary in the vast majority of
> cases.
>

Thanks for your explanation.
We have an agreement here.

> > I think the aysnc printk() won't care about wheter the data is
> > improtant or not, so the user of printk() (even if it is asyn) should
> > have a good management of these data especially if these data may
> > consume all or most of the printk buffer.
>
> Not sure what you mean here. We do have an option to tune the ring
> buffer (both size and log levels) and dump_tasks specifically.
>

There're drawbacks in both of these two options.

printk() is multiple-producer and mutiple-consumer.
OOM is one of the producers.
logfile (/var/log/messages) and console are two of the consumers.
Now let's see the drawback of each option.

- tune the ring buffer (both size and log levels)
All the producers are effected.
For example, if you tune the log levels, then all producers have the
same loglevel with dump_stack() can't show in the console.
Tuning the size may be not scalable, because we don't know how slow
the console is and tuning it too big is a waste of memory.

- tune the dump_tasks specifically (vm.oom_dump_tasks)
All the consumers are effected.
The logfile is fast enough, so we expect that these dump_tasks could
be printed into the logfile.
The console is so slow that we don't want to print into it.
A possilbe way to fix it is improve vm.oom_dump_tasks.
    vm.oom_dump_tasks : 1 - dump into all consumers
                                         2 - don't dump into console
                                         0 - don't dump into any of
the consumers
But someone may still needs these dump_tasks from the console.

As I'm not familiar with asyn printk(), my understanding may be not correct.

-- 
Thanks
Yafang


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [nacked] mm-oom-avoid-printk-iteration-under-rcu.patch removed from -mm tree
  2020-04-23 10:22                   ` Yafang Shao
@ 2020-04-23 11:07                     ` Michal Hocko
  2020-04-23 13:28                       ` Tetsuo Handa
  0 siblings, 1 reply; 21+ messages in thread
From: Michal Hocko @ 2020-04-23 11:07 UTC (permalink / raw)
  To: Yafang Shao
  Cc: Tetsuo Handa, Andrew Morton, Roman Gushchin, linux-mm,
	David Rientjes, Shakeel Butt

On Thu 23-04-20 18:22:22, Yafang Shao wrote:
> On Thu, Apr 23, 2020 at 3:34 PM Michal Hocko <mhocko@suse.com> wrote:
[...]
> > > I think the aysnc printk() won't care about wheter the data is
> > > improtant or not, so the user of printk() (even if it is asyn) should
> > > have a good management of these data especially if these data may
> > > consume all or most of the printk buffer.
> >
> > Not sure what you mean here. We do have an option to tune the ring
> > buffer (both size and log levels) and dump_tasks specifically.
> >
> 
> There're drawbacks in both of these two options.

They surely are and they were meant as a workaround until printk is more
capable to handle the data flow which we need.

> printk() is multiple-producer and mutiple-consumer.
> OOM is one of the producers.
> logfile (/var/log/messages) and console are two of the consumers.
> Now let's see the drawback of each option.
> 
> - tune the ring buffer (both size and log levels)
> All the producers are effected.
> For example, if you tune the log levels, then all producers have the
> same loglevel with dump_stack() can't show in the console.

The existing loglevels we use are not really carved in stone and we can
prioritize some more than others. dump_tasks is KERN_INFO already and
this is quite a low priority so you shouldn't really miss much when
omitting it. But I wouldn't mind making it KERN_DEBUG.

> Tuning the size may be not scalable, because we don't know how slow
> the console is and tuning it too big is a waste of memory.
> 
> - tune the dump_tasks specifically (vm.oom_dump_tasks)
> All the consumers are effected.
> The logfile is fast enough, so we expect that these dump_tasks could
> be printed into the logfile.
> The console is so slow that we don't want to print into it.
> A possilbe way to fix it is improve vm.oom_dump_tasks.
>     vm.oom_dump_tasks : 1 - dump into all consumers
>                                          2 - don't dump into console
>                                          0 - don't dump into any of

How would that be implemented. I do not know of a way to tell printk
which consoles to use for the output.  Anyway, isn't this something
that can be configured on the printk level. In other words send only
important information to slow consoles?

> the consumers
> But someone may still needs these dump_tasks from the console.

Well, most people simply do not care. I know a bold statement but it's
been couple of years since I am staring at oom reports on regular bases
and I have used that information only in a very marginal percentage of them.
Sure different people need different stuff but nothing really prevents
people from enabling the feature if they feel like. With an obvious
caveat that it can generate a lot of output and that is not a great fit
for slow consoles.
-- 
Michal Hocko
SUSE Labs


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [nacked] mm-oom-avoid-printk-iteration-under-rcu.patch removed from -mm tree
  2020-04-23 11:07                     ` Michal Hocko
@ 2020-04-23 13:28                       ` Tetsuo Handa
  2020-04-23 14:06                         ` Michal Hocko
                                           ` (2 more replies)
  0 siblings, 3 replies; 21+ messages in thread
From: Tetsuo Handa @ 2020-04-23 13:28 UTC (permalink / raw)
  To: Michal Hocko, Yafang Shao
  Cc: Andrew Morton, Roman Gushchin, linux-mm, David Rientjes,
	Shakeel Butt, Sergey Senozhatsky, Petr Mladek, Steven Rostedt

On 2020/04/23 20:07, Michal Hocko wrote:
> The existing loglevels we use are not really carved in stone and we can
> prioritize some more than others. dump_tasks is KERN_INFO already and
> this is quite a low priority so you shouldn't really miss much when
> omitting it. But I wouldn't mind making it KERN_DEBUG.

Do you mean

-       pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu         %5hd %s\n",
+       pr_debug("[%7d] %5d %5d %8lu %8lu %8ld %8lu         %5hd %s\n",

in order to suppress printing to consoles? Wow! That will also suppress
saving to log files because syslog daemon is likely configured not to save
KERN_DEBUG level messages.

>> - tune the dump_tasks specifically (vm.oom_dump_tasks)
>> All the consumers are effected.
>> The logfile is fast enough, so we expect that these dump_tasks could
>> be printed into the logfile.
>> The console is so slow that we don't want to print into it.
>> A possilbe way to fix it is improve vm.oom_dump_tasks.
>>     vm.oom_dump_tasks : 1 - dump into all consumers
>>                                          2 - don't dump into console
>>                                          0 - don't dump into any of
> 
> How would that be implemented. I do not know of a way to tell printk
> which consoles to use for the output.  Anyway, isn't this something
> that can be configured on the printk level. In other words send only
> important information to slow consoles?

Last year I proposed
https://lkml.kernel.org/r/1550896930-12324-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp
and Sergey Senozhatsky commented

  "This is a bit of a strange issue, to be honest. If OOM prints too
   many messages then we might want to do some work on the OOM side."

. I was thinking that printing to consoles is a requirement for oom_dump_tasks .

If we can agree with not printing dump_tasks() output to consoles, a trivial
patch shown below will solve the problem. Those who cannot run syslog daemon
in userspace might disagree, but this will be the simplest answer.

 include/linux/kern_levels.h |    3 +++
 include/linux/printk.h      |    1 +
 kernel/printk/printk.c      |    7 ++++++-
 mm/oom_kill.c               |    7 ++++---
 4 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/include/linux/kern_levels.h b/include/linux/kern_levels.h
index bf2389c26ae3..cd69a9cb3c2a 100644
--- a/include/linux/kern_levels.h
+++ b/include/linux/kern_levels.h
@@ -23,6 +23,9 @@
  */
 #define KERN_CONT	KERN_SOH "c"
 
+/* Annotation for "don't print to consoles". */
+#define KERN_NO_CONSOLES KERN_SOH "S"
+
 /* integer equivalents of KERN_<LEVEL> */
 #define LOGLEVEL_SCHED		-2	/* Deferred messages from sched code
 					 * are set to this special level */
diff --git a/include/linux/printk.h b/include/linux/printk.h
index e061635e0409..da338b81c2e1 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -19,6 +19,7 @@ static inline int printk_get_level(const char *buffer)
 		switch (buffer[1]) {
 		case '0' ... '7':
 		case 'c':	/* KERN_CONT */
+		case 'S':       /* KERN_NO_CONSOLES */
 			return buffer[1];
 		}
 	}
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 9a9b6156270b..ed51641af087 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -361,6 +361,7 @@ static int console_msg_format = MSG_FORMAT_DEFAULT;
  */
 
 enum log_flags {
+	LOG_NO_CONSOLES = 1,    /* don't print to consoles */
 	LOG_NEWLINE	= 2,	/* text ended with a newline */
 	LOG_CONT	= 8,	/* text is a fragment of a continuation line */
 };
@@ -1959,6 +1960,9 @@ int vprintk_store(int facility, int level,
 				break;
 			case 'c':	/* KERN_CONT */
 				lflags |= LOG_CONT;
+				break;
+			case 'S':       /* KERN_NO_CONSOLES */
+				lflags |= LOG_NO_CONSOLES;
 			}
 
 			text_len -= 2;
@@ -2453,7 +2457,8 @@ void console_unlock(void)
 			break;
 
 		msg = log_from_idx(console_idx);
-		if (suppress_message_printing(msg->level)) {
+		if ((msg->flags & LOG_NO_CONSOLES) ||
+		    suppress_message_printing(msg->level)) {
 			/*
 			 * Skip record we have buffered and already printed
 			 * directly to the console when we received it, and
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index dfc357614e56..0b487c13a2c9 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -400,7 +400,7 @@ static int dump_task(struct task_struct *p, void *arg)
 		return 0;
 	}
 
-	pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu         %5hd %s\n",
+	pr_info(KERN_NO_CONSOLES "[%7d] %5d %5d %8lu %8lu %8ld %8lu         %5hd %s\n",
 		task->pid, from_kuid(&init_user_ns, task_uid(task)),
 		task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
 		mm_pgtables_bytes(task->mm),
@@ -423,8 +423,9 @@ static int dump_task(struct task_struct *p, void *arg)
  */
 static void dump_tasks(struct oom_control *oc)
 {
-	pr_info("Tasks state (memory values in pages):\n");
-	pr_info("[  pid  ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name\n");
+	pr_info_once("Tasks state is sent to syslog.\n");
+	pr_info(KERN_NO_CONSOLES "Tasks state (memory values in pages):\n");
+	pr_info(KERN_NO_CONSOLES "[  pid  ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name\n");
 
 	if (is_memcg_oom(oc))
 		mem_cgroup_scan_tasks(oc->memcg, dump_task, oc);




^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [nacked] mm-oom-avoid-printk-iteration-under-rcu.patch removed from -mm tree
  2020-04-23 13:28                       ` Tetsuo Handa
@ 2020-04-23 14:06                         ` Michal Hocko
  2020-04-23 14:14                           ` Tetsuo Handa
  2020-04-24 13:02                           ` Steven Rostedt
  2020-04-23 15:54                         ` Sergey Senozhatsky
  2020-04-24  0:56                         ` Yafang Shao
  2 siblings, 2 replies; 21+ messages in thread
From: Michal Hocko @ 2020-04-23 14:06 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Yafang Shao, Andrew Morton, Roman Gushchin, linux-mm,
	David Rientjes, Shakeel Butt, Sergey Senozhatsky, Petr Mladek,
	Steven Rostedt

On Thu 23-04-20 22:28:00, Tetsuo Handa wrote:
> On 2020/04/23 20:07, Michal Hocko wrote:
> > The existing loglevels we use are not really carved in stone and we can
> > prioritize some more than others. dump_tasks is KERN_INFO already and
> > this is quite a low priority so you shouldn't really miss much when
> > omitting it. But I wouldn't mind making it KERN_DEBUG.
> 
> Do you mean
> 
> -       pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu         %5hd %s\n",
> +       pr_debug("[%7d] %5d %5d %8lu %8lu %8ld %8lu         %5hd %s\n",
> 
> in order to suppress printing to consoles?

Not with the primary intention. But reducing the log level might allow
to filter it more easily because KERN_DEBUG should really be something
that can be dropped on the floor. I am not aware of heavy KERN_INFO
users that would matter so the current log level makes sense to me but
maybe there are some and that's why I've mentioned I wouldn't be
opposed.
 
[...]

> Last year I proposed
> https://lkml.kernel.org/r/1550896930-12324-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp
> and Sergey Senozhatsky commented
> 
>   "This is a bit of a strange issue, to be honest. If OOM prints too
>    many messages then we might want to do some work on the OOM side."
> 
> . I was thinking that printing to consoles is a requirement for oom_dump_tasks .
> 
> If we can agree with not printing dump_tasks() output to consoles, a trivial
> patch shown below will solve the problem. Those who cannot run syslog daemon
> in userspace might disagree, but this will be the simplest answer.

Talk to printk people about this.
-- 
Michal Hocko
SUSE Labs


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [nacked] mm-oom-avoid-printk-iteration-under-rcu.patch removed from -mm tree
  2020-04-23 14:06                         ` Michal Hocko
@ 2020-04-23 14:14                           ` Tetsuo Handa
  2020-04-23 14:35                             ` Michal Hocko
  2020-04-24 13:02                           ` Steven Rostedt
  1 sibling, 1 reply; 21+ messages in thread
From: Tetsuo Handa @ 2020-04-23 14:14 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Yafang Shao, Andrew Morton, Roman Gushchin, linux-mm,
	David Rientjes, Shakeel Butt, Sergey Senozhatsky, Petr Mladek,
	Steven Rostedt

On 2020/04/23 23:06, Michal Hocko wrote:
>> Last year I proposed
>> https://lkml.kernel.org/r/1550896930-12324-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp
>> and Sergey Senozhatsky commented
>>
>>   "This is a bit of a strange issue, to be honest. If OOM prints too
>>    many messages then we might want to do some work on the OOM side."
>>
>> . I was thinking that printing to consoles is a requirement for oom_dump_tasks .
>>
>> If we can agree with not printing dump_tasks() output to consoles, a trivial
>> patch shown below will solve the problem. Those who cannot run syslog daemon
>> in userspace might disagree, but this will be the simplest answer.
> 
> Talk to printk people about this.
> 

Already CC'ed. My question for you is are you OK with "not printing dump_tasks() output
to consoles?" . If your answer is yes, I will start a new thread for this proposal.


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [nacked] mm-oom-avoid-printk-iteration-under-rcu.patch removed from -mm tree
  2020-04-23 14:14                           ` Tetsuo Handa
@ 2020-04-23 14:35                             ` Michal Hocko
  0 siblings, 0 replies; 21+ messages in thread
From: Michal Hocko @ 2020-04-23 14:35 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Yafang Shao, Andrew Morton, Roman Gushchin, linux-mm,
	David Rientjes, Shakeel Butt, Sergey Senozhatsky, Petr Mladek,
	Steven Rostedt

On Thu 23-04-20 23:14:37, Tetsuo Handa wrote:
> On 2020/04/23 23:06, Michal Hocko wrote:
> >> Last year I proposed
> >> https://lkml.kernel.org/r/1550896930-12324-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp
> >> and Sergey Senozhatsky commented
> >>
> >>   "This is a bit of a strange issue, to be honest. If OOM prints too
> >>    many messages then we might want to do some work on the OOM side."
> >>
> >> . I was thinking that printing to consoles is a requirement for oom_dump_tasks .
> >>
> >> If we can agree with not printing dump_tasks() output to consoles, a trivial
> >> patch shown below will solve the problem. Those who cannot run syslog daemon
> >> in userspace might disagree, but this will be the simplest answer.
> > 
> > Talk to printk people about this.
> > 
> 
> Already CC'ed.

In a separate thread please.

> My question for you is are you OK with "not printing dump_tasks() output
> to consoles?" .

To be honest, I do not know whether that make sense. This sounds
like something that people might have different opinions on and
therefore it should be configured from the userspace. It is a policy
which output goes where. All we should be dealing with is a log level.
But as I've said this is something for printk maintainers.

I have said I am ok with tweaking log levels in the oom report.

-- 
Michal Hocko
SUSE Labs


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [nacked] mm-oom-avoid-printk-iteration-under-rcu.patch removed from -mm tree
  2020-04-23 13:28                       ` Tetsuo Handa
  2020-04-23 14:06                         ` Michal Hocko
@ 2020-04-23 15:54                         ` Sergey Senozhatsky
  2020-04-23 22:56                           ` Tetsuo Handa
  2020-04-24  0:56                         ` Yafang Shao
  2 siblings, 1 reply; 21+ messages in thread
From: Sergey Senozhatsky @ 2020-04-23 15:54 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Michal Hocko, Yafang Shao, Andrew Morton, Roman Gushchin,
	linux-mm, David Rientjes, Shakeel Butt, Sergey Senozhatsky,
	Petr Mladek, Steven Rostedt

On (20/04/23 22:28), Tetsuo Handa wrote:
[..]
> +/* Annotation for "don't print to consoles". */
> +#define KERN_NO_CONSOLES KERN_SOH "S"
> +
>  /* integer equivalents of KERN_<LEVEL> */
>  #define LOGLEVEL_SCHED		-2	/* Deferred messages from sched code
>  					 * are set to this special level */
> diff --git a/include/linux/printk.h b/include/linux/printk.h
> index e061635e0409..da338b81c2e1 100644
> --- a/include/linux/printk.h
> +++ b/include/linux/printk.h
> @@ -19,6 +19,7 @@ static inline int printk_get_level(const char *buffer)
>  		switch (buffer[1]) {
>  		case '0' ... '7':
>  		case 'c':	/* KERN_CONT */
> +		case 'S':       /* KERN_NO_CONSOLES */
>  			return buffer[1];
>  		}
>  	}
> diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
> index 9a9b6156270b..ed51641af087 100644
> --- a/kernel/printk/printk.c
> +++ b/kernel/printk/printk.c
> @@ -361,6 +361,7 @@ static int console_msg_format = MSG_FORMAT_DEFAULT;
>   */
>  
>  enum log_flags {
> +	LOG_NO_CONSOLES = 1,    /* don't print to consoles */
>  	LOG_NEWLINE	= 2,	/* text ended with a newline */
>  	LOG_CONT	= 8,	/* text is a fragment of a continuation line */
>  };
> @@ -1959,6 +1960,9 @@ int vprintk_store(int facility, int level,
>  				break;
>  			case 'c':	/* KERN_CONT */
>  				lflags |= LOG_CONT;
> +				break;
> +			case 'S':       /* KERN_NO_CONSOLES */
> +				lflags |= LOG_NO_CONSOLES;
>  			}
[..]

Hmm. Any chance that those NO_CONSOLE messages overflow logbuf so then
we start losing pending YES_CONSOLE messages?

	-ss


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [nacked] mm-oom-avoid-printk-iteration-under-rcu.patch removed from -mm tree
  2020-04-23 15:54                         ` Sergey Senozhatsky
@ 2020-04-23 22:56                           ` Tetsuo Handa
  0 siblings, 0 replies; 21+ messages in thread
From: Tetsuo Handa @ 2020-04-23 22:56 UTC (permalink / raw)
  To: Sergey Senozhatsky
  Cc: Michal Hocko, Yafang Shao, Andrew Morton, Roman Gushchin,
	linux-mm, David Rientjes, Shakeel Butt, Petr Mladek,
	Steven Rostedt

On 2020/04/24 0:54, Sergey Senozhatsky wrote:
> Hmm. Any chance that those NO_CONSOLE messages overflow logbuf so then
> we start losing pending YES_CONSOLE messages?

Of course there will be such possibility if printk() user is doing something
close to

  while (1) {
    cond_resched();
    printk("hello\n");
  }

which is not avoidable unless "we allocate huge logbuf" and/or "add a flow control
on the caller side". My proposal was to keep printk() synchronous by offloading
dump_tasks() to a deferrable context (i.e. a workqueue context) and adding a flow
control so that "logbuf will not overflow", and Michal Hocko is still refusing on
adding a flow control to dump_tasks() side. The possibility of overflowing logbuf
will remain no matter how printk() changes (i.e. even asynchronous printk cannot
solve).

NO_CONSOLE is not only a way for suppress printing to consoles but also a way for
reducing possibility of overflowing logbuf, by hoping that userspace syslog daemon
can read logbuf quickly enough to make free space for future NO_CONSOLE/YES_CONSOLE
messages.


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [nacked] mm-oom-avoid-printk-iteration-under-rcu.patch removed from -mm tree
  2020-04-23 13:28                       ` Tetsuo Handa
  2020-04-23 14:06                         ` Michal Hocko
  2020-04-23 15:54                         ` Sergey Senozhatsky
@ 2020-04-24  0:56                         ` Yafang Shao
  2020-04-24  1:16                           ` Tetsuo Handa
  2 siblings, 1 reply; 21+ messages in thread
From: Yafang Shao @ 2020-04-24  0:56 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: Michal Hocko, Andrew Morton, Roman Gushchin, linux-mm,
	David Rientjes, Shakeel Butt, Sergey Senozhatsky, Petr Mladek,
	Steven Rostedt

On Thu, Apr 23, 2020 at 9:28 PM Tetsuo Handa
<penguin-kernel@i-love.sakura.ne.jp> wrote:
>
> On 2020/04/23 20:07, Michal Hocko wrote:
> > The existing loglevels we use are not really carved in stone and we can
> > prioritize some more than others. dump_tasks is KERN_INFO already and
> > this is quite a low priority so you shouldn't really miss much when
> > omitting it. But I wouldn't mind making it KERN_DEBUG.
>
> Do you mean
>
> -       pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu         %5hd %s\n",
> +       pr_debug("[%7d] %5d %5d %8lu %8lu %8ld %8lu         %5hd %s\n",
>
> in order to suppress printing to consoles? Wow! That will also suppress
> saving to log files because syslog daemon is likely configured not to save
> KERN_DEBUG level messages.
>
> >> - tune the dump_tasks specifically (vm.oom_dump_tasks)
> >> All the consumers are effected.
> >> The logfile is fast enough, so we expect that these dump_tasks could
> >> be printed into the logfile.
> >> The console is so slow that we don't want to print into it.
> >> A possilbe way to fix it is improve vm.oom_dump_tasks.
> >>     vm.oom_dump_tasks : 1 - dump into all consumers
> >>                                          2 - don't dump into console
> >>                                          0 - don't dump into any of
> >
> > How would that be implemented. I do not know of a way to tell printk
> > which consoles to use for the output.  Anyway, isn't this something
> > that can be configured on the printk level. In other words send only
> > important information to slow consoles?
>
> Last year I proposed
> https://lkml.kernel.org/r/1550896930-12324-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp
> and Sergey Senozhatsky commented
>
>   "This is a bit of a strange issue, to be honest. If OOM prints too
>    many messages then we might want to do some work on the OOM side."
>
> . I was thinking that printing to consoles is a requirement for oom_dump_tasks .
>
> If we can agree with not printing dump_tasks() output to consoles, a trivial
> patch shown below will solve the problem. Those who cannot run syslog daemon
> in userspace might disagree, but this will be the simplest answer.
>
>  include/linux/kern_levels.h |    3 +++
>  include/linux/printk.h      |    1 +
>  kernel/printk/printk.c      |    7 ++++++-
>  mm/oom_kill.c               |    7 ++++---
>  4 files changed, 14 insertions(+), 4 deletions(-)
>
> diff --git a/include/linux/kern_levels.h b/include/linux/kern_levels.h
> index bf2389c26ae3..cd69a9cb3c2a 100644
> --- a/include/linux/kern_levels.h
> +++ b/include/linux/kern_levels.h
> @@ -23,6 +23,9 @@
>   */
>  #define KERN_CONT      KERN_SOH "c"
>
> +/* Annotation for "don't print to consoles". */
> +#define KERN_NO_CONSOLES KERN_SOH "S"
> +
>  /* integer equivalents of KERN_<LEVEL> */
>  #define LOGLEVEL_SCHED         -2      /* Deferred messages from sched code
>                                          * are set to this special level */
> diff --git a/include/linux/printk.h b/include/linux/printk.h
> index e061635e0409..da338b81c2e1 100644
> --- a/include/linux/printk.h
> +++ b/include/linux/printk.h
> @@ -19,6 +19,7 @@ static inline int printk_get_level(const char *buffer)
>                 switch (buffer[1]) {
>                 case '0' ... '7':
>                 case 'c':       /* KERN_CONT */
> +               case 'S':       /* KERN_NO_CONSOLES */
>                         return buffer[1];
>                 }
>         }
> diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
> index 9a9b6156270b..ed51641af087 100644
> --- a/kernel/printk/printk.c
> +++ b/kernel/printk/printk.c
> @@ -361,6 +361,7 @@ static int console_msg_format = MSG_FORMAT_DEFAULT;
>   */
>
>  enum log_flags {
> +       LOG_NO_CONSOLES = 1,    /* don't print to consoles */
>         LOG_NEWLINE     = 2,    /* text ended with a newline */
>         LOG_CONT        = 8,    /* text is a fragment of a continuation line */
>  };
> @@ -1959,6 +1960,9 @@ int vprintk_store(int facility, int level,
>                                 break;
>                         case 'c':       /* KERN_CONT */
>                                 lflags |= LOG_CONT;
> +                               break;
> +                       case 'S':       /* KERN_NO_CONSOLES */
> +                               lflags |= LOG_NO_CONSOLES;
>                         }
>
>                         text_len -= 2;
> @@ -2453,7 +2457,8 @@ void console_unlock(void)
>                         break;
>
>                 msg = log_from_idx(console_idx);
> -               if (suppress_message_printing(msg->level)) {
> +               if ((msg->flags & LOG_NO_CONSOLES) ||
> +                   suppress_message_printing(msg->level)) {
>                         /*
>                          * Skip record we have buffered and already printed
>                          * directly to the console when we received it, and
> diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> index dfc357614e56..0b487c13a2c9 100644
> --- a/mm/oom_kill.c
> +++ b/mm/oom_kill.c
> @@ -400,7 +400,7 @@ static int dump_task(struct task_struct *p, void *arg)
>                 return 0;
>         }
>
> -       pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu         %5hd %s\n",
> +       pr_info(KERN_NO_CONSOLES "[%7d] %5d %5d %8lu %8lu %8ld %8lu         %5hd %s\n",
>                 task->pid, from_kuid(&init_user_ns, task_uid(task)),
>                 task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
>                 mm_pgtables_bytes(task->mm),
> @@ -423,8 +423,9 @@ static int dump_task(struct task_struct *p, void *arg)
>   */
>  static void dump_tasks(struct oom_control *oc)
>  {
> -       pr_info("Tasks state (memory values in pages):\n");
> -       pr_info("[  pid  ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name\n");
> +       pr_info_once("Tasks state is sent to syslog.\n");
> +       pr_info(KERN_NO_CONSOLES "Tasks state (memory values in pages):\n");
> +       pr_info(KERN_NO_CONSOLES "[  pid  ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name\n");
>
>         if (is_memcg_oom(oc))
>                 mem_cgroup_scan_tasks(oc->memcg, dump_task, oc);
>
>

I suggest to set KERN_NO_CONSOLES by default but the user can tune it
back to the original behavior.
I'm not a fan of sysctl, but if there's no better chioce, enhancing
vm.oom_dump_tasks seems a possible solution.

-- 
Thanks
Yafang


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [nacked] mm-oom-avoid-printk-iteration-under-rcu.patch removed from -mm tree
  2020-04-24  0:56                         ` Yafang Shao
@ 2020-04-24  1:16                           ` Tetsuo Handa
  0 siblings, 0 replies; 21+ messages in thread
From: Tetsuo Handa @ 2020-04-24  1:16 UTC (permalink / raw)
  To: Yafang Shao
  Cc: Michal Hocko, Andrew Morton, Roman Gushchin, linux-mm,
	David Rientjes, Shakeel Butt, Sergey Senozhatsky, Petr Mladek,
	Steven Rostedt

On 2020/04/24 9:56, Yafang Shao wrote:
> I suggest to set KERN_NO_CONSOLES by default but the user can tune it
> back to the original behavior.

Yes, I'd like to set KERN_NO_CONSOLES by default.

> I'm not a fan of sysctl, but if there's no better chioce, enhancing
> vm.oom_dump_tasks seems a possible solution.

I don't think vm.oom_dump_tasks is the right variable to enhance. We can allow users
to suppress not only dump_tasks() output but also e.g. dump_stack() / show_mem()
output, for OOM-killer messages and memory allocation failure messages can take long
time (if printed to consoles) is an unhappy thing for OOM context and atomic context.
Since Dmitry Safonov is working on adding loglevel argument to show_stack(), we will
be able to control dump_stack() in near future. I think that we should introduce a new
sysctl variable under vm directory in order to implement bitmask for "which OOM and
memory allocation failure related messages should be printed to consoles". I think
that there is no need to print to consoles (i.e. saving to log files via userspace
syslog daemon is sufficient) for most systems.



^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [nacked] mm-oom-avoid-printk-iteration-under-rcu.patch removed from -mm tree
  2020-04-23 14:06                         ` Michal Hocko
  2020-04-23 14:14                           ` Tetsuo Handa
@ 2020-04-24 13:02                           ` Steven Rostedt
  2020-04-24 15:18                             ` Michal Hocko
  1 sibling, 1 reply; 21+ messages in thread
From: Steven Rostedt @ 2020-04-24 13:02 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Tetsuo Handa, Yafang Shao, Andrew Morton, Roman Gushchin,
	linux-mm, David Rientjes, Shakeel Butt, Sergey Senozhatsky,
	Petr Mladek

On Thu, 23 Apr 2020 16:06:46 +0200
Michal Hocko <mhocko@suse.com> wrote:

> > If we can agree with not printing dump_tasks() output to consoles, a trivial
> > patch shown below will solve the problem. Those who cannot run syslog daemon
> > in userspace might disagree, but this will be the simplest answer.  
> 
> Talk to printk people about this.

Please no!

Usually when the system runs out of memory, I have no access to a shell,
and the oom output to the console may be the only information I can use to
debug the situation.

-- Steve


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [nacked] mm-oom-avoid-printk-iteration-under-rcu.patch removed from -mm tree
  2020-04-24 13:02                           ` Steven Rostedt
@ 2020-04-24 15:18                             ` Michal Hocko
  0 siblings, 0 replies; 21+ messages in thread
From: Michal Hocko @ 2020-04-24 15:18 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Tetsuo Handa, Yafang Shao, Andrew Morton, Roman Gushchin,
	linux-mm, David Rientjes, Shakeel Butt, Sergey Senozhatsky,
	Petr Mladek

On Fri 24-04-20 09:02:30, Steven Rostedt wrote:
> On Thu, 23 Apr 2020 16:06:46 +0200
> Michal Hocko <mhocko@suse.com> wrote:
> 
> > > If we can agree with not printing dump_tasks() output to consoles, a trivial
> > > patch shown below will solve the problem. Those who cannot run syslog daemon
> > > in userspace might disagree, but this will be the simplest answer.  
> > 
> > Talk to printk people about this.
> 
> Please no!
> 
> Usually when the system runs out of memory, I have no access to a shell,
> and the oom output to the console may be the only information I can use to
> debug the situation.

Well, as I've said in other email, what belongs to console is a matter
of userspace policy which should be based on the loglevel [1].  Whether
a concept of a loglevel or some other means to tell that this is a large
bag of output that doesn't have the top priority and printk would do
something clever about that is a question for you (printk maintainers).

[1] http://lkml.kernel.org/r/20200423143550.GH4206@dhcp22.suse.cz
-- 
Michal Hocko
SUSE Labs


^ permalink raw reply	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2020-04-24 15:18 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <20200131043324.wkArXTf6-%akpm@linux-foundation.org>
2020-04-17 14:33 ` [nacked] mm-oom-avoid-printk-iteration-under-rcu.patch removed from -mm tree Tetsuo Handa
2020-04-17 15:26   ` Michal Hocko
2020-04-18 10:13     ` Tetsuo Handa
2020-04-20  7:33       ` Michal Hocko
2020-04-22  6:40         ` Tetsuo Handa
2020-04-22  6:59           ` Michal Hocko
2020-04-23  5:34             ` Tetsuo Handa
2020-04-23  6:35               ` Yafang Shao
2020-04-23  7:34                 ` Michal Hocko
2020-04-23 10:22                   ` Yafang Shao
2020-04-23 11:07                     ` Michal Hocko
2020-04-23 13:28                       ` Tetsuo Handa
2020-04-23 14:06                         ` Michal Hocko
2020-04-23 14:14                           ` Tetsuo Handa
2020-04-23 14:35                             ` Michal Hocko
2020-04-24 13:02                           ` Steven Rostedt
2020-04-24 15:18                             ` Michal Hocko
2020-04-23 15:54                         ` Sergey Senozhatsky
2020-04-23 22:56                           ` Tetsuo Handa
2020-04-24  0:56                         ` Yafang Shao
2020-04-24  1:16                           ` Tetsuo Handa

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).