All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/2] tracing: Add trace for task_exit
@ 2021-04-30 14:22 Peter Enderborg
  2021-04-30 14:22 ` [PATCH 1/2] tracing: Add a " Peter Enderborg
  2021-04-30 14:22 ` [PATCH 2/2] tracing: Align task.h to use __assing_str for strings Peter Enderborg
  0 siblings, 2 replies; 15+ messages in thread
From: Peter Enderborg @ 2021-04-30 14:22 UTC (permalink / raw)
  To: linux-kernel, Steven Rostedt, Ingo Molnar, Andrew Morton,
	Eric W . Biederman, Peter Zijlstra, Alexei Starovoitov,
	Christian Brauner, Davidlohr Bueso, Michel Lespinasse, Jann Horn,
	Mathieu Desnoyers, Christophe Leroy, Minchan Kim


Patch 1 adds a tracepoint for task_exit and use trace strings for that.
the second algin the other (task_newtask and task_rename) to use same
string.

Knowing when and why tasks dies are very useful for varius debugging
aktivites in userspace and we can trace tasks complete life-cycle.


^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH 1/2] tracing: Add a trace for task_exit
  2021-04-30 14:22 [PATCH 0/2] tracing: Add trace for task_exit Peter Enderborg
@ 2021-04-30 14:22 ` Peter Enderborg
  2021-04-30 17:48   ` Eric W. Biederman
  2021-04-30 14:22 ` [PATCH 2/2] tracing: Align task.h to use __assing_str for strings Peter Enderborg
  1 sibling, 1 reply; 15+ messages in thread
From: Peter Enderborg @ 2021-04-30 14:22 UTC (permalink / raw)
  To: linux-kernel, Steven Rostedt, Ingo Molnar, Andrew Morton,
	Eric W . Biederman, Peter Zijlstra, Alexei Starovoitov,
	Christian Brauner, Davidlohr Bueso, Michel Lespinasse, Jann Horn,
	Mathieu Desnoyers, Christophe Leroy, Minchan Kim
  Cc: Peter Enderborg

This is the peer functions to task_rename and task_newtask.
With this we get hole "life-cycle" of task and can easily
see short livied task and their exit status.

Format might look like:
            bash-1144    [006] ....  1306.601707: task_newtask: pid=1181 comm=bash clone_flags=1200000 oom_score_adj=0
           <...>-1181    [007] ....  1306.602080: task_rename: pid=1181 oldcomm=bash newcomm=ls oom_score_adj=0
            bash-1144    [006] d...  1306.785960: task_exit: pid=1181 oom_score_adj=0 exit_signal=17 exit_code=0 exit_state=0x10 comm=ls

For a sequence when a bash shell runs the ls command.

Signed-off-by: Peter Enderborg <peter.enderborg@sony.com>
---
 include/trace/events/task.h | 32 ++++++++++++++++++++++++++++++++
 kernel/exit.c               |  3 +++
 2 files changed, 35 insertions(+)

diff --git a/include/trace/events/task.h b/include/trace/events/task.h
index 64d160930b0d..2e977d2935e1 100644
--- a/include/trace/events/task.h
+++ b/include/trace/events/task.h
@@ -56,6 +56,38 @@ TRACE_EVENT(task_rename,
 		__entry->newcomm, __entry->oom_score_adj)
 );
 
+TRACE_EVENT(task_exit,
+
+	TP_PROTO(struct task_struct *task),
+
+	TP_ARGS(task),
+
+	TP_STRUCT__entry(
+		__field(pid_t,	pid)
+		__field(short,	oom_score_adj)
+		__field(int,	exit_signal)
+		__field(int,	exit_code)
+		__field(int,	exit_state)
+		__string(comm, task->comm)
+
+	),
+
+	TP_fast_assign(
+		__entry->pid = task->pid;
+		__entry->oom_score_adj = task->signal->oom_score_adj;
+		__entry->exit_signal = task->exit_signal;
+		__entry->exit_code = task->exit_code;
+		__entry->exit_state = task->exit_state;
+		__assign_str(comm, task->comm);
+	),
+
+	TP_printk("pid=%d oom_score_adj=%hd exit_signal=%d exit_code=%d exit_state=0x%x comm=%s",
+		  __entry->pid,
+		  __entry->oom_score_adj, __entry->exit_signal,
+		  __entry->exit_code, __entry->exit_state,
+		  __get_str(comm))
+);
+
 #endif
 
 /* This part must be outside protection */
diff --git a/kernel/exit.c b/kernel/exit.c
index 04029e35e69a..3ab0944e5dfc 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -68,6 +68,7 @@
 #include <linux/uaccess.h>
 #include <asm/unistd.h>
 #include <asm/mmu_context.h>
+#include <trace/events/task.h>
 
 static void __unhash_process(struct task_struct *p, bool group_dead)
 {
@@ -107,6 +108,8 @@ static void __exit_signal(struct task_struct *tsk)
 		posix_cpu_timers_exit_group(tsk);
 #endif
 
+	trace_task_exit(tsk);
+
 	if (group_dead) {
 		tty = sig->tty;
 		sig->tty = NULL;
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH 2/2] tracing: Align task.h to use __assing_str for strings.
  2021-04-30 14:22 [PATCH 0/2] tracing: Add trace for task_exit Peter Enderborg
  2021-04-30 14:22 ` [PATCH 1/2] tracing: Add a " Peter Enderborg
@ 2021-04-30 14:22 ` Peter Enderborg
  1 sibling, 0 replies; 15+ messages in thread
From: Peter Enderborg @ 2021-04-30 14:22 UTC (permalink / raw)
  To: linux-kernel, Steven Rostedt, Ingo Molnar, Andrew Morton,
	Eric W . Biederman, Peter Zijlstra, Alexei Starovoitov,
	Christian Brauner, Davidlohr Bueso, Michel Lespinasse, Jann Horn,
	Mathieu Desnoyers, Christophe Leroy, Minchan Kim
  Cc: Peter Enderborg

This moves the comm strings to use the tracing string functions
instead of char arrays so they all work in the same way.

Signed-off-by: Peter Enderborg <peter.enderborg@sony.com>
---
 include/trace/events/task.h | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/include/trace/events/task.h b/include/trace/events/task.h
index 2e977d2935e1..2bd314e60cb6 100644
--- a/include/trace/events/task.h
+++ b/include/trace/events/task.h
@@ -14,20 +14,20 @@ TRACE_EVENT(task_newtask,
 
 	TP_STRUCT__entry(
 		__field(	pid_t,	pid)
-		__array(	char,	comm, TASK_COMM_LEN)
+		__string(comm, task->comm)
 		__field( unsigned long, clone_flags)
 		__field(	short,	oom_score_adj)
 	),
 
 	TP_fast_assign(
 		__entry->pid = task->pid;
-		memcpy(__entry->comm, task->comm, TASK_COMM_LEN);
+		__assign_str(comm, task->comm);
 		__entry->clone_flags = clone_flags;
 		__entry->oom_score_adj = task->signal->oom_score_adj;
 	),
 
 	TP_printk("pid=%d comm=%s clone_flags=%lx oom_score_adj=%hd",
-		__entry->pid, __entry->comm,
+		  __entry->pid, __get_str(comm),
 		__entry->clone_flags, __entry->oom_score_adj)
 );
 
@@ -39,21 +39,21 @@ TRACE_EVENT(task_rename,
 
 	TP_STRUCT__entry(
 		__field(	pid_t,	pid)
-		__array(	char, oldcomm,  TASK_COMM_LEN)
-		__array(	char, newcomm,  TASK_COMM_LEN)
+		__string(oldcomm, task->comm)
+		__string(newcomm, comm)
 		__field(	short,	oom_score_adj)
 	),
 
 	TP_fast_assign(
 		__entry->pid = task->pid;
-		memcpy(entry->oldcomm, task->comm, TASK_COMM_LEN);
-		strlcpy(entry->newcomm, comm, TASK_COMM_LEN);
+		__assign_str(oldcomm, task->comm);
+		__assign_str(newcomm, comm);
 		__entry->oom_score_adj = task->signal->oom_score_adj;
 	),
 
 	TP_printk("pid=%d oldcomm=%s newcomm=%s oom_score_adj=%hd",
-		__entry->pid, __entry->oldcomm,
-		__entry->newcomm, __entry->oom_score_adj)
+		__entry->pid, __get_str(oldcomm),
+		__get_str(newcomm), __entry->oom_score_adj)
 );
 
 TRACE_EVENT(task_exit,
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/2] tracing: Add a trace for task_exit
  2021-04-30 14:22 ` [PATCH 1/2] tracing: Add a " Peter Enderborg
@ 2021-04-30 17:48   ` Eric W. Biederman
  2021-05-01  9:29     ` Peter.Enderborg
  0 siblings, 1 reply; 15+ messages in thread
From: Eric W. Biederman @ 2021-04-30 17:48 UTC (permalink / raw)
  To: Peter Enderborg
  Cc: linux-kernel, Steven Rostedt, Ingo Molnar, Andrew Morton,
	Peter Zijlstra, Alexei Starovoitov, Christian Brauner,
	Davidlohr Bueso, Michel Lespinasse, Jann Horn, Mathieu Desnoyers,
	Christophe Leroy, Minchan Kim

Peter Enderborg <peter.enderborg@sony.com> writes:

> This is the peer functions to task_rename and task_newtask.
> With this we get hole "life-cycle" of task and can easily
> see short livied task and their exit status.

This patch is incorrect.  The location you are dealing with is not part
of task exit.  The location you have instrumented is part of reaping a
task which can come arbitrarily long after the task exits.

There are some special rules associated with task_comm so I don't know
if your change to __string from a fixed size character array is safe.

Certainly something like that needs an explanation of why such a type
change is safe.

Eric


> Format might look like:
>             bash-1144    [006] ....  1306.601707: task_newtask: pid=1181 comm=bash clone_flags=1200000 oom_score_adj=0
>            <...>-1181    [007] ....  1306.602080: task_rename: pid=1181 oldcomm=bash newcomm=ls oom_score_adj=0
>             bash-1144    [006] d...  1306.785960: task_exit: pid=1181 oom_score_adj=0 exit_signal=17 exit_code=0 exit_state=0x10 comm=ls
>
> For a sequence when a bash shell runs the ls command.
>
> Signed-off-by: Peter Enderborg <peter.enderborg@sony.com>
> ---
>  include/trace/events/task.h | 32 ++++++++++++++++++++++++++++++++
>  kernel/exit.c               |  3 +++
>  2 files changed, 35 insertions(+)
>
> diff --git a/include/trace/events/task.h b/include/trace/events/task.h
> index 64d160930b0d..2e977d2935e1 100644
> --- a/include/trace/events/task.h
> +++ b/include/trace/events/task.h
> @@ -56,6 +56,38 @@ TRACE_EVENT(task_rename,
>  		__entry->newcomm, __entry->oom_score_adj)
>  );
>  
> +TRACE_EVENT(task_exit,
> +
> +	TP_PROTO(struct task_struct *task),
> +
> +	TP_ARGS(task),
> +
> +	TP_STRUCT__entry(
> +		__field(pid_t,	pid)
> +		__field(short,	oom_score_adj)
> +		__field(int,	exit_signal)
> +		__field(int,	exit_code)
> +		__field(int,	exit_state)
> +		__string(comm, task->comm)
> +
> +	),
> +
> +	TP_fast_assign(
> +		__entry->pid = task->pid;
> +		__entry->oom_score_adj = task->signal->oom_score_adj;
> +		__entry->exit_signal = task->exit_signal;
> +		__entry->exit_code = task->exit_code;
> +		__entry->exit_state = task->exit_state;
> +		__assign_str(comm, task->comm);
> +	),
> +
> +	TP_printk("pid=%d oom_score_adj=%hd exit_signal=%d exit_code=%d exit_state=0x%x comm=%s",
> +		  __entry->pid,
> +		  __entry->oom_score_adj, __entry->exit_signal,
> +		  __entry->exit_code, __entry->exit_state,
> +		  __get_str(comm))
> +);
> +
>  #endif
>  
>  /* This part must be outside protection */
> diff --git a/kernel/exit.c b/kernel/exit.c
> index 04029e35e69a..3ab0944e5dfc 100644
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -68,6 +68,7 @@
>  #include <linux/uaccess.h>
>  #include <asm/unistd.h>
>  #include <asm/mmu_context.h>
> +#include <trace/events/task.h>
>  
>  static void __unhash_process(struct task_struct *p, bool group_dead)
>  {
> @@ -107,6 +108,8 @@ static void __exit_signal(struct task_struct *tsk)
>  		posix_cpu_timers_exit_group(tsk);
>  #endif
>  
> +	trace_task_exit(tsk);
> +
>  	if (group_dead) {
>  		tty = sig->tty;
>  		sig->tty = NULL;

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/2] tracing: Add a trace for task_exit
  2021-04-30 17:48   ` Eric W. Biederman
@ 2021-05-01  9:29     ` Peter.Enderborg
  2021-05-01 13:11       ` Steven Rostedt
  0 siblings, 1 reply; 15+ messages in thread
From: Peter.Enderborg @ 2021-05-01  9:29 UTC (permalink / raw)
  To: ebiederm
  Cc: linux-kernel, rostedt, mingo, akpm, peterz, ast,
	christian.brauner, dave, walken, jannh, mathieu.desnoyers,
	christophe.leroy, minchan

On 4/30/21 7:48 PM, Eric W. Biederman wrote:
> Peter Enderborg <peter.enderborg@sony.com> writes:
>
>> This is the peer functions to task_rename and task_newtask.
>> With this we get hole "life-cycle" of task and can easily
>> see short livied task and their exit status.
> This patch is incorrect.  The location you are dealing with is not part
> of task exit.  The location you have instrumented is part of reaping a
> task which can come arbitrarily long after the task exits.

That is what it aiming. When using this as tool for userspace you
would like to know when the task is done. When it no longer
holds any thing that might have any impact. If you think the
exit imply something more specific I can change the name.

I thought exit was a good name, it is in in exit.c.

Will the name task_done, task_finished or task_reaped work for you?


>
> There are some special rules associated with task_comm so I don't know
> if your change to __string from a fixed size character array is safe.
>
> Certainly something like that needs an explanation of why such a type
> change is safe.
>
> Eric
>
>
>> Format might look like:
>>             bash-1144    [006] ....  1306.601707: task_newtask: pid=1181 comm=bash clone_flags=1200000 oom_score_adj=0
>>            <...>-1181    [007] ....  1306.602080: task_rename: pid=1181 oldcomm=bash newcomm=ls oom_score_adj=0
>>             bash-1144    [006] d...  1306.785960: task_exit: pid=1181 oom_score_adj=0 exit_signal=17 exit_code=0 exit_state=0x10 comm=ls
>>
>> For a sequence when a bash shell runs the ls command.
>>
>> Signed-off-by: Peter Enderborg <peter.enderborg@sony.com>
>> ---
>>  include/trace/events/task.h | 32 ++++++++++++++++++++++++++++++++
>>  kernel/exit.c               |  3 +++
>>  2 files changed, 35 insertions(+)
>>
>> diff --git a/include/trace/events/task.h b/include/trace/events/task.h
>> index 64d160930b0d..2e977d2935e1 100644
>> --- a/include/trace/events/task.h
>> +++ b/include/trace/events/task.h
>> @@ -56,6 +56,38 @@ TRACE_EVENT(task_rename,
>>  		__entry->newcomm, __entry->oom_score_adj)
>>  );
>>  
>> +TRACE_EVENT(task_exit,
>> +
>> +	TP_PROTO(struct task_struct *task),
>> +
>> +	TP_ARGS(task),
>> +
>> +	TP_STRUCT__entry(
>> +		__field(pid_t,	pid)
>> +		__field(short,	oom_score_adj)
>> +		__field(int,	exit_signal)
>> +		__field(int,	exit_code)
>> +		__field(int,	exit_state)
>> +		__string(comm, task->comm)
>> +
>> +	),
>> +
>> +	TP_fast_assign(
>> +		__entry->pid = task->pid;
>> +		__entry->oom_score_adj = task->signal->oom_score_adj;
>> +		__entry->exit_signal = task->exit_signal;
>> +		__entry->exit_code = task->exit_code;
>> +		__entry->exit_state = task->exit_state;
>> +		__assign_str(comm, task->comm);
>> +	),
>> +
>> +	TP_printk("pid=%d oom_score_adj=%hd exit_signal=%d exit_code=%d exit_state=0x%x comm=%s",
>> +		  __entry->pid,
>> +		  __entry->oom_score_adj, __entry->exit_signal,
>> +		  __entry->exit_code, __entry->exit_state,
>> +		  __get_str(comm))
>> +);
>> +
>>  #endif
>>  
>>  /* This part must be outside protection */
>> diff --git a/kernel/exit.c b/kernel/exit.c
>> index 04029e35e69a..3ab0944e5dfc 100644
>> --- a/kernel/exit.c
>> +++ b/kernel/exit.c
>> @@ -68,6 +68,7 @@
>>  #include <linux/uaccess.h>
>>  #include <asm/unistd.h>
>>  #include <asm/mmu_context.h>
>> +#include <trace/events/task.h>
>>  
>>  static void __unhash_process(struct task_struct *p, bool group_dead)
>>  {
>> @@ -107,6 +108,8 @@ static void __exit_signal(struct task_struct *tsk)
>>  		posix_cpu_timers_exit_group(tsk);
>>  #endif
>>  
>> +	trace_task_exit(tsk);
>> +
>>  	if (group_dead) {
>>  		tty = sig->tty;
>>  		sig->tty = NULL;


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/2] tracing: Add a trace for task_exit
  2021-05-01  9:29     ` Peter.Enderborg
@ 2021-05-01 13:11       ` Steven Rostedt
  2021-05-03 13:50         ` Mathieu Desnoyers
  0 siblings, 1 reply; 15+ messages in thread
From: Steven Rostedt @ 2021-05-01 13:11 UTC (permalink / raw)
  To: Peter.Enderborg
  Cc: ebiederm, linux-kernel, mingo, akpm, peterz, ast,
	christian.brauner, dave, walken, jannh, mathieu.desnoyers,
	christophe.leroy, minchan

On Sat, 1 May 2021 09:29:41 +0000
<Peter.Enderborg@sony.com> wrote:

> On 4/30/21 7:48 PM, Eric W. Biederman wrote:
> > Peter Enderborg <peter.enderborg@sony.com> writes:
> >  
> >> This is the peer functions to task_rename and task_newtask.
> >> With this we get hole "life-cycle" of task and can easily
> >> see short livied task and their exit status.  
> > This patch is incorrect.  The location you are dealing with is not part
> > of task exit.  The location you have instrumented is part of reaping a
> > task which can come arbitrarily long after the task exits.  
> 
> That is what it aiming. When using this as tool for userspace you
> would like to know when the task is done. When it no longer
> holds any thing that might have any impact. If you think the
> exit imply something more specific I can change the name.
> 
> I thought exit was a good name, it is in in exit.c.
> 
> Will the name task_done, task_finished or task_reaped work for you?

I think "task_reaped" is probably the best name, and the most
descriptive of what happened.

-- Steve

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/2] tracing: Add a trace for task_exit
  2021-05-01 13:11       ` Steven Rostedt
@ 2021-05-03 13:50         ` Mathieu Desnoyers
  2021-05-03 14:48           ` Peter.Enderborg
  0 siblings, 1 reply; 15+ messages in thread
From: Mathieu Desnoyers @ 2021-05-03 13:50 UTC (permalink / raw)
  To: rostedt
  Cc: peter enderborg, Eric W. Biederman, linux-kernel, Ingo Molnar,
	Andrew Morton, Peter Zijlstra, Alexei Starovoitov,
	Christian Brauner, Davidlohr Bueso, Michel Lespinasse, Jann Horn,
	christophe leroy, Minchan Kim

----- On May 1, 2021, at 9:11 AM, rostedt rostedt@goodmis.org wrote:

> On Sat, 1 May 2021 09:29:41 +0000
> <Peter.Enderborg@sony.com> wrote:
> 
>> On 4/30/21 7:48 PM, Eric W. Biederman wrote:
>> > Peter Enderborg <peter.enderborg@sony.com> writes:
>> >  
>> >> This is the peer functions to task_rename and task_newtask.
>> >> With this we get hole "life-cycle" of task and can easily
>> >> see short livied task and their exit status.
>> > This patch is incorrect.  The location you are dealing with is not part
>> > of task exit.  The location you have instrumented is part of reaping a
>> > task which can come arbitrarily long after the task exits.
>> 
>> That is what it aiming. When using this as tool for userspace you
>> would like to know when the task is done. When it no longer
>> holds any thing that might have any impact. If you think the
>> exit imply something more specific I can change the name.
>> 
>> I thought exit was a good name, it is in in exit.c.
>> 
>> Will the name task_done, task_finished or task_reaped work for you?
> 
> I think "task_reaped" is probably the best name, and the most
> descriptive of what happened.

What would it provide that is not already available through the "sched_process_free"
tracepoint in delayed_put_task_struct ?

Thanks,

Mathieu

-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/2] tracing: Add a trace for task_exit
  2021-05-03 13:50         ` Mathieu Desnoyers
@ 2021-05-03 14:48           ` Peter.Enderborg
  2021-05-03 16:30             ` Eric W. Biederman
  0 siblings, 1 reply; 15+ messages in thread
From: Peter.Enderborg @ 2021-05-03 14:48 UTC (permalink / raw)
  To: mathieu.desnoyers, rostedt
  Cc: ebiederm, linux-kernel, mingo, akpm, peterz, ast,
	christian.brauner, dave, walken, jannh, christophe.leroy,
	minchan

On 5/3/21 3:50 PM, Mathieu Desnoyers wrote:
> ----- On May 1, 2021, at 9:11 AM, rostedt rostedt@goodmis.org wrote:
>
>> On Sat, 1 May 2021 09:29:41 +0000
>> <Peter.Enderborg@sony.com> wrote:
>>
>>> On 4/30/21 7:48 PM, Eric W. Biederman wrote:
>>>> Peter Enderborg <peter.enderborg@sony.com> writes:
>>>>  
>>>>> This is the peer functions to task_rename and task_newtask.
>>>>> With this we get hole "life-cycle" of task and can easily
>>>>> see short livied task and their exit status.
>>>> This patch is incorrect.  The location you are dealing with is not part
>>>> of task exit.  The location you have instrumented is part of reaping a
>>>> task which can come arbitrarily long after the task exits.
>>> That is what it aiming. When using this as tool for userspace you
>>> would like to know when the task is done. When it no longer
>>> holds any thing that might have any impact. If you think the
>>> exit imply something more specific I can change the name.
>>>
>>> I thought exit was a good name, it is in in exit.c.
>>>
>>> Will the name task_done, task_finished or task_reaped work for you?
>> I think "task_reaped" is probably the best name, and the most
>> descriptive of what happened.
> What would it provide that is not already available through the "sched_process_free"
> tracepoint in delayed_put_task_struct ?

For task_exit (or task_reaped)

        field:pid_t pid;        offset:8;       size:4; signed:1;
        field:short oom_score_adj;      offset:12;      size:2; signed:1;
        field:int exit_signal;  offset:16;      size:4; signed:1;
        field:int exit_code;    offset:20;      size:4; signed:1;
        field:int exit_state;   offset:24;      size:4; signed:1;
        field:__data_loc char[] comm;   offset:28;      size:4; signed:1;

sched_process_free
        field:char comm[16];    offset:8;       size:16;        signed:1;
        field:pid_t pid;        offset:24;      size:4; signed:1;
        field:int prio; offset:28;      size:4; signed:1;

So information about oom_score_adj, and it's exit parameters.

And it is also gives information on kernel tasks. The task exit also
gives information about it's parent.   sched_process_free is "idle" as parent.

          sshd-7399    [002] d... 262223.258018: task_exit: pid=7400 oom_score_adj=0 exit_signal=17 exit_code=0 exit_state=0x10 comm=ls
            sshd-7395    [007] ..s. 262223.263429: sched_process_free: comm=ps pid=7428 prio=120
          <idle>-0       [005] ..s. 262223.263440: sched_process_free: comm=basename pid=7427 prio=120
            sshd-800     [007] d... 262223.263844: task_exit: pid=7395 oom_score_adj=0 exit_signal=17 exit_code=65280 exit_state=0x10 comm=sshd
         systemd-1       [001] d... 262223.264126: task_exit: pid=7399 oom_score_adj=0 exit_signal=17 exit_code=65280 exit_state=0x10 comm=sshd
          <idle>-0       [002] ..s. 262223.268439: sched_process_free: comm=ls pid=7400 prio=120
          <idle>-0       [007] ..s. 262223.272484: sched_process_free: comm=sshd pid=7395 prio=120
          <idle>-0       [001] ..s. 262223.273446: sched_process_free: comm=sshd pid=7399 prio=120

With the exit_signal you can also see that the process is gone, not only one of the thread that might not be the last.

               t-7734    [006] .... 263946.427704: task_newtask: pid=7736 comm=t clone_flags=3d0f00 oom_score_adj=0
               t-7734    [006] .... 263946.427778: task_newtask: pid=7737 comm=t clone_flags=3d0f00 oom_score_adj=0
               t-7735    [000] .... 263946.428138: task_newtask: pid=7738 comm=t clone_flags=3d0f00 oom_score_adj=0
               t-7736    [001] .... 263946.428387: task_newtask: pid=7739 comm=t clone_flags=3d0f00 oom_score_adj=0
               t-7737    [002] .... 263946.428515: task_newtask: pid=7740 comm=t clone_flags=3d0f00 oom_score_adj=0
           <...>-7740    [002] dN.. 263979.289647: task_exit: pid=7740 oom_score_adj=0 exit_signal=-1 exit_code=15 exit_state=0x10 comm=t
               t-7739    [001] dN.. 263979.289661: task_exit: pid=7739 oom_score_adj=0 exit_signal=-1 exit_code=15 exit_state=0x10 comm=t
               t-7738    [000] dN.. 263979.289682: task_exit: pid=7738 oom_score_adj=0 exit_signal=-1 exit_code=15 exit_state=0x10 comm=t
               t-7737    [002] dN.. 263979.289686: task_exit: pid=7737 oom_score_adj=0 exit_signal=-1 exit_code=15 exit_state=0x10 comm=t
               t-7736    [001] dN.. 263979.289690: task_exit: pid=7736 oom_score_adj=0 exit_signal=-1 exit_code=15 exit_state=0x10 comm=t
               t-7735    [000] dN.. 263979.289702: task_exit: pid=7735 oom_score_adj=0 exit_signal=-1 exit_code=15 exit_state=0x10 comm=t
            bash-7455    [005] d... 263979.289851: task_exit: pid=7734 oom_score_adj=0 exit_signal=17 exit_code=15 exit_state=0x10 comm=t
           <...>-740     [005] ..s. 263979.384477: sched_process_free: comm=t pid=7734 prio=120
          <idle>-0       [000] ..s. 263979.384481: sched_process_free: comm=t pid=7738 prio=120
          <idle>-0       [000] ..s. 263979.384488: sched_process_free: comm=t pid=7735 prio=120
          <idle>-0       [001] ..s. 263979.385482: sched_process_free: comm=t pid=7739 prio=120
          <idle>-0       [001] ..s. 263979.385496: sched_process_free: comm=t pid=7736 prio=120
          <idle>-0       [002] ..s. 263979.489469: sched_process_free: comm=t pid=7740 prio=120
          <idle>-0       [002] ..s. 263979.489490: sched_process_free: comm=t pid=7737 prio=120

Not also the order.

> Thanks,
>
> Mathieu
>
Thanks

Peter

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/2] tracing: Add a trace for task_exit
  2021-05-03 14:48           ` Peter.Enderborg
@ 2021-05-03 16:30             ` Eric W. Biederman
  2021-05-03 18:04               ` Peter.Enderborg
  0 siblings, 1 reply; 15+ messages in thread
From: Eric W. Biederman @ 2021-05-03 16:30 UTC (permalink / raw)
  To: Peter.Enderborg
  Cc: mathieu.desnoyers, rostedt, linux-kernel, mingo, akpm, peterz,
	ast, christian.brauner, dave, walken, jannh, christophe.leroy,
	minchan

<Peter.Enderborg@sony.com> writes:

> On 5/3/21 3:50 PM, Mathieu Desnoyers wrote:
>> ----- On May 1, 2021, at 9:11 AM, rostedt rostedt@goodmis.org wrote:
>>
>>> On Sat, 1 May 2021 09:29:41 +0000
>>> <Peter.Enderborg@sony.com> wrote:
>>>
>>>> On 4/30/21 7:48 PM, Eric W. Biederman wrote:
>>>>> Peter Enderborg <peter.enderborg@sony.com> writes:
>>>>>  
>>>>>> This is the peer functions to task_rename and task_newtask.
>>>>>> With this we get hole "life-cycle" of task and can easily
>>>>>> see short livied task and their exit status.
>>>>> This patch is incorrect.  The location you are dealing with is not part
>>>>> of task exit.  The location you have instrumented is part of reaping a
>>>>> task which can come arbitrarily long after the task exits.
>>>> That is what it aiming. When using this as tool for userspace you
>>>> would like to know when the task is done. When it no longer
>>>> holds any thing that might have any impact. If you think the
>>>> exit imply something more specific I can change the name.
>>>>
>>>> I thought exit was a good name, it is in in exit.c.
>>>>
>>>> Will the name task_done, task_finished or task_reaped work for you?
>>> I think "task_reaped" is probably the best name, and the most
>>> descriptive of what happened.
>> What would it provide that is not already available through the "sched_process_free"
>> tracepoint in delayed_put_task_struct ?
>
> For task_exit (or task_reaped)
>
>         field:pid_t pid;        offset:8;       size:4; signed:1;
>         field:short oom_score_adj;      offset:12;      size:2; signed:1;
>         field:int exit_signal;  offset:16;      size:4; signed:1;
>         field:int exit_code;    offset:20;      size:4; signed:1;
>         field:int exit_state;   offset:24;      size:4; signed:1;
>         field:__data_loc char[] comm;   offset:28;      size:4; signed:1;
>
> sched_process_free
>         field:char comm[16];    offset:8;       size:16;        signed:1;
>         field:pid_t pid;        offset:24;      size:4; signed:1;
>         field:int prio; offset:28;      size:4; signed:1;
>
> So information about oom_score_adj, and it's exit parameters.


For the record returning oom_score_adj that late is not appropriate for
any kernel/user API.  It is perfectly valid for the kernel to optimize
out anything that wait(2) does not return.

If you want oom_score_adj you probably need to sample it in
sched_process_exit.

I periodically move things from the point a process is reaped to the
point where a task stops running, for both correctness and for simpler
maintenance.  When threads were added a bunch of cleanup was added
to the wrong place.  I certainly would not hesitate to mess with
oom_score_adj if changing something would make the code simpler.

With both sched_process_free and sched_process_exit it looks like we
already have tracepoints everywhere they could be needed.
task exit.

Eric

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/2] tracing: Add a trace for task_exit
  2021-05-03 16:30             ` Eric W. Biederman
@ 2021-05-03 18:04               ` Peter.Enderborg
  2021-05-03 19:02                 ` Eric W. Biederman
  0 siblings, 1 reply; 15+ messages in thread
From: Peter.Enderborg @ 2021-05-03 18:04 UTC (permalink / raw)
  To: ebiederm
  Cc: mathieu.desnoyers, rostedt, linux-kernel, mingo, akpm, peterz,
	ast, christian.brauner, dave, walken, jannh, christophe.leroy,
	minchan

On 5/3/21 6:30 PM, Eric W. Biederman wrote:
> <Peter.Enderborg@sony.com> writes:
>
>> On 5/3/21 3:50 PM, Mathieu Desnoyers wrote:
>>> ----- On May 1, 2021, at 9:11 AM, rostedt rostedt@goodmis.org wrote:
>>>
>>>> On Sat, 1 May 2021 09:29:41 +0000
>>>> <Peter.Enderborg@sony.com> wrote:
>>>>
>>>>> On 4/30/21 7:48 PM, Eric W. Biederman wrote:
>>>>>> Peter Enderborg <peter.enderborg@sony.com> writes:
>>>>>>  
>>>>>>> This is the peer functions to task_rename and task_newtask.
>>>>>>> With this we get hole "life-cycle" of task and can easily
>>>>>>> see short livied task and their exit status.
>>>>>> This patch is incorrect.  The location you are dealing with is not part
>>>>>> of task exit.  The location you have instrumented is part of reaping a
>>>>>> task which can come arbitrarily long after the task exits.
>>>>> That is what it aiming. When using this as tool for userspace you
>>>>> would like to know when the task is done. When it no longer
>>>>> holds any thing that might have any impact. If you think the
>>>>> exit imply something more specific I can change the name.
>>>>>
>>>>> I thought exit was a good name, it is in in exit.c.
>>>>>
>>>>> Will the name task_done, task_finished or task_reaped work for you?
>>>> I think "task_reaped" is probably the best name, and the most
>>>> descriptive of what happened.
>>> What would it provide that is not already available through the "sched_process_free"
>>> tracepoint in delayed_put_task_struct ?
>> For task_exit (or task_reaped)
>>
>>         field:pid_t pid;        offset:8;       size:4; signed:1;
>>         field:short oom_score_adj;      offset:12;      size:2; signed:1;
>>         field:int exit_signal;  offset:16;      size:4; signed:1;
>>         field:int exit_code;    offset:20;      size:4; signed:1;
>>         field:int exit_state;   offset:24;      size:4; signed:1;
>>         field:__data_loc char[] comm;   offset:28;      size:4; signed:1;
>>
>> sched_process_free
>>         field:char comm[16];    offset:8;       size:16;        signed:1;
>>         field:pid_t pid;        offset:24;      size:4; signed:1;
>>         field:int prio; offset:28;      size:4; signed:1;
>>
>> So information about oom_score_adj, and it's exit parameters.
>
> For the record returning oom_score_adj that late is not appropriate for
> any kernel/user API.  It is perfectly valid for the kernel to optimize
> out anything that wait(2) does not return.
>
> If you want oom_score_adj you probably need to sample it in
> sched_process_exit.
That I don't understand why?  oom_score_adj is part of the signal,
why is that not intact when we run __exit_signal ?


> I periodically move things from the point a process is reaped to the
> point where a task stops running, for both correctness and for simpler
> maintenance.  When threads were added a bunch of cleanup was added
> to the wrong place.  I certainly would not hesitate to mess with
> oom_score_adj if changing something would make the code simpler.
>
> With both sched_process_free and sched_process_exit it looks like we
> already have tracepoints everywhere they could be needed.
> task exit.
>
> Eric

It might be where we it is needed, but it does not contain information that
are needed for userspace. I don't see this as tool for sched issues,
but ading information to existing ones is of course a option.

However current traces is template based, and I assume it wont be popular to add new fields to the template,
and exit reasons is not right for the other template use cases.

I still see a "new" task moving it to do_exit make trace name more correct?  Or is trace_task_do_exit better?


Thanks

Peter

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/2] tracing: Add a trace for task_exit
  2021-05-03 18:04               ` Peter.Enderborg
@ 2021-05-03 19:02                 ` Eric W. Biederman
  2021-05-03 19:43                   ` Peter.Enderborg
  2021-05-03 20:55                   ` Steven Rostedt
  0 siblings, 2 replies; 15+ messages in thread
From: Eric W. Biederman @ 2021-05-03 19:02 UTC (permalink / raw)
  To: Peter.Enderborg
  Cc: mathieu.desnoyers, rostedt, linux-kernel, mingo, akpm, peterz,
	ast, christian.brauner, dave, walken, jannh, christophe.leroy,
	minchan

<Peter.Enderborg@sony.com> writes:

> On 5/3/21 6:30 PM, Eric W. Biederman wrote:
>> <Peter.Enderborg@sony.com> writes:
>>
>>> On 5/3/21 3:50 PM, Mathieu Desnoyers wrote:
>>>> ----- On May 1, 2021, at 9:11 AM, rostedt rostedt@goodmis.org wrote:
>>>>
>>>>> On Sat, 1 May 2021 09:29:41 +0000
>>>>> <Peter.Enderborg@sony.com> wrote:
>>>>>
>>>>>> On 4/30/21 7:48 PM, Eric W. Biederman wrote:
>>>>>>> Peter Enderborg <peter.enderborg@sony.com> writes:
>>>>>>>  
>>>>>>>> This is the peer functions to task_rename and task_newtask.
>>>>>>>> With this we get hole "life-cycle" of task and can easily
>>>>>>>> see short livied task and their exit status.
>>>>>>> This patch is incorrect.  The location you are dealing with is not part
>>>>>>> of task exit.  The location you have instrumented is part of reaping a
>>>>>>> task which can come arbitrarily long after the task exits.
>>>>>> That is what it aiming. When using this as tool for userspace you
>>>>>> would like to know when the task is done. When it no longer
>>>>>> holds any thing that might have any impact. If you think the
>>>>>> exit imply something more specific I can change the name.
>>>>>>
>>>>>> I thought exit was a good name, it is in in exit.c.
>>>>>>
>>>>>> Will the name task_done, task_finished or task_reaped work for you?
>>>>> I think "task_reaped" is probably the best name, and the most
>>>>> descriptive of what happened.
>>>> What would it provide that is not already available through the "sched_process_free"
>>>> tracepoint in delayed_put_task_struct ?
>>> For task_exit (or task_reaped)
>>>
>>>         field:pid_t pid;        offset:8;       size:4; signed:1;
>>>         field:short oom_score_adj;      offset:12;      size:2; signed:1;
>>>         field:int exit_signal;  offset:16;      size:4; signed:1;
>>>         field:int exit_code;    offset:20;      size:4; signed:1;
>>>         field:int exit_state;   offset:24;      size:4; signed:1;
>>>         field:__data_loc char[] comm;   offset:28;      size:4; signed:1;
>>>
>>> sched_process_free
>>>         field:char comm[16];    offset:8;       size:16;        signed:1;
>>>         field:pid_t pid;        offset:24;      size:4; signed:1;
>>>         field:int prio; offset:28;      size:4; signed:1;
>>>
>>> So information about oom_score_adj, and it's exit parameters.
>>
>> For the record returning oom_score_adj that late is not appropriate for
>> any kernel/user API.  It is perfectly valid for the kernel to optimize
>> out anything that wait(2) does not return.
>>
>> If you want oom_score_adj you probably need to sample it in
>> sched_process_exit.
> That I don't understand why?  oom_score_adj is part of the signal,
> why is that not intact when we run __exit_signal ?

Yes oom_score_adj lives in struct signal.  The naming of __exit_signal
is simply historical at this point, not descriptive.

The function of oom_score_adj is to tell the oom kill how aggressive to
be when oom_killing functions.  That stops being relevant as soon as
PF_EXITING gets set.

An optimization I have toyed with that would be completely relevant
is to have a very minimal struct zombie that would contain just the
information that wait(2) needs.  Everything else about the process
can be freed when the process actually stops running.

It would make no sense to include oom_score_adj in such a struct zombie.

As such it makes it very bad choice to place oom_score_adj userspace API
that triggers when a task is reaped.

>> I periodically move things from the point a process is reaped to the
>> point where a task stops running, for both correctness and for simpler
>> maintenance.  When threads were added a bunch of cleanup was added
>> to the wrong place.  I certainly would not hesitate to mess with
>> oom_score_adj if changing something would make the code simpler.
>>
>> With both sched_process_free and sched_process_exit it looks like we
>> already have tracepoints everywhere they could be needed.
>> task exit.
>
> It might be where we it is needed, but it does not contain information that
> are needed for userspace. I don't see this as tool for sched issues,
> but ading information to existing ones is of course a option.
>
> However current traces is template based, and I assume it wont be
> popular to add new fields to the template, and exit reasons is not
> right for the other template use cases.
>
> I still see a "new" task moving it to do_exit make trace name more
> correct?  Or is trace_task_do_exit better?

I really can't say, as I don't know much of anything about the tracing
infrastructure.  I would assume in most cases with a tracepoint in place
other kinds of tracing (like bpf programs) could come into play and read
out pieces of information that are not commonly wanted.

All I really know something about is the exit code path, as I keep
slowly trying to clean it up.  I plan on ignoring any tracepoint that
makes that gets in the way.

Eric

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/2] tracing: Add a trace for task_exit
  2021-05-03 19:02                 ` Eric W. Biederman
@ 2021-05-03 19:43                   ` Peter.Enderborg
  2021-05-03 20:55                   ` Steven Rostedt
  1 sibling, 0 replies; 15+ messages in thread
From: Peter.Enderborg @ 2021-05-03 19:43 UTC (permalink / raw)
  To: ebiederm
  Cc: mathieu.desnoyers, rostedt, linux-kernel, mingo, akpm, peterz,
	ast, christian.brauner, dave, walken, jannh, christophe.leroy,
	minchan

On 5/3/21 9:02 PM, Eric W. Biederman wrote:
> <Peter.Enderborg@sony.com> writes:
>
>> On 5/3/21 6:30 PM, Eric W. Biederman wrote:
>>> <Peter.Enderborg@sony.com> writes:
>>>
>>>> On 5/3/21 3:50 PM, Mathieu Desnoyers wrote:
>>>>> ----- On May 1, 2021, at 9:11 AM, rostedt rostedt@goodmis.org wrote:
>>>>>
>>>>>> On Sat, 1 May 2021 09:29:41 +0000
>>>>>> <Peter.Enderborg@sony.com> wrote:
>>>>>>
>>>>>>> On 4/30/21 7:48 PM, Eric W. Biederman wrote:
>>>>>>>> Peter Enderborg <peter.enderborg@sony.com> writes:
>>>>>>>>  
>>>>>>>>> This is the peer functions to task_rename and task_newtask.
>>>>>>>>> With this we get hole "life-cycle" of task and can easily
>>>>>>>>> see short livied task and their exit status.
>>>>>>>> This patch is incorrect.  The location you are dealing with is not part
>>>>>>>> of task exit.  The location you have instrumented is part of reaping a
>>>>>>>> task which can come arbitrarily long after the task exits.
>>>>>>> That is what it aiming. When using this as tool for userspace you
>>>>>>> would like to know when the task is done. When it no longer
>>>>>>> holds any thing that might have any impact. If you think the
>>>>>>> exit imply something more specific I can change the name.
>>>>>>>
>>>>>>> I thought exit was a good name, it is in in exit.c.
>>>>>>>
>>>>>>> Will the name task_done, task_finished or task_reaped work for you?
>>>>>> I think "task_reaped" is probably the best name, and the most
>>>>>> descriptive of what happened.
>>>>> What would it provide that is not already available through the "sched_process_free"
>>>>> tracepoint in delayed_put_task_struct ?
>>>> For task_exit (or task_reaped)
>>>>
>>>>         field:pid_t pid;        offset:8;       size:4; signed:1;
>>>>         field:short oom_score_adj;      offset:12;      size:2; signed:1;
>>>>         field:int exit_signal;  offset:16;      size:4; signed:1;
>>>>         field:int exit_code;    offset:20;      size:4; signed:1;
>>>>         field:int exit_state;   offset:24;      size:4; signed:1;
>>>>         field:__data_loc char[] comm;   offset:28;      size:4; signed:1;
>>>>
>>>> sched_process_free
>>>>         field:char comm[16];    offset:8;       size:16;        signed:1;
>>>>         field:pid_t pid;        offset:24;      size:4; signed:1;
>>>>         field:int prio; offset:28;      size:4; signed:1;
>>>>
>>>> So information about oom_score_adj, and it's exit parameters.
>>> For the record returning oom_score_adj that late is not appropriate for
>>> any kernel/user API.  It is perfectly valid for the kernel to optimize
>>> out anything that wait(2) does not return.
>>>
>>> If you want oom_score_adj you probably need to sample it in
>>> sched_process_exit.
>> That I don't understand why?  oom_score_adj is part of the signal,
>> why is that not intact when we run __exit_signal ?
> Yes oom_score_adj lives in struct signal.  The naming of __exit_signal
> is simply historical at this point, not descriptive.
>
> The function of oom_score_adj is to tell the oom kill how aggressive to
> be when oom_killing functions.  That stops being relevant as soon as
> PF_EXITING gets set.

Still it relevant for the case it was oom that killed a task or not. As I see it
it must be valid until the pid is removed in proc_flush_pid.

If it was part of a oom or not is different matter.  I have a oom trace
too, but I have not sent that yet.

> An optimization I have toyed with that would be completely relevant
> is to have a very minimal struct zombie that would contain just the
> information that wait(2) needs.  Everything else about the process
> can be freed when the process actually stops running.
>
> It would make no sense to include oom_score_adj in such a struct zombie.

But still you can do a call to the task_reaped in the fast path too?


> As such it makes it very bad choice to place oom_score_adj userspace API
> that triggers when a task is reaped.

if it is relevant for it's creation and renaming surely it must be
more relevant for it's killing since it is part of it's oom? However
this patch is not about oom, the oom_score_adj are used
as a prioritization in android when task are recycled. Having at as trace
parameter make it easy to monitor the activity among services.

Just select task 0-250 and you get the applications and services
that is currently used.


>
>>> I periodically move things from the point a process is reaped to the
>>> point where a task stops running, for both correctness and for simpler
>>> maintenance.  When threads were added a bunch of cleanup was added
>>> to the wrong place.  I certainly would not hesitate to mess with
>>> oom_score_adj if changing something would make the code simpler.
>>>
>>> With both sched_process_free and sched_process_exit it looks like we
>>> already have tracepoints everywhere they could be needed.
>>> task exit.
>> It might be where we it is needed, but it does not contain information that
>> are needed for userspace. I don't see this as tool for sched issues,
>> but ading information to existing ones is of course a option.
>>
>> However current traces is template based, and I assume it wont be
>> popular to add new fields to the template, and exit reasons is not
>> right for the other template use cases.
>>
>> I still see a "new" task moving it to do_exit make trace name more
>> correct?  Or is trace_task_do_exit better?
> I really can't say, as I don't know much of anything about the tracing
> infrastructure.  I would assume in most cases with a tracepoint in place
> other kinds of tracing (like bpf programs) could come into play and read
> out pieces of information that are not commonly wanted.
>
> All I really know something about is the exit code path, as I keep
> slowly trying to clean it up.  I plan on ignoring any tracepoint that
> makes that gets in the way.
>
> Eric


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/2] tracing: Add a trace for task_exit
  2021-05-03 19:02                 ` Eric W. Biederman
  2021-05-03 19:43                   ` Peter.Enderborg
@ 2021-05-03 20:55                   ` Steven Rostedt
  2021-05-04  8:00                     ` Peter.Enderborg
  1 sibling, 1 reply; 15+ messages in thread
From: Steven Rostedt @ 2021-05-03 20:55 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Peter.Enderborg, mathieu.desnoyers, linux-kernel, mingo, akpm,
	peterz, ast, christian.brauner, dave, walken, jannh,
	christophe.leroy, minchan

On Mon, 03 May 2021 14:02:48 -0500
ebiederm@xmission.com (Eric W. Biederman) wrote:

> > However current traces is template based, and I assume it wont be
> > popular to add new fields to the template, and exit reasons is not
> > right for the other template use cases.

trace events can always add fields, it's removing them that can cause
problems (but even then, it's not that bad). The new libtracefs and
libtraceevent make it trivial for tools to get the fields from trace events
when needed.

> >
> > I still see a "new" task moving it to do_exit make trace name more
> > correct?  Or is trace_task_do_exit better?  

It is also trivial with the libraries to write a tool that can put together
everything you want. We even are working on python bindings to connect to
these libraries where you could write a python script to do this.

There is no need for a new tracepoint, especially if it makes it harder to
improve the implementation of what is being traced.

> 
> I really can't say, as I don't know much of anything about the tracing
> infrastructure.  I would assume in most cases with a tracepoint in place
> other kinds of tracing (like bpf programs) could come into play and read
> out pieces of information that are not commonly wanted.
> 
> All I really know something about is the exit code path, as I keep
> slowly trying to clean it up.  I plan on ignoring any tracepoint that
> makes that gets in the way.

As you should.

-- Steve

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/2] tracing: Add a trace for task_exit
  2021-05-03 20:55                   ` Steven Rostedt
@ 2021-05-04  8:00                     ` Peter.Enderborg
  2021-05-04 13:18                       ` Steven Rostedt
  0 siblings, 1 reply; 15+ messages in thread
From: Peter.Enderborg @ 2021-05-04  8:00 UTC (permalink / raw)
  To: rostedt, ebiederm
  Cc: mathieu.desnoyers, linux-kernel, mingo, akpm, peterz, ast,
	christian.brauner, dave, walken, jannh, christophe.leroy,
	minchan

On 5/3/21 10:55 PM, Steven Rostedt wrote:
> On Mon, 03 May 2021 14:02:48 -0500
> ebiederm@xmission.com (Eric W. Biederman) wrote:
>
>>> However current traces is template based, and I assume it wont be
>>> popular to add new fields to the template, and exit reasons is not
>>> right for the other template use cases.
> trace events can always add fields, it's removing them that can cause
> problems (but even then, it's not that bad). The new libtracefs and
> libtraceevent make it trivial for tools to get the fields from trace events
> when needed.
>
>>> I still see a "new" task moving it to do_exit make trace name more
>>> correct?  Or is trace_task_do_exit better?  
> It is also trivial with the libraries to write a tool that can put together
> everything you want. We even are working on python bindings to connect to
> these libraries where you could write a python script to do this.

The bpftrace package are 163 MB install size and that is on
a system that already have python. Linux is very much used
on embedded system, having a shell is luxurious.


Trivial?

Concept

A eBpf program hook in to a tracepoint A & B and collect data.

A happens before B and send the collected data when B happen.

1 A is called:

2 C is created, C is destroyed.

3 B is called. How do I fetch C?

However I can make a ebpf that hooks in sched_process_free and
sched_process_exit use  the uapi version of bpf_get_current_task to pick up

oom_score_adj and exit_code.  However task definition is dependent on 71 ifdef's
not including object that is pointers that also might have build dependency
and some are there more than once.

I think kprobe will cause the same problem. It wont be that big deal if it
was a for kernel debugging. But this is for userspace and should not
have dependency on kernel internals.


> There is no need for a new tracepoint, especially if it makes it harder to
> improve the implementation of what is being traced.
It does not introduce any complex functionality, and with a other
mechanism i still believe you would need to reap the task somewhere.
But I guess it will be needed to add a exist status flag that is new,
but that is with or without a new tracepoint.

The python libs that uses this fetch the first item in the task_struct
and assume that it is thread_info. What could possible go wrong?

Is there a runtime linker in ebpf that resolves this by magic?

>
>> I really can't say, as I don't know much of anything about the tracing
>> infrastructure.  I would assume in most cases with a tracepoint in place
>> other kinds of tracing (like bpf programs) could come into play and read
>> out pieces of information that are not commonly wanted.
>>
>> All I really know something about is the exit code path, as I keep
>> slowly trying to clean it up.  I plan on ignoring any tracepoint that
>> makes that gets in the way.
> As you should.
>
> -- Steve


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/2] tracing: Add a trace for task_exit
  2021-05-04  8:00                     ` Peter.Enderborg
@ 2021-05-04 13:18                       ` Steven Rostedt
  0 siblings, 0 replies; 15+ messages in thread
From: Steven Rostedt @ 2021-05-04 13:18 UTC (permalink / raw)
  To: Peter.Enderborg
  Cc: ebiederm, mathieu.desnoyers, linux-kernel, mingo, akpm, peterz,
	ast, christian.brauner, dave, walken, jannh, christophe.leroy,
	minchan

On Tue, 4 May 2021 08:00:43 +0000
<Peter.Enderborg@sony.com> wrote:

> On 5/3/21 10:55 PM, Steven Rostedt wrote:
> > On Mon, 03 May 2021 14:02:48 -0500
> > ebiederm@xmission.com (Eric W. Biederman) wrote:
> >  
> >>> However current traces is template based, and I assume it wont be
> >>> popular to add new fields to the template, and exit reasons is not
> >>> right for the other template use cases.  
> > trace events can always add fields, it's removing them that can cause
> > problems (but even then, it's not that bad). The new libtracefs and
> > libtraceevent make it trivial for tools to get the fields from trace events
> > when needed.
> >  
> >>> I still see a "new" task moving it to do_exit make trace name more
> >>> correct?  Or is trace_task_do_exit better?    
> > It is also trivial with the libraries to write a tool that can put together
> > everything you want. We even are working on python bindings to connect to
> > these libraries where you could write a python script to do this.  
> 
> The bpftrace package are 163 MB install size and that is on

No idea, I never used bpftrace or even BPF for that matter.

> a system that already have python. Linux is very much used
> on embedded system, having a shell is luxurious.
> 
> 
> Trivial?
> 
> Concept
> 
> A eBpf program hook in to a tracepoint A & B and collect data.
> 
> A happens before B and send the collected data when B happen.
> 
> 1 A is called:
> 
> 2 C is created, C is destroyed.
> 
> 3 B is called. How do I fetch C?

In use space. I'm talking about writing a simple C program that uses
libtracefs. We are also adding python bindings to do the same in python
(which is what I meant by "python script").


> 
> However I can make a ebpf that hooks in sched_process_free and
> sched_process_exit use  the uapi version of bpf_get_current_task to pick up

No idea. As I said, I never used ebpf.

> 
> oom_score_adj and exit_code.  However task definition is dependent on 71 ifdef's
> not including object that is pointers that also might have build dependency
> and some are there more than once.
> 
> I think kprobe will cause the same problem. It wont be that big deal if it
> was a for kernel debugging. But this is for userspace and should not
> have dependency on kernel internals.

But tracepoints *are* kernel internals and userspace should not have
dependency on them.

> 
> 
> > There is no need for a new tracepoint, especially if it makes it harder to
> > improve the implementation of what is being traced.  
> It does not introduce any complex functionality, and with a other
> mechanism i still believe you would need to reap the task somewhere.
> But I guess it will be needed to add a exist status flag that is new,
> but that is with or without a new tracepoint.
> 
> The python libs that uses this fetch the first item in the task_struct
> and assume that it is thread_info. What could possible go wrong?

No idea what you mean by python fetching task_struct items.

> 
> Is there a runtime linker in ebpf that resolves this by magic?

No idea. Never used ebpf.

Have you tried extending the other trace events?

Not sure this would be accepted though. But that's another discussion to be
had.

-- Steve

diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 1eca2305ca42..83cc56174b75 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -307,16 +307,24 @@ DECLARE_EVENT_CLASS(sched_process_template,
 		__array(	char,	comm,	TASK_COMM_LEN	)
 		__field(	pid_t,	pid			)
 		__field(	int,	prio			)
+		__field(	short,	oom_score_adj		)
+		__field(	int,	exit_signal		)
+		__field(	int,	exit_state		)
 	),
 
 	TP_fast_assign(
 		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
 		__entry->pid		= p->pid;
 		__entry->prio		= p->prio; /* XXX SCHED_DEADLINE */
+		__entry->oom_score_adj	= p->signal ? p->signal->oom_score_adj : 0;
+		__entry->exit_signal	= p->exit_signal;
+		__entry->exit_state	= p->exit_state;
 	),
 
-	TP_printk("comm=%s pid=%d prio=%d",
-		  __entry->comm, __entry->pid, __entry->prio)
+	TP_printk("comm=%s pid=%d prio=%d oom=%d signal=%d state=0x%x",
+		  __entry->comm, __entry->pid, __entry->prio,
+		  __entry->oom_score_adj, __entry->exit_signal,
+		  __entry->exit_state)
 );
 
 /*

^ permalink raw reply related	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2021-05-04 13:18 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-04-30 14:22 [PATCH 0/2] tracing: Add trace for task_exit Peter Enderborg
2021-04-30 14:22 ` [PATCH 1/2] tracing: Add a " Peter Enderborg
2021-04-30 17:48   ` Eric W. Biederman
2021-05-01  9:29     ` Peter.Enderborg
2021-05-01 13:11       ` Steven Rostedt
2021-05-03 13:50         ` Mathieu Desnoyers
2021-05-03 14:48           ` Peter.Enderborg
2021-05-03 16:30             ` Eric W. Biederman
2021-05-03 18:04               ` Peter.Enderborg
2021-05-03 19:02                 ` Eric W. Biederman
2021-05-03 19:43                   ` Peter.Enderborg
2021-05-03 20:55                   ` Steven Rostedt
2021-05-04  8:00                     ` Peter.Enderborg
2021-05-04 13:18                       ` Steven Rostedt
2021-04-30 14:22 ` [PATCH 2/2] tracing: Align task.h to use __assing_str for strings Peter Enderborg

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.