Re: [PATCH] sched: Add trace for task wake up latency and leave running time

* Re: [PATCH] sched: Add trace for task wake up latency and leave running time
  2020-08-21 23:59 [PATCH] sched: Add trace for task wake up latency and leave running time Dongjiu Geng
@ 2020-08-21 18:49 ` Peter Zijlstra
  0 siblings, 0 replies; 2+ messages in thread
From: Peter Zijlstra @ 2020-08-21 18:49 UTC (permalink / raw)
  To: Dongjiu Geng
  Cc: mingo, juri.lelli, vincent.guittot, dietmar.eggemann, rostedt,
	bsegall, mgorman, thara.gopinath, pauld, vincent.donnefort,
	rdunlap, linux-kernel, linuxarm

> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 93ecd930efd3..edb622c40a90 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1324,6 +1324,13 @@ struct task_struct {
>  	/* CPU-specific state of this task: */
>  	struct thread_struct		thread;
>  
> +	/* Task wake up time stamp */
> +	u64				ts_wakeup;
> +	/* Previous task switch out time stamp */
> +	u64				pre_ts_end;
> +	/* Next task switch in time stamp */
> +	u64				next_ts_start;
> +	bool				wakeup_state;
>  	/*
>  	 * WARNING: on x86, 'thread_struct' contains a variable-sized
>  	 * structure.  It *MUST* be at the end of 'task_struct'.

^^^ did you read that comment?

> diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
> index fec25b9cfbaf..e99c6d573a42 100644
> --- a/include/trace/events/sched.h
> +++ b/include/trace/events/sched.h
> @@ -183,6 +183,72 @@ TRACE_EVENT(sched_switch,
>  		__entry->next_comm, __entry->next_pid, __entry->next_prio)
>  );
>  
> +DECLARE_EVENT_CLASS(sched_latency_template,
> +
> +	TP_PROTO(bool preempt,
> +		 struct task_struct *prev,
> +		 struct task_struct *next,
> +		 u64 time),
> +
> +	TP_ARGS(preempt, prev, next, time),
> +
> +	TP_STRUCT__entry(
> +		__array(	char,	prev_comm,	TASK_COMM_LEN	)
> +		__field(	pid_t,	prev_pid			)
> +		__field(	int,	prev_prio			)
> +		__field(	long,	prev_state			)
> +		__array(	char,	next_comm,	TASK_COMM_LEN	)
> +		__field(	pid_t,	next_pid			)
> +		__field(	int,	next_prio			)
> +		__field(	u64,	time				)
> +	),
> +
> +	TP_fast_assign(
> +		memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
> +		__entry->prev_pid	= prev->pid;
> +		__entry->prev_prio	= prev->prio;
> +		__entry->prev_state	= __trace_sched_switch_state(preempt, prev);
> +		memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
> +		__entry->next_pid	= next->pid;
> +		__entry->next_prio	= next->prio;
> +		__entry->time		= time;
> +		/* XXX SCHED_DEADLINE */
> +	),
> +
> +	TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d passed time=%llu (ns)",
> +		__entry->prev_comm, __entry->prev_pid, __entry->prev_prio,
> +
> +		(__entry->prev_state & (TASK_REPORT_MAX - 1)) ?
> +		  __print_flags(__entry->prev_state & (TASK_REPORT_MAX - 1), "|",
> +				{ TASK_INTERRUPTIBLE, "S" },
> +				{ TASK_UNINTERRUPTIBLE, "D" },
> +				{ __TASK_STOPPED, "T" },
> +				{ __TASK_TRACED, "t" },
> +				{ EXIT_DEAD, "X" },
> +				{ EXIT_ZOMBIE, "Z" },
> +				{ TASK_PARKED, "P" },
> +				{ TASK_DEAD, "I" }) :
> +		  "R",
> +
> +		__entry->prev_state & TASK_REPORT_MAX ? "+" : "",
> +		__entry->next_comm, __entry->next_pid, __entry->next_prio,
> +		__entry->time)
> +);

NAK, that tracepoint is already broken, we don't want to proliferate the
broken.

> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 8471a0f7eb32..b5a1928dc948 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -2464,6 +2464,8 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
>  {
>  	check_preempt_curr(rq, p, wake_flags);
>  	p->state = TASK_RUNNING;
> +	p->ts_wakeup = local_clock();
> +	p->wakeup_state = true;
>  	trace_sched_wakeup(p);
>  
>  #ifdef CONFIG_SMP

NAK, userless overhead.

> @@ -2846,6 +2848,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
>  		success = 1;
>  		trace_sched_waking(p);
>  		p->state = TASK_RUNNING;
> +		p->ts_wakeup = local_clock();
> +		p->wakeup_state = true;
>  		trace_sched_wakeup(p);
>  		goto out;
>  	}

idem

> @@ -3355,6 +3362,8 @@ void wake_up_new_task(struct task_struct *p)
>  	post_init_entity_util_avg(p);
>  
>  	activate_task(rq, p, ENQUEUE_NOCLOCK);
> +	p->ts_wakeup = local_clock();
> +	p->wakeup_state = true;
>  	trace_sched_wakeup_new(p);
>  	check_preempt_curr(rq, p, WF_FORK);
>  #ifdef CONFIG_SMP

idem

> @@ -4521,8 +4530,19 @@ static void __sched notrace __schedule(bool preempt)
>  
>  		psi_sched_switch(prev, next, !task_on_rq_queued(prev));
>  
> +		next->next_ts_start = prev->pre_ts_end = local_clock();
> +
>  		trace_sched_switch(preempt, prev, next);
>  
> +		if (next->ts_wakeup && next->wakeup_state) {
> +			trace_sched_wakeup_latency(preempt, prev, next,
> +				next->next_ts_start - next->ts_wakeup);
> +			next->wakeup_state = false;
> +		}
> +
> +		trace_sched_leave_running_time(preempt, prev,
> +			next, next->next_ts_start - next->pre_ts_end);
> +
>  		/* Also unlocks the rq: */
>  		rq = context_switch(rq, prev, next, &rf);
>  	} else {

This really isn't ever going to happen.

^ permalink raw reply	[flat|nested] 2+ messages in thread