* [RFC PATCH 1/2] sched/tracing: Don't re-read p->state when emitting sched_switch event
2021-11-29 12:35 [RFC PATCH 0/2] sched/tracing: sched_switch prev_state reported as TASK_RUNNING when it's not Valentin Schneider
@ 2021-11-29 12:36 ` Valentin Schneider
2021-12-08 20:12 ` Steven Rostedt
2021-11-29 12:36 ` [RFC PATCH 2/2] sched/tracing: Add TASK_RTLOCK_WAIT to TASK_REPORT Valentin Schneider
1 sibling, 1 reply; 7+ messages in thread
From: Valentin Schneider @ 2021-11-29 12:36 UTC (permalink / raw)
To: linux-kernel
Cc: Abhijeet Dharmapurikar, Uwe Kleine-König, Dietmar Eggemann,
Steven Rostedt, Peter Zijlstra, Ingo Molnar, Vincent Guittot,
Thomas Gleixner, Sebastian Andrzej Siewior, Juri Lelli,
Daniel Bristot de Oliveira, Kees Cook, Andrew Morton
As of commit
c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu")
the following sequence becomes possible:
p->__state = TASK_INTERRUPTIBLE;
__schedule()
deactivate_task(p);
ttwu()
READ !p->on_rq
p->__state=TASK_WAKING
trace_sched_switch()
__trace_sched_switch_state()
task_state_index()
return 0;
TASK_WAKING isn't in TASK_REPORT, so the task appears as TASK_RUNNING in
the trace event.
Prevent this by pushing the value read from __schedule() down the trace
event.
Reported-by: Abhijeet Dharmapurikar <adharmap@quicinc.com>
Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
---
include/linux/sched.h | 11 ++++++++---
include/trace/events/sched.h | 11 +++++++----
kernel/sched/core.c | 4 ++--
kernel/trace/fgraph.c | 4 +++-
kernel/trace/ftrace.c | 4 +++-
kernel/trace/trace_events.c | 8 ++++++--
kernel/trace/trace_sched_switch.c | 1 +
7 files changed, 30 insertions(+), 13 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d2e261adb8ea..d00837d12b9d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1616,10 +1616,10 @@ static inline pid_t task_pgrp_nr(struct task_struct *tsk)
#define TASK_REPORT_IDLE (TASK_REPORT + 1)
#define TASK_REPORT_MAX (TASK_REPORT_IDLE << 1)
-static inline unsigned int task_state_index(struct task_struct *tsk)
+static inline unsigned int __task_state_index(unsigned int tsk_state,
+ unsigned int tsk_exit_state)
{
- unsigned int tsk_state = READ_ONCE(tsk->__state);
- unsigned int state = (tsk_state | tsk->exit_state) & TASK_REPORT;
+ unsigned int state = (tsk_state | tsk_exit_state) & TASK_REPORT;
BUILD_BUG_ON_NOT_POWER_OF_2(TASK_REPORT_MAX);
@@ -1629,6 +1629,11 @@ static inline unsigned int task_state_index(struct task_struct *tsk)
return fls(state);
}
+static inline unsigned int task_state_index(struct task_struct *tsk)
+{
+ return __task_state_index(READ_ONCE(tsk->__state), tsk->exit_state);
+}
+
static inline char task_index_to_char(unsigned int state)
{
static const char state_char[] = "RSDTtXZPI";
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 94640482cfe7..65e786756321 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -187,7 +187,9 @@ DEFINE_EVENT(sched_wakeup_template, sched_wakeup_new,
TP_ARGS(p));
#ifdef CREATE_TRACE_POINTS
-static inline long __trace_sched_switch_state(bool preempt, struct task_struct *p)
+static inline long __trace_sched_switch_state(bool preempt,
+ unsigned int prev_state,
+ struct task_struct *p)
{
unsigned int state;
@@ -208,7 +210,7 @@ static inline long __trace_sched_switch_state(bool preempt, struct task_struct *
* it for left shift operation to get the correct task->state
* mapping.
*/
- state = task_state_index(p);
+ state = __task_state_index(prev_state, p->exit_state);
return state ? (1 << (state - 1)) : state;
}
@@ -220,10 +222,11 @@ static inline long __trace_sched_switch_state(bool preempt, struct task_struct *
TRACE_EVENT(sched_switch,
TP_PROTO(bool preempt,
+ unsigned int prev_state,
struct task_struct *prev,
struct task_struct *next),
- TP_ARGS(preempt, prev, next),
+ TP_ARGS(preempt, prev_state, prev, next),
TP_STRUCT__entry(
__array( char, prev_comm, TASK_COMM_LEN )
@@ -239,7 +242,7 @@ TRACE_EVENT(sched_switch,
memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
__entry->prev_pid = prev->pid;
__entry->prev_prio = prev->prio;
- __entry->prev_state = __trace_sched_switch_state(preempt, prev);
+ __entry->prev_state = __trace_sched_switch_state(preempt, prev_state, prev);
memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
__entry->next_pid = next->pid;
__entry->next_prio = next->prio;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index beaa8be6241e..91e6c33650ca 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4822,7 +4822,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
{
struct rq *rq = this_rq();
struct mm_struct *mm = rq->prev_mm;
- long prev_state;
+ unsigned int prev_state;
/*
* The previous task will have left us with a preempt_count of 2
@@ -6287,7 +6287,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
migrate_disable_switch(rq, prev);
psi_sched_switch(prev, next, !task_on_rq_queued(prev));
- trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next);
+ trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev_state, prev, next);
/* Also unlocks the rq: */
rq = context_switch(rq, prev, next, &rf);
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 22061d38fc00..19028e072cdb 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -415,7 +415,9 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
static void
ftrace_graph_probe_sched_switch(void *ignore, bool preempt,
- struct task_struct *prev, struct task_struct *next)
+ unsigned int prev_state,
+ struct task_struct *prev,
+ struct task_struct *next)
{
unsigned long long timestamp;
int index;
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 30bc880c3849..e296ddeec99f 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -7311,7 +7311,9 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops)
static void
ftrace_filter_pid_sched_switch_probe(void *data, bool preempt,
- struct task_struct *prev, struct task_struct *next)
+ unsigned int prev_state,
+ struct task_struct *prev,
+ struct task_struct *next)
{
struct trace_array *tr = data;
struct trace_pid_list *pid_list;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 4021b9a79f93..6ddc6cc0d5d5 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -759,7 +759,9 @@ void trace_event_follow_fork(struct trace_array *tr, bool enable)
static void
event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
- struct task_struct *prev, struct task_struct *next)
+ unsigned int prev_state,
+ struct task_struct *prev,
+ struct task_struct *next)
{
struct trace_array *tr = data;
struct trace_pid_list *no_pid_list;
@@ -783,7 +785,9 @@ event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
static void
event_filter_pid_sched_switch_probe_post(void *data, bool preempt,
- struct task_struct *prev, struct task_struct *next)
+ unsigned int prev_state,
+ struct task_struct *prev,
+ struct task_struct *next)
{
struct trace_array *tr = data;
struct trace_pid_list *no_pid_list;
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index e304196d7c28..993b0ed10d8c 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -22,6 +22,7 @@ static DEFINE_MUTEX(sched_register_mutex);
static void
probe_sched_switch(void *ignore, bool preempt,
+ unsigned int prev_state,
struct task_struct *prev, struct task_struct *next)
{
int flags;
--
2.25.1
^ permalink raw reply related [flat|nested] 7+ messages in thread
* [RFC PATCH 2/2] sched/tracing: Add TASK_RTLOCK_WAIT to TASK_REPORT
2021-11-29 12:35 [RFC PATCH 0/2] sched/tracing: sched_switch prev_state reported as TASK_RUNNING when it's not Valentin Schneider
2021-11-29 12:36 ` [RFC PATCH 1/2] sched/tracing: Don't re-read p->state when emitting sched_switch event Valentin Schneider
@ 2021-11-29 12:36 ` Valentin Schneider
2021-12-08 20:34 ` Steven Rostedt
2022-01-14 10:11 ` Sebastian Andrzej Siewior
1 sibling, 2 replies; 7+ messages in thread
From: Valentin Schneider @ 2021-11-29 12:36 UTC (permalink / raw)
To: linux-kernel
Cc: Abhijeet Dharmapurikar, Uwe Kleine-König, Dietmar Eggemann,
Steven Rostedt, Peter Zijlstra, Ingo Molnar, Vincent Guittot,
Thomas Gleixner, Sebastian Andrzej Siewior, Juri Lelli,
Daniel Bristot de Oliveira, Kees Cook, Andrew Morton
TASK_RTLOCK_WAIT currently isn't part of TASK_REPORT, thus a task blocking
on an rtlock will appear as having a task state == 0, IOW TASK_RUNNING.
The actual state is saved in p->saved_state, but reading it after reading
p->__state has a few issues:
o that could still be TASK_RUNNING in the case of e.g. rt_spin_lock
o ttwu_state_match() might have changed that to TASK_RUNNING
Add TASK_RTLOCK_WAIT to TASK_REPORT.
Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
---
fs/proc/array.c | 3 ++-
include/linux/sched.h | 17 +++++++++--------
include/trace/events/sched.h | 1 +
3 files changed, 12 insertions(+), 9 deletions(-)
diff --git a/fs/proc/array.c b/fs/proc/array.c
index ff869a66b34e..f4cae65529a6 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -128,9 +128,10 @@ static const char * const task_state_array[] = {
"X (dead)", /* 0x10 */
"Z (zombie)", /* 0x20 */
"P (parked)", /* 0x40 */
+ "L (rt-locked)", /* 0x80 */
/* states beyond TASK_REPORT: */
- "I (idle)", /* 0x80 */
+ "I (idle)", /* 0x100 */
};
static inline const char *get_task_state(struct task_struct *tsk)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d00837d12b9d..18fd77578dae 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -91,13 +91,14 @@ struct task_group;
#define EXIT_TRACE (EXIT_ZOMBIE | EXIT_DEAD)
/* Used in tsk->state again: */
#define TASK_PARKED 0x0040
-#define TASK_DEAD 0x0080
-#define TASK_WAKEKILL 0x0100
-#define TASK_WAKING 0x0200
-#define TASK_NOLOAD 0x0400
-#define TASK_NEW 0x0800
/* RT specific auxilliary flag to mark RT lock waiters */
-#define TASK_RTLOCK_WAIT 0x1000
+#define TASK_RTLOCK_WAIT 0x0080
+
+#define TASK_DEAD 0x0100
+#define TASK_WAKEKILL 0x0200
+#define TASK_WAKING 0x0400
+#define TASK_NOLOAD 0x0800
+#define TASK_NEW 0x1000
#define TASK_STATE_MAX 0x2000
/* Convenience macros for the sake of set_current_state: */
@@ -114,7 +115,7 @@ struct task_group;
#define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \
TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \
__TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \
- TASK_PARKED)
+ TASK_PARKED | TASK_RTLOCK_WAIT)
#define task_is_running(task) (READ_ONCE((task)->__state) == TASK_RUNNING)
@@ -1636,7 +1637,7 @@ static inline unsigned int task_state_index(struct task_struct *tsk)
static inline char task_index_to_char(unsigned int state)
{
- static const char state_char[] = "RSDTtXZPI";
+ static const char state_char[] = "RSDTtXZPLI";
BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != sizeof(state_char) - 1);
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 65e786756321..f86ec9af19ff 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -261,6 +261,7 @@ TRACE_EVENT(sched_switch,
{ EXIT_DEAD, "X" },
{ EXIT_ZOMBIE, "Z" },
{ TASK_PARKED, "P" },
+ { TASK_RTLOCK_WAIT, "L" },
{ TASK_DEAD, "I" }) :
"R",
--
2.25.1
^ permalink raw reply related [flat|nested] 7+ messages in thread