All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] sched/core: forced idle accounting
@ 2021-10-08  0:08 Josh Don
  2021-10-08 21:04 ` Josh Don
                   ` (5 more replies)
  0 siblings, 6 replies; 18+ messages in thread
From: Josh Don @ 2021-10-08  0:08 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra, Juri Lelli, Vincent Guittot
  Cc: Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Joel Fernandes, Vineeth Pillai,
	Hao Luo, linux-kernel, Josh Don

Adds accounting for "forced idle" time, which is time where a cookie'd
task forces its SMT sibling to idle, despite the presence of runnable
tasks.

Forced idle time is one means to measure the cost of enabling core
scheduling (ie. the capacity lost due to the need to force idle).

Signed-off-by: Josh Don <joshdon@google.com>
---
 include/linux/sched.h     |  1 +
 kernel/sched/core.c       | 46 ++++++++++++++++-----
 kernel/sched/core_sched.c | 85 ++++++++++++++++++++++++++++++++++++++-
 kernel/sched/debug.c      |  3 ++
 kernel/sched/sched.h      | 11 ++++-
 5 files changed, 135 insertions(+), 11 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2aa30e8e1440..d114ba350802 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -784,6 +784,7 @@ struct task_struct {
 	struct rb_node			core_node;
 	unsigned long			core_cookie;
 	unsigned int			core_occupation;
+	u64				core_forceidle_sum;
 #endif
 
 #ifdef CONFIG_CGROUP_SCHED
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ada028e579b0..baa4f48cacff 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -181,15 +181,23 @@ void sched_core_enqueue(struct rq *rq, struct task_struct *p)
 	rb_add(&p->core_node, &rq->core_tree, rb_sched_core_less);
 }
 
-void sched_core_dequeue(struct rq *rq, struct task_struct *p)
+void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags)
 {
 	rq->core->core_task_seq++;
 
-	if (!sched_core_enqueued(p))
-		return;
+	if (sched_core_enqueued(p)) {
+		rb_erase(&p->core_node, &rq->core_tree);
+		RB_CLEAR_NODE(&p->core_node);
+	}
 
-	rb_erase(&p->core_node, &rq->core_tree);
-	RB_CLEAR_NODE(&p->core_node);
+	/*
+	 * Migrating the last task off the cpu, with the cpu in forced idle
+	 * state. Reschedule to create an accounting edge for forced idle,
+	 * and re-examine whether the core is still in forced idle state.
+	 */
+	if (!(flags & DEQUEUE_SAVE) && rq->nr_running == 1 &&
+	    rq->core->core_forceidle && rq->curr == rq->idle)
+		resched_curr(rq);
 }
 
 /*
@@ -364,7 +372,8 @@ void sched_core_put(void)
 #else /* !CONFIG_SCHED_CORE */
 
 static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { }
-static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p) { }
+static inline void
+sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { }
 
 #endif /* CONFIG_SCHED_CORE */
 
@@ -2020,7 +2029,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 {
 	if (sched_core_enabled(rq))
-		sched_core_dequeue(rq, p);
+		sched_core_dequeue(rq, p, flags);
 
 	if (!(flags & DEQUEUE_NOCLOCK))
 		update_rq_clock(rq);
@@ -5256,6 +5265,7 @@ void scheduler_tick(void)
 	if (sched_feat(LATENCY_WARN))
 		resched_latency = cpu_resched_latency(rq);
 	calc_global_load_tick(rq);
+	sched_core_tick(rq);
 
 	rq_unlock(rq, &rf);
 
@@ -5668,6 +5678,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 	struct task_struct *next, *p, *max = NULL;
 	const struct cpumask *smt_mask;
 	bool fi_before = false;
+	bool core_clock_updated = (rq == rq->core);
 	unsigned long cookie;
 	int i, cpu, occ = 0;
 	struct rq *rq_i;
@@ -5721,9 +5732,15 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 	/* reset state */
 	rq->core->core_cookie = 0UL;
 	if (rq->core->core_forceidle) {
+		if (!core_clock_updated) {
+			update_rq_clock(rq->core);
+			core_clock_updated = true;
+		}
+		sched_core_account_forceidle(rq);
+		rq->core->core_forceidle = false;
+		rq->core->core_forceidle_start = 0;
 		need_sync = true;
 		fi_before = true;
-		rq->core->core_forceidle = false;
 	}
 
 	/*
@@ -5765,7 +5782,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 	for_each_cpu_wrap(i, smt_mask, cpu) {
 		rq_i = cpu_rq(i);
 
-		if (i != cpu)
+		if (i != cpu && (rq_i != rq->core || !core_clock_updated))
 			update_rq_clock(rq_i);
 
 		p = rq_i->core_pick = pick_task(rq_i);
@@ -5804,6 +5821,9 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 		}
 	}
 
+	if (rq->core->core_forceidle && cookie)
+		rq->core->core_forceidle_start = rq_clock(rq->core);
+
 	rq->core->core_pick_seq = rq->core->core_task_seq;
 	next = rq->core_pick;
 	rq->core_sched_seq = rq->core->core_pick_seq;
@@ -6051,6 +6071,13 @@ static void sched_core_cpu_deactivate(unsigned int cpu)
 	core_rq->core_forceidle     = rq->core_forceidle;
 	core_rq->core_forceidle_seq = rq->core_forceidle_seq;
 
+	/*
+	 * Accounting edge for forced idle is handled in pick_next_task().
+	 * Don't need another one here, since the hotplug thread shouldn't
+	 * have a cookie.
+	 */
+	core_rq->core_forceidle_start = 0;
+
 	/* install new leader */
 	for_each_cpu(t, smt_mask) {
 		rq = cpu_rq(t);
@@ -9424,6 +9451,7 @@ void __init sched_init(void)
 		rq->core_enabled = 0;
 		rq->core_tree = RB_ROOT;
 		rq->core_forceidle = false;
+		rq->core_forceidle_start = 0;
 
 		rq->core_cookie = 0UL;
 #endif
diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c
index 48ac72696012..aae4ac2ac7ec 100644
--- a/kernel/sched/core_sched.c
+++ b/kernel/sched/core_sched.c
@@ -73,7 +73,7 @@ static unsigned long sched_core_update_cookie(struct task_struct *p,
 
 	enqueued = sched_core_enqueued(p);
 	if (enqueued)
-		sched_core_dequeue(rq, p);
+		sched_core_dequeue(rq, p, DEQUEUE_SAVE);
 
 	old_cookie = p->core_cookie;
 	p->core_cookie = cookie;
@@ -85,6 +85,10 @@ static unsigned long sched_core_update_cookie(struct task_struct *p,
 	 * If task is currently running, it may not be compatible anymore after
 	 * the cookie change, so enter the scheduler on its CPU to schedule it
 	 * away.
+	 *
+	 * Note that it is possible that as a result of this cookie change, the
+	 * core has now entered/left forced idle state. Defer accounting to the
+	 * next scheduling edge, rather than always forcing a reschedule here.
 	 */
 	if (task_running(rq, p))
 		resched_curr(rq);
@@ -109,6 +113,7 @@ void sched_core_fork(struct task_struct *p)
 {
 	RB_CLEAR_NODE(&p->core_node);
 	p->core_cookie = sched_core_clone_cookie(current);
+	p->core_forceidle_sum = 0;
 }
 
 void sched_core_free(struct task_struct *p)
@@ -228,3 +233,81 @@ int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type,
 	return err;
 }
 
+/* REQUIRES: rq->core's clock recently updated. */
+void sched_core_account_forceidle(struct rq *rq)
+{
+	const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq));
+	unsigned int smt_count;
+	u64 delta, now = rq_clock(rq->core);
+	struct rq *rq_i;
+	struct task_struct *p;
+	int i;
+
+	lockdep_assert_rq_held(rq);
+
+	WARN_ON_ONCE(!rq->core->core_forceidle);
+
+	if (rq->core->core_forceidle_start == 0)
+		return;
+
+	delta = now - rq->core->core_forceidle_start;
+	if (unlikely((s64)delta <= 0))
+		return;
+
+	rq->core->core_forceidle_start = now;
+
+	/*
+	 * For larger SMT configurations, we need to scale the charged
+	 * forced idle amount since there can be more than one forced idle
+	 * sibling and more than one running cookied task.
+	 */
+	smt_count = cpumask_weight(smt_mask);
+	if (smt_count > 2) {
+		unsigned int nr_forced_idle = 0, nr_running = 0;
+
+		for_each_cpu(i, smt_mask) {
+			rq_i = cpu_rq(i);
+			p = rq_i->core_pick ?: rq_i->curr;
+
+			if (p != rq_i->idle)
+				nr_running++;
+			else if (rq_i->nr_running)
+				nr_forced_idle++;
+		}
+
+		if (WARN_ON_ONCE(!nr_running)) {
+			/* can't be forced idle without a running task */
+		} else {
+			delta *= nr_forced_idle;
+			delta /= nr_running;
+		}
+	}
+
+	for_each_cpu(i, smt_mask) {
+		rq_i = cpu_rq(i);
+		p = rq_i->core_pick ?: rq_i->curr;
+
+		if (!p->core_cookie)
+			continue;
+
+		p->core_forceidle_sum += delta;
+
+		/* Optimize for common case. */
+		if (smt_count == 2)
+			break;
+	}
+}
+
+void sched_core_tick(struct rq *rq)
+{
+	if (!sched_core_enabled(rq))
+		return;
+
+	if (!rq->core->core_forceidle)
+		return;
+
+	if (rq != rq->core)
+		update_rq_clock(rq->core);
+	sched_core_account_forceidle(rq);
+}
+
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 26fac5e28bc0..0fe6a1bb8b60 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -1045,6 +1045,9 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
 	__PS("uclamp.max", p->uclamp_req[UCLAMP_MAX].value);
 	__PS("effective uclamp.min", uclamp_eff_value(p, UCLAMP_MIN));
 	__PS("effective uclamp.max", uclamp_eff_value(p, UCLAMP_MAX));
+#endif
+#ifdef CONFIG_SCHED_CORE
+	PN(core_forceidle_sum);
 #endif
 	P(policy);
 	P(prio);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a00fc7057d97..4678b85754f2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1113,6 +1113,7 @@ struct rq {
 	unsigned long		core_cookie;
 	unsigned char		core_forceidle;
 	unsigned int		core_forceidle_seq;
+	u64			core_forceidle_start;
 #endif
 };
 
@@ -1253,11 +1254,15 @@ static inline bool sched_core_enqueued(struct task_struct *p)
 }
 
 extern void sched_core_enqueue(struct rq *rq, struct task_struct *p);
-extern void sched_core_dequeue(struct rq *rq, struct task_struct *p);
+extern void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags);
 
 extern void sched_core_get(void);
 extern void sched_core_put(void);
 
+extern void sched_core_account_forceidle(struct rq *rq);
+
+extern void sched_core_tick(struct rq *rq);
+
 #else /* !CONFIG_SCHED_CORE */
 
 static inline bool sched_core_enabled(struct rq *rq)
@@ -1300,6 +1305,10 @@ static inline bool sched_group_cookie_match(struct rq *rq,
 {
 	return true;
 }
+
+static inline void sched_core_account_forceidle(struct rq *rq) {}
+
+static inline void sched_core_tick(struct rq *rq) {}
 #endif /* CONFIG_SCHED_CORE */
 
 static inline void lockdep_assert_rq_held(struct rq *rq)
-- 
2.33.0.882.g93a45727a2-goog


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [PATCH] sched/core: forced idle accounting
  2021-10-08  0:08 [PATCH] sched/core: forced idle accounting Josh Don
@ 2021-10-08 21:04 ` Josh Don
  2021-10-09 15:54 ` Peter Zijlstra
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 18+ messages in thread
From: Josh Don @ 2021-10-08 21:04 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra, Juri Lelli, Vincent Guittot
  Cc: Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Joel Fernandes, Vineeth Pillai,
	Hao Luo, linux-kernel

On Thu, Oct 7, 2021 at 5:08 PM Josh Don <joshdon@google.com> wrote:
>
> @@ -6051,6 +6071,13 @@ static void sched_core_cpu_deactivate(unsigned int cpu)
>         core_rq->core_forceidle     = rq->core_forceidle;
>         core_rq->core_forceidle_seq = rq->core_forceidle_seq;
>
> +       /*
> +        * Accounting edge for forced idle is handled in pick_next_task().
> +        * Don't need another one here, since the hotplug thread shouldn't
> +        * have a cookie.
> +        */
> +       core_rq->core_forceidle_start = 0;
> +
>         /* install new leader */
>         for_each_cpu(t, smt_mask) {
>                 rq = cpu_rq(t);

Realized there needs to be a similar edge in sched_core_flip(). I'll
include that in a v2, after seeing if there are any other comments on
this patch.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] sched/core: forced idle accounting
  2021-10-08  0:08 [PATCH] sched/core: forced idle accounting Josh Don
  2021-10-08 21:04 ` Josh Don
@ 2021-10-09 15:54 ` Peter Zijlstra
  2021-10-12  0:12   ` Josh Don
  2021-10-09 18:11 ` Tao Zhou
                   ` (3 subsequent siblings)
  5 siblings, 1 reply; 18+ messages in thread
From: Peter Zijlstra @ 2021-10-09 15:54 UTC (permalink / raw)
  To: Josh Don
  Cc: Ingo Molnar, Juri Lelli, Vincent Guittot, Dietmar Eggemann,
	Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Joel Fernandes, Vineeth Pillai,
	Hao Luo, linux-kernel

On Thu, Oct 07, 2021 at 05:08:25PM -0700, Josh Don wrote:
> Adds accounting for "forced idle" time, which is time where a cookie'd
> task forces its SMT sibling to idle, despite the presence of runnable
> tasks.
> 
> Forced idle time is one means to measure the cost of enabling core
> scheduling (ie. the capacity lost due to the need to force idle).

It seems an excessive amount of code for what it says to do.

> +	smt_count = cpumask_weight(smt_mask);

That's a fairly expensive operation to find a number that's going the be
to same over and over and over...

> +	if (smt_count > 2) {
> +		unsigned int nr_forced_idle = 0, nr_running = 0;
> +
> +		for_each_cpu(i, smt_mask) {
> +			rq_i = cpu_rq(i);
> +			p = rq_i->core_pick ?: rq_i->curr;
> +
> +			if (p != rq_i->idle)
> +				nr_running++;
> +			else if (rq_i->nr_running)
> +				nr_forced_idle++;
> +		}
> +
> +		if (WARN_ON_ONCE(!nr_running)) {
> +			/* can't be forced idle without a running task */
> +		} else {
> +			delta *= nr_forced_idle;
> +			delta /= nr_running;
> +		}

Now the comment sayeth:

> +	/*
> +	 * For larger SMT configurations, we need to scale the charged
> +	 * forced idle amount since there can be more than one forced idle
> +	 * sibling and more than one running cookied task.
> +	 */

But why?

> +	}
> +
> +	for_each_cpu(i, smt_mask) {
> +		rq_i = cpu_rq(i);
> +		p = rq_i->core_pick ?: rq_i->curr;
> +
> +		if (!p->core_cookie)
> +			continue;
> +
> +		p->core_forceidle_sum += delta;
> +
> +		/* Optimize for common case. */
> +		if (smt_count == 2)
> +			break;
> +	}
> +}

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] sched/core: forced idle accounting
  2021-10-08  0:08 [PATCH] sched/core: forced idle accounting Josh Don
  2021-10-08 21:04 ` Josh Don
  2021-10-09 15:54 ` Peter Zijlstra
@ 2021-10-09 18:11 ` Tao Zhou
  2021-10-12  0:14   ` Josh Don
  2021-10-11 14:33 ` kernel test robot
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 18+ messages in thread
From: Tao Zhou @ 2021-10-09 18:11 UTC (permalink / raw)
  To: Josh Don
  Cc: Ingo Molnar, Peter Zijlstra, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Joel Fernandes, Vineeth Pillai,
	Hao Luo, linux-kernel, Tao Zhou

Hi Josh,

On Thu, Oct 07, 2021 at 05:08:25PM -0700, Josh Don wrote:
> Adds accounting for "forced idle" time, which is time where a cookie'd
> task forces its SMT sibling to idle, despite the presence of runnable
> tasks.
> 
> Forced idle time is one means to measure the cost of enabling core
> scheduling (ie. the capacity lost due to the need to force idle).
> 
> Signed-off-by: Josh Don <joshdon@google.com>
> ---
>  include/linux/sched.h     |  1 +
>  kernel/sched/core.c       | 46 ++++++++++++++++-----
>  kernel/sched/core_sched.c | 85 ++++++++++++++++++++++++++++++++++++++-
>  kernel/sched/debug.c      |  3 ++
>  kernel/sched/sched.h      | 11 ++++-
>  5 files changed, 135 insertions(+), 11 deletions(-)
> 
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 2aa30e8e1440..d114ba350802 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -784,6 +784,7 @@ struct task_struct {
>  	struct rb_node			core_node;
>  	unsigned long			core_cookie;
>  	unsigned int			core_occupation;
> +	u64				core_forceidle_sum;
>  #endif
>  
>  #ifdef CONFIG_CGROUP_SCHED
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index ada028e579b0..baa4f48cacff 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -181,15 +181,23 @@ void sched_core_enqueue(struct rq *rq, struct task_struct *p)
>  	rb_add(&p->core_node, &rq->core_tree, rb_sched_core_less);
>  }
>  
> -void sched_core_dequeue(struct rq *rq, struct task_struct *p)
> +void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags)
>  {
>  	rq->core->core_task_seq++;
>  
> -	if (!sched_core_enqueued(p))
> -		return;
> +	if (sched_core_enqueued(p)) {
> +		rb_erase(&p->core_node, &rq->core_tree);
> +		RB_CLEAR_NODE(&p->core_node);
> +	}
>  
> -	rb_erase(&p->core_node, &rq->core_tree);
> -	RB_CLEAR_NODE(&p->core_node);
> +	/*
> +	 * Migrating the last task off the cpu, with the cpu in forced idle
> +	 * state. Reschedule to create an accounting edge for forced idle,
> +	 * and re-examine whether the core is still in forced idle state.
> +	 */
> +	if (!(flags & DEQUEUE_SAVE) && rq->nr_running == 1 &&
> +	    rq->core->core_forceidle && rq->curr == rq->idle)
> +		resched_curr(rq);
>  }
>  
>  /*
> @@ -364,7 +372,8 @@ void sched_core_put(void)
>  #else /* !CONFIG_SCHED_CORE */
>  
>  static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { }
> -static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p) { }
> +static inline void
> +sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { }
>  
>  #endif /* CONFIG_SCHED_CORE */
>  
> @@ -2020,7 +2029,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
>  static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
>  {
>  	if (sched_core_enabled(rq))
> -		sched_core_dequeue(rq, p);
> +		sched_core_dequeue(rq, p, flags);
>  
>  	if (!(flags & DEQUEUE_NOCLOCK))
>  		update_rq_clock(rq);
> @@ -5256,6 +5265,7 @@ void scheduler_tick(void)
>  	if (sched_feat(LATENCY_WARN))
>  		resched_latency = cpu_resched_latency(rq);
>  	calc_global_load_tick(rq);
> +	sched_core_tick(rq);
>  
>  	rq_unlock(rq, &rf);
>  
> @@ -5668,6 +5678,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
>  	struct task_struct *next, *p, *max = NULL;
>  	const struct cpumask *smt_mask;
>  	bool fi_before = false;
> +	bool core_clock_updated = (rq == rq->core);
>  	unsigned long cookie;
>  	int i, cpu, occ = 0;
>  	struct rq *rq_i;
> @@ -5721,9 +5732,15 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
>  	/* reset state */
>  	rq->core->core_cookie = 0UL;
>  	if (rq->core->core_forceidle) {
> +		if (!core_clock_updated) {
> +			update_rq_clock(rq->core);
> +			core_clock_updated = true;
> +		}
> +		sched_core_account_forceidle(rq);
> +		rq->core->core_forceidle = false;
> +		rq->core->core_forceidle_start = 0;
>  		need_sync = true;
>  		fi_before = true;
> -		rq->core->core_forceidle = false;
>  	}
>  
>  	/*
> @@ -5765,7 +5782,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
>  	for_each_cpu_wrap(i, smt_mask, cpu) {
>  		rq_i = cpu_rq(i);
>  
> -		if (i != cpu)
> +		if (i != cpu && (rq_i != rq->core || !core_clock_updated))
>  			update_rq_clock(rq_i);
>  
>  		p = rq_i->core_pick = pick_task(rq_i);
> @@ -5804,6 +5821,9 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
>  		}
>  	}
>  
> +	if (rq->core->core_forceidle && cookie)
> +		rq->core->core_forceidle_start = rq_clock(rq->core);
> +
>  	rq->core->core_pick_seq = rq->core->core_task_seq;
>  	next = rq->core_pick;
>  	rq->core_sched_seq = rq->core->core_pick_seq;
> @@ -6051,6 +6071,13 @@ static void sched_core_cpu_deactivate(unsigned int cpu)
>  	core_rq->core_forceidle     = rq->core_forceidle;
>  	core_rq->core_forceidle_seq = rq->core_forceidle_seq;
>  
> +	/*
> +	 * Accounting edge for forced idle is handled in pick_next_task().
> +	 * Don't need another one here, since the hotplug thread shouldn't
> +	 * have a cookie.
> +	 */
> +	core_rq->core_forceidle_start = 0;
> +
>  	/* install new leader */
>  	for_each_cpu(t, smt_mask) {
>  		rq = cpu_rq(t);
> @@ -9424,6 +9451,7 @@ void __init sched_init(void)
>  		rq->core_enabled = 0;
>  		rq->core_tree = RB_ROOT;
>  		rq->core_forceidle = false;
> +		rq->core_forceidle_start = 0;
>  
>  		rq->core_cookie = 0UL;
>  #endif
> diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c
> index 48ac72696012..aae4ac2ac7ec 100644
> --- a/kernel/sched/core_sched.c
> +++ b/kernel/sched/core_sched.c
> @@ -73,7 +73,7 @@ static unsigned long sched_core_update_cookie(struct task_struct *p,
>  
>  	enqueued = sched_core_enqueued(p);
>  	if (enqueued)
> -		sched_core_dequeue(rq, p);
> +		sched_core_dequeue(rq, p, DEQUEUE_SAVE);
>  
>  	old_cookie = p->core_cookie;
>  	p->core_cookie = cookie;
> @@ -85,6 +85,10 @@ static unsigned long sched_core_update_cookie(struct task_struct *p,
>  	 * If task is currently running, it may not be compatible anymore after
>  	 * the cookie change, so enter the scheduler on its CPU to schedule it
>  	 * away.
> +	 *
> +	 * Note that it is possible that as a result of this cookie change, the
> +	 * core has now entered/left forced idle state. Defer accounting to the
> +	 * next scheduling edge, rather than always forcing a reschedule here.
>  	 */
>  	if (task_running(rq, p))
>  		resched_curr(rq);
> @@ -109,6 +113,7 @@ void sched_core_fork(struct task_struct *p)
>  {
>  	RB_CLEAR_NODE(&p->core_node);
>  	p->core_cookie = sched_core_clone_cookie(current);
> +	p->core_forceidle_sum = 0;
>  }
>  
>  void sched_core_free(struct task_struct *p)
> @@ -228,3 +233,81 @@ int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type,
>  	return err;
>  }
>  
> +/* REQUIRES: rq->core's clock recently updated. */
> +void sched_core_account_forceidle(struct rq *rq)
> +{
> +	const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq));
> +	unsigned int smt_count;
> +	u64 delta, now = rq_clock(rq->core);
> +	struct rq *rq_i;
> +	struct task_struct *p;
> +	int i;
> +
> +	lockdep_assert_rq_held(rq);
> +
> +	WARN_ON_ONCE(!rq->core->core_forceidle);
> +
> +	if (rq->core->core_forceidle_start == 0)
> +		return;
> +
> +	delta = now - rq->core->core_forceidle_start;
> +	if (unlikely((s64)delta <= 0))
> +		return;
> +
> +	rq->core->core_forceidle_start = now;
> +
> +	/*
> +	 * For larger SMT configurations, we need to scale the charged
> +	 * forced idle amount since there can be more than one forced idle
> +	 * sibling and more than one running cookied task.
> +	 */
> +	smt_count = cpumask_weight(smt_mask);
> +	if (smt_count > 2) {
> +		unsigned int nr_forced_idle = 0, nr_running = 0;
> +
> +		for_each_cpu(i, smt_mask) {
> +			rq_i = cpu_rq(i);
> +			p = rq_i->core_pick ?: rq_i->curr;
> +
> +			if (p != rq_i->idle)
> +				nr_running++;
> +			else if (rq_i->nr_running)
> +				nr_forced_idle++;
> +		}
> +
> +		if (WARN_ON_ONCE(!nr_running)) {
> +			/* can't be forced idle without a running task */
> +		} else {
> +			delta *= nr_forced_idle;
> +			delta /= nr_running;
> +		}

Is it possible to use (smt_count - core_occupation) / core_occupation 
to evaluate this delta ?

> +	}
> +
> +	for_each_cpu(i, smt_mask) {
> +		rq_i = cpu_rq(i);
> +		p = rq_i->core_pick ?: rq_i->curr;
> +
> +		if (!p->core_cookie)
> +			continue;
> +
> +		p->core_forceidle_sum += delta;
> +
> +		/* Optimize for common case. */
> +		if (smt_count == 2)
> +			break;
> +	}
> +}
> +
> +void sched_core_tick(struct rq *rq)
> +{
> +	if (!sched_core_enabled(rq))
> +		return;
> +
> +	if (!rq->core->core_forceidle)
> +		return;
> +
> +	if (rq != rq->core)
> +		update_rq_clock(rq->core);
> +	sched_core_account_forceidle(rq);
> +}
> +
> diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
> index 26fac5e28bc0..0fe6a1bb8b60 100644
> --- a/kernel/sched/debug.c
> +++ b/kernel/sched/debug.c
> @@ -1045,6 +1045,9 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
>  	__PS("uclamp.max", p->uclamp_req[UCLAMP_MAX].value);
>  	__PS("effective uclamp.min", uclamp_eff_value(p, UCLAMP_MIN));
>  	__PS("effective uclamp.max", uclamp_eff_value(p, UCLAMP_MAX));
> +#endif
> +#ifdef CONFIG_SCHED_CORE
> +	PN(core_forceidle_sum);
>  #endif
>  	P(policy);
>  	P(prio);
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index a00fc7057d97..4678b85754f2 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1113,6 +1113,7 @@ struct rq {
>  	unsigned long		core_cookie;
>  	unsigned char		core_forceidle;
>  	unsigned int		core_forceidle_seq;
> +	u64			core_forceidle_start;
>  #endif
>  };
>  
> @@ -1253,11 +1254,15 @@ static inline bool sched_core_enqueued(struct task_struct *p)
>  }
>  
>  extern void sched_core_enqueue(struct rq *rq, struct task_struct *p);
> -extern void sched_core_dequeue(struct rq *rq, struct task_struct *p);
> +extern void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags);
>  
>  extern void sched_core_get(void);
>  extern void sched_core_put(void);
>  
> +extern void sched_core_account_forceidle(struct rq *rq);
> +
> +extern void sched_core_tick(struct rq *rq);
> +
>  #else /* !CONFIG_SCHED_CORE */
>  
>  static inline bool sched_core_enabled(struct rq *rq)
> @@ -1300,6 +1305,10 @@ static inline bool sched_group_cookie_match(struct rq *rq,
>  {
>  	return true;
>  }
> +
> +static inline void sched_core_account_forceidle(struct rq *rq) {}
> +
> +static inline void sched_core_tick(struct rq *rq) {}
>  #endif /* CONFIG_SCHED_CORE */
>  
>  static inline void lockdep_assert_rq_held(struct rq *rq)
> -- 
> 2.33.0.882.g93a45727a2-goog



Thanks,
Tao

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] sched/core: forced idle accounting
  2021-10-08  0:08 [PATCH] sched/core: forced idle accounting Josh Don
                   ` (2 preceding siblings ...)
  2021-10-09 18:11 ` Tao Zhou
@ 2021-10-11 14:33 ` kernel test robot
  2021-10-11 17:33 ` Hao Luo
  2021-10-11 18:08   ` kernel test robot
  5 siblings, 0 replies; 18+ messages in thread
From: kernel test robot @ 2021-10-11 14:33 UTC (permalink / raw)
  To: kbuild-all

[-- Attachment #1: Type: text/plain, Size: 3640 bytes --]

Hi Josh,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on tip/sched/core]
[also build test ERROR on tip/master next-20211011]
[cannot apply to linux/master linus/master v5.15-rc5]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Josh-Don/sched-core-forced-idle-accounting/20211008-095615
base:   https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 769fdf83df57b373660343ef4270b3ada91ef434
config: i386-randconfig-a004-20210930 (attached as .config)
compiler: gcc-9 (Debian 9.3.0-22) 9.3.0
reproduce (this is a W=1 build):
        # https://github.com/0day-ci/linux/commit/b2e2ae116ab03285ca899a48b9d23cbedab5ad0a
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Josh-Don/sched-core-forced-idle-accounting/20211008-095615
        git checkout b2e2ae116ab03285ca899a48b9d23cbedab5ad0a
        # save the attached .config to linux build tree
        mkdir build_dir
        make W=1 O=build_dir ARCH=i386 SHELL=/bin/bash

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   ld: kernel/sched/core_sched.o: in function `sched_core_account_forceidle':
>> kernel/sched/core_sched.c:282: undefined reference to `__udivdi3'


vim +282 kernel/sched/core_sched.c

   235	
   236	/* REQUIRES: rq->core's clock recently updated. */
   237	void sched_core_account_forceidle(struct rq *rq)
   238	{
   239		const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq));
   240		unsigned int smt_count;
   241		u64 delta, now = rq_clock(rq->core);
   242		struct rq *rq_i;
   243		struct task_struct *p;
   244		int i;
   245	
   246		lockdep_assert_rq_held(rq);
   247	
   248		WARN_ON_ONCE(!rq->core->core_forceidle);
   249	
   250		if (rq->core->core_forceidle_start == 0)
   251			return;
   252	
   253		delta = now - rq->core->core_forceidle_start;
   254		if (unlikely((s64)delta <= 0))
   255			return;
   256	
   257		rq->core->core_forceidle_start = now;
   258	
   259		/*
   260		 * For larger SMT configurations, we need to scale the charged
   261		 * forced idle amount since there can be more than one forced idle
   262		 * sibling and more than one running cookied task.
   263		 */
   264		smt_count = cpumask_weight(smt_mask);
   265		if (smt_count > 2) {
   266			unsigned int nr_forced_idle = 0, nr_running = 0;
   267	
   268			for_each_cpu(i, smt_mask) {
   269				rq_i = cpu_rq(i);
   270				p = rq_i->core_pick ?: rq_i->curr;
   271	
   272				if (p != rq_i->idle)
   273					nr_running++;
   274				else if (rq_i->nr_running)
   275					nr_forced_idle++;
   276			}
   277	
   278			if (WARN_ON_ONCE(!nr_running)) {
   279				/* can't be forced idle without a running task */
   280			} else {
   281				delta *= nr_forced_idle;
 > 282				delta /= nr_running;
   283			}
   284		}
   285	
   286		for_each_cpu(i, smt_mask) {
   287			rq_i = cpu_rq(i);
   288			p = rq_i->core_pick ?: rq_i->curr;
   289	
   290			if (!p->core_cookie)
   291				continue;
   292	
   293			p->core_forceidle_sum += delta;
   294	
   295			/* Optimize for common case. */
   296			if (smt_count == 2)
   297				break;
   298		}
   299	}
   300	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all(a)lists.01.org

[-- Attachment #2: config.gz --]
[-- Type: application/gzip, Size: 38798 bytes --]

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] sched/core: forced idle accounting
  2021-10-08  0:08 [PATCH] sched/core: forced idle accounting Josh Don
                   ` (3 preceding siblings ...)
  2021-10-11 14:33 ` kernel test robot
@ 2021-10-11 17:33 ` Hao Luo
  2021-10-12  0:31   ` Josh Don
  2021-10-11 18:08   ` kernel test robot
  5 siblings, 1 reply; 18+ messages in thread
From: Hao Luo @ 2021-10-11 17:33 UTC (permalink / raw)
  To: Josh Don
  Cc: Ingo Molnar, Peter Zijlstra, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Joel Fernandes, Vineeth Pillai,
	linux-kernel

On Thu, Oct 7, 2021 at 5:08 PM Josh Don <joshdon@google.com> wrote:
>
> Adds accounting for "forced idle" time, which is time where a cookie'd
> task forces its SMT sibling to idle, despite the presence of runnable
> tasks.
>
> Forced idle time is one means to measure the cost of enabling core
> scheduling (ie. the capacity lost due to the need to force idle).
>
> Signed-off-by: Josh Don <joshdon@google.com>
> ---
>  include/linux/sched.h     |  1 +
>  kernel/sched/core.c       | 46 ++++++++++++++++-----
>  kernel/sched/core_sched.c | 85 ++++++++++++++++++++++++++++++++++++++-
>  kernel/sched/debug.c      |  3 ++
>  kernel/sched/sched.h      | 11 ++++-
>  5 files changed, 135 insertions(+), 11 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 2aa30e8e1440..d114ba350802 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -784,6 +784,7 @@ struct task_struct {
>         struct rb_node                  core_node;
>         unsigned long                   core_cookie;
>         unsigned int                    core_occupation;
> +       u64                             core_forceidle_sum;
>  #endif
>
>  #ifdef CONFIG_CGROUP_SCHED
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index ada028e579b0..baa4f48cacff 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -181,15 +181,23 @@ void sched_core_enqueue(struct rq *rq, struct task_struct *p)
>         rb_add(&p->core_node, &rq->core_tree, rb_sched_core_less);
>  }
>
> -void sched_core_dequeue(struct rq *rq, struct task_struct *p)
> +void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags)
>  {
>         rq->core->core_task_seq++;
>
> -       if (!sched_core_enqueued(p))
> -               return;
> +       if (sched_core_enqueued(p)) {
> +               rb_erase(&p->core_node, &rq->core_tree);
> +               RB_CLEAR_NODE(&p->core_node);
> +       }
>
> -       rb_erase(&p->core_node, &rq->core_tree);
> -       RB_CLEAR_NODE(&p->core_node);
> +       /*
> +        * Migrating the last task off the cpu, with the cpu in forced idle
> +        * state. Reschedule to create an accounting edge for forced idle,
> +        * and re-examine whether the core is still in forced idle state.
> +        */
> +       if (!(flags & DEQUEUE_SAVE) && rq->nr_running == 1 &&
> +           rq->core->core_forceidle && rq->curr == rq->idle)
> +               resched_curr(rq);

Resched_curr is probably an unwanted side effect of dequeue. Maybe we
could extract the check and resched_curr out into a function, and call
the function outside of sched_core_dequeue(). In that way, the
interface of dequeue doesn't need to change.

>  }
>
>  /*
> @@ -364,7 +372,8 @@ void sched_core_put(void)
>  #else /* !CONFIG_SCHED_CORE */
>
>  static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { }
> -static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p) { }
> +static inline void
> +sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { }
>
>  #endif /* CONFIG_SCHED_CORE */
>
> @@ -2020,7 +2029,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
>  static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
>  {
>         if (sched_core_enabled(rq))
> -               sched_core_dequeue(rq, p);
> +               sched_core_dequeue(rq, p, flags);
>
>         if (!(flags & DEQUEUE_NOCLOCK))
>                 update_rq_clock(rq);
> @@ -5256,6 +5265,7 @@ void scheduler_tick(void)
>         if (sched_feat(LATENCY_WARN))
>                 resched_latency = cpu_resched_latency(rq);
>         calc_global_load_tick(rq);
> +       sched_core_tick(rq);
>
>         rq_unlock(rq, &rf);
>
> @@ -5668,6 +5678,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
>         struct task_struct *next, *p, *max = NULL;
>         const struct cpumask *smt_mask;
>         bool fi_before = false;
> +       bool core_clock_updated = (rq == rq->core);
>         unsigned long cookie;
>         int i, cpu, occ = 0;
>         struct rq *rq_i;
> @@ -5721,9 +5732,15 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
>         /* reset state */
>         rq->core->core_cookie = 0UL;
>         if (rq->core->core_forceidle) {
> +               if (!core_clock_updated) {
> +                       update_rq_clock(rq->core);
> +                       core_clock_updated = true;
> +               }
> +               sched_core_account_forceidle(rq);
> +               rq->core->core_forceidle = false;
> +               rq->core->core_forceidle_start = 0;
>                 need_sync = true;
>                 fi_before = true;
> -               rq->core->core_forceidle = false;
>         }
>
>         /*
> @@ -5765,7 +5782,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
>         for_each_cpu_wrap(i, smt_mask, cpu) {
>                 rq_i = cpu_rq(i);
>
> -               if (i != cpu)
> +               if (i != cpu && (rq_i != rq->core || !core_clock_updated))
>                         update_rq_clock(rq_i);

Do you mean (rq_i != rq->core && !core_clock_updated)? I thought
rq->core has core_clock updated always.

>
>                 p = rq_i->core_pick = pick_task(rq_i);
> @@ -5804,6 +5821,9 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
>                 }
>         }
>
> +       if (rq->core->core_forceidle && cookie)
> +               rq->core->core_forceidle_start = rq_clock(rq->core);
> +
>         rq->core->core_pick_seq = rq->core->core_task_seq;
>         next = rq->core_pick;
>         rq->core_sched_seq = rq->core->core_pick_seq;
> @@ -6051,6 +6071,13 @@ static void sched_core_cpu_deactivate(unsigned int cpu)
>         core_rq->core_forceidle     = rq->core_forceidle;
>         core_rq->core_forceidle_seq = rq->core_forceidle_seq;
>
> +       /*
> +        * Accounting edge for forced idle is handled in pick_next_task().
> +        * Don't need another one here, since the hotplug thread shouldn't
> +        * have a cookie.
> +        */
> +       core_rq->core_forceidle_start = 0;
> +
>         /* install new leader */
>         for_each_cpu(t, smt_mask) {
>                 rq = cpu_rq(t);
> @@ -9424,6 +9451,7 @@ void __init sched_init(void)
>                 rq->core_enabled = 0;
>                 rq->core_tree = RB_ROOT;
>                 rq->core_forceidle = false;
> +               rq->core_forceidle_start = 0;
>
>                 rq->core_cookie = 0UL;
>  #endif
> diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c
> index 48ac72696012..aae4ac2ac7ec 100644
> --- a/kernel/sched/core_sched.c
> +++ b/kernel/sched/core_sched.c
> @@ -73,7 +73,7 @@ static unsigned long sched_core_update_cookie(struct task_struct *p,
>
>         enqueued = sched_core_enqueued(p);
>         if (enqueued)
> -               sched_core_dequeue(rq, p);
> +               sched_core_dequeue(rq, p, DEQUEUE_SAVE);
>
>         old_cookie = p->core_cookie;
>         p->core_cookie = cookie;
> @@ -85,6 +85,10 @@ static unsigned long sched_core_update_cookie(struct task_struct *p,
>          * If task is currently running, it may not be compatible anymore after
>          * the cookie change, so enter the scheduler on its CPU to schedule it
>          * away.
> +        *
> +        * Note that it is possible that as a result of this cookie change, the
> +        * core has now entered/left forced idle state. Defer accounting to the
> +        * next scheduling edge, rather than always forcing a reschedule here.
>          */
>         if (task_running(rq, p))
>                 resched_curr(rq);
> @@ -109,6 +113,7 @@ void sched_core_fork(struct task_struct *p)
>  {
>         RB_CLEAR_NODE(&p->core_node);
>         p->core_cookie = sched_core_clone_cookie(current);
> +       p->core_forceidle_sum = 0;
>  }
>
>  void sched_core_free(struct task_struct *p)
> @@ -228,3 +233,81 @@ int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type,
>         return err;
>  }
>
> +/* REQUIRES: rq->core's clock recently updated. */
> +void sched_core_account_forceidle(struct rq *rq)
> +{
> +       const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq));
> +       unsigned int smt_count;
> +       u64 delta, now = rq_clock(rq->core);
> +       struct rq *rq_i;
> +       struct task_struct *p;
> +       int i;
> +
> +       lockdep_assert_rq_held(rq);
> +
> +       WARN_ON_ONCE(!rq->core->core_forceidle);
> +
> +       if (rq->core->core_forceidle_start == 0)
> +               return;
> +
> +       delta = now - rq->core->core_forceidle_start;
> +       if (unlikely((s64)delta <= 0))
> +               return;
> +
> +       rq->core->core_forceidle_start = now;
> +
> +       /*
> +        * For larger SMT configurations, we need to scale the charged
> +        * forced idle amount since there can be more than one forced idle
> +        * sibling and more than one running cookied task.
> +        */
> +       smt_count = cpumask_weight(smt_mask);
> +       if (smt_count > 2) {
> +               unsigned int nr_forced_idle = 0, nr_running = 0;
> +
> +               for_each_cpu(i, smt_mask) {
> +                       rq_i = cpu_rq(i);
> +                       p = rq_i->core_pick ?: rq_i->curr;
> +
> +                       if (p != rq_i->idle)
> +                               nr_running++;
> +                       else if (rq_i->nr_running)
> +                               nr_forced_idle++;
> +               }
> +
> +               if (WARN_ON_ONCE(!nr_running)) {
> +                       /* can't be forced idle without a running task */
> +               } else {
> +                       delta *= nr_forced_idle;
> +                       delta /= nr_running;
> +               }
> +       }
> +
> +       for_each_cpu(i, smt_mask) {
> +               rq_i = cpu_rq(i);
> +               p = rq_i->core_pick ?: rq_i->curr;
> +
> +               if (!p->core_cookie)
> +                       continue;
> +
> +               p->core_forceidle_sum += delta;
> +
> +               /* Optimize for common case. */
> +               if (smt_count == 2)
> +                       break;
> +       }
> +}
> +
> +void sched_core_tick(struct rq *rq)
> +{
> +       if (!sched_core_enabled(rq))
> +               return;
> +
> +       if (!rq->core->core_forceidle)
> +               return;
> +
> +       if (rq != rq->core)
> +               update_rq_clock(rq->core);
> +       sched_core_account_forceidle(rq);
> +}
> +
> diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
> index 26fac5e28bc0..0fe6a1bb8b60 100644
> --- a/kernel/sched/debug.c
> +++ b/kernel/sched/debug.c
> @@ -1045,6 +1045,9 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
>         __PS("uclamp.max", p->uclamp_req[UCLAMP_MAX].value);
>         __PS("effective uclamp.min", uclamp_eff_value(p, UCLAMP_MIN));
>         __PS("effective uclamp.max", uclamp_eff_value(p, UCLAMP_MAX));
> +#endif
> +#ifdef CONFIG_SCHED_CORE
> +       PN(core_forceidle_sum);
>  #endif
>         P(policy);
>         P(prio);
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index a00fc7057d97..4678b85754f2 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1113,6 +1113,7 @@ struct rq {
>         unsigned long           core_cookie;
>         unsigned char           core_forceidle;
>         unsigned int            core_forceidle_seq;
> +       u64                     core_forceidle_start;
>  #endif
>  };
>
> @@ -1253,11 +1254,15 @@ static inline bool sched_core_enqueued(struct task_struct *p)
>  }
>
>  extern void sched_core_enqueue(struct rq *rq, struct task_struct *p);
> -extern void sched_core_dequeue(struct rq *rq, struct task_struct *p);
> +extern void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags);
>
>  extern void sched_core_get(void);
>  extern void sched_core_put(void);
>
> +extern void sched_core_account_forceidle(struct rq *rq);
> +
> +extern void sched_core_tick(struct rq *rq);
> +
>  #else /* !CONFIG_SCHED_CORE */
>
>  static inline bool sched_core_enabled(struct rq *rq)
> @@ -1300,6 +1305,10 @@ static inline bool sched_group_cookie_match(struct rq *rq,
>  {
>         return true;
>  }
> +
> +static inline void sched_core_account_forceidle(struct rq *rq) {}
> +
> +static inline void sched_core_tick(struct rq *rq) {}
>  #endif /* CONFIG_SCHED_CORE */
>
>  static inline void lockdep_assert_rq_held(struct rq *rq)
> --
> 2.33.0.882.g93a45727a2-goog
>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] sched/core: forced idle accounting
  2021-10-08  0:08 [PATCH] sched/core: forced idle accounting Josh Don
@ 2021-10-11 18:08   ` kernel test robot
  2021-10-09 15:54 ` Peter Zijlstra
                     ` (4 subsequent siblings)
  5 siblings, 0 replies; 18+ messages in thread
From: kernel test robot @ 2021-10-11 18:08 UTC (permalink / raw)
  To: Josh Don, Ingo Molnar, Peter Zijlstra, Juri Lelli, Vincent Guittot
  Cc: llvm, kbuild-all, Dietmar Eggemann, Steven Rostedt, Ben Segall,
	Mel Gorman, Daniel Bristot de Oliveira, Joel Fernandes

[-- Attachment #1: Type: text/plain, Size: 1949 bytes --]

Hi Josh,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on tip/sched/core]
[also build test ERROR on tip/master next-20211011]
[cannot apply to linux/master linus/master v5.15-rc5]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Josh-Don/sched-core-forced-idle-accounting/20211008-095615
base:   https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 769fdf83df57b373660343ef4270b3ada91ef434
config: i386-randconfig-a004-20211011 (attached as .config)
compiler: clang version 14.0.0 (https://github.com/llvm/llvm-project f95d9c95bbf4cf662b9a181245fc6dcede39f590)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/b2e2ae116ab03285ca899a48b9d23cbedab5ad0a
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Josh-Don/sched-core-forced-idle-accounting/20211008-095615
        git checkout b2e2ae116ab03285ca899a48b9d23cbedab5ad0a
        # save the attached .config to linux build tree
        mkdir build_dir
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=i386 SHELL=/bin/bash

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

>> ld.lld: error: undefined symbol: __udivdi3
   >>> referenced by core_sched.c:282 (kernel/sched/core_sched.c:282)
   >>>               sched/core_sched.o:(sched_core_account_forceidle) in archive kernel/built-in.a

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 29954 bytes --]

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] sched/core: forced idle accounting
@ 2021-10-11 18:08   ` kernel test robot
  0 siblings, 0 replies; 18+ messages in thread
From: kernel test robot @ 2021-10-11 18:08 UTC (permalink / raw)
  To: kbuild-all

[-- Attachment #1: Type: text/plain, Size: 1989 bytes --]

Hi Josh,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on tip/sched/core]
[also build test ERROR on tip/master next-20211011]
[cannot apply to linux/master linus/master v5.15-rc5]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Josh-Don/sched-core-forced-idle-accounting/20211008-095615
base:   https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 769fdf83df57b373660343ef4270b3ada91ef434
config: i386-randconfig-a004-20211011 (attached as .config)
compiler: clang version 14.0.0 (https://github.com/llvm/llvm-project f95d9c95bbf4cf662b9a181245fc6dcede39f590)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/b2e2ae116ab03285ca899a48b9d23cbedab5ad0a
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Josh-Don/sched-core-forced-idle-accounting/20211008-095615
        git checkout b2e2ae116ab03285ca899a48b9d23cbedab5ad0a
        # save the attached .config to linux build tree
        mkdir build_dir
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=i386 SHELL=/bin/bash

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

>> ld.lld: error: undefined symbol: __udivdi3
   >>> referenced by core_sched.c:282 (kernel/sched/core_sched.c:282)
   >>>               sched/core_sched.o:(sched_core_account_forceidle) in archive kernel/built-in.a

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all(a)lists.01.org

[-- Attachment #2: config.gz --]
[-- Type: application/gzip, Size: 29954 bytes --]

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] sched/core: forced idle accounting
  2021-10-09 15:54 ` Peter Zijlstra
@ 2021-10-12  0:12   ` Josh Don
  2021-10-12 12:27     ` Peter Zijlstra
  0 siblings, 1 reply; 18+ messages in thread
From: Josh Don @ 2021-10-12  0:12 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, Juri Lelli, Vincent Guittot, Dietmar Eggemann,
	Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Joel Fernandes, Vineeth Pillai,
	Hao Luo, linux-kernel

On Sat, Oct 9, 2021 at 8:55 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Thu, Oct 07, 2021 at 05:08:25PM -0700, Josh Don wrote:
> > Adds accounting for "forced idle" time, which is time where a cookie'd
> > task forces its SMT sibling to idle, despite the presence of runnable
> > tasks.
> >
> > Forced idle time is one means to measure the cost of enabling core
> > scheduling (ie. the capacity lost due to the need to force idle).
>
> It seems an excessive amount of code for what it says to do.

I think I can cut some of that down by simplifying the SMT>2 case :)

>
> > +     smt_count = cpumask_weight(smt_mask);
>
> That's a fairly expensive operation to find a number that's going the be
> to same over and over and over...

Per Tao's suggestion, the nr_running and nr_forced_idle can be
computed and cached in pick(). Then there won't be any extra overhead
here, other than a potential division when SMT>2.

> > +     if (smt_count > 2) {
> > +             unsigned int nr_forced_idle = 0, nr_running = 0;
> > +
> > +             for_each_cpu(i, smt_mask) {
> > +                     rq_i = cpu_rq(i);
> > +                     p = rq_i->core_pick ?: rq_i->curr;
> > +
> > +                     if (p != rq_i->idle)
> > +                             nr_running++;
> > +                     else if (rq_i->nr_running)
> > +                             nr_forced_idle++;
> > +             }
> > +
> > +             if (WARN_ON_ONCE(!nr_running)) {
> > +                     /* can't be forced idle without a running task */
> > +             } else {
> > +                     delta *= nr_forced_idle;
> > +                     delta /= nr_running;
> > +             }
>
> Now the comment sayeth:
>
> > +     /*
> > +      * For larger SMT configurations, we need to scale the charged
> > +      * forced idle amount since there can be more than one forced idle
> > +      * sibling and more than one running cookied task.
> > +      */
>
> But why?

We scale by the number of cpus actually forced idle, since we don't
want to falsely over or under charge forced idle time (defined
strictly as time where we have a runnable task but idle the cpu). The
more important scaling here though is the division over the number of
running entities. This is done so that the aggregate amount of forced
idle over some group of threads makes sense. Ie if we have a cpu with
SMT8, and a group of 7 threads sharing a cookie, we don't want to
accrue 7 units of forced idle time per unit time while the 8th SMT is
forced idle.

> > +     }
> > +
> > +     for_each_cpu(i, smt_mask) {
> > +             rq_i = cpu_rq(i);
> > +             p = rq_i->core_pick ?: rq_i->curr;
> > +
> > +             if (!p->core_cookie)
> > +                     continue;
> > +
> > +             p->core_forceidle_sum += delta;
> > +
> > +             /* Optimize for common case. */
> > +             if (smt_count == 2)
> > +                     break;
> > +     }
> > +}

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] sched/core: forced idle accounting
  2021-10-09 18:11 ` Tao Zhou
@ 2021-10-12  0:14   ` Josh Don
  0 siblings, 0 replies; 18+ messages in thread
From: Josh Don @ 2021-10-12  0:14 UTC (permalink / raw)
  To: Tao Zhou
  Cc: Ingo Molnar, Peter Zijlstra, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Joel Fernandes, Vineeth Pillai,
	Hao Luo, linux-kernel

On Sat, Oct 9, 2021 at 11:11 AM Tao Zhou <tao.zhou@linux.dev> wrote:
[snip]
> > +     smt_count = cpumask_weight(smt_mask);
> > +     if (smt_count > 2) {
> > +             unsigned int nr_forced_idle = 0, nr_running = 0;
> > +
> > +             for_each_cpu(i, smt_mask) {
> > +                     rq_i = cpu_rq(i);
> > +                     p = rq_i->core_pick ?: rq_i->curr;
> > +
> > +                     if (p != rq_i->idle)
> > +                             nr_running++;
> > +                     else if (rq_i->nr_running)
> > +                             nr_forced_idle++;
> > +             }
> > +
> > +             if (WARN_ON_ONCE(!nr_running)) {
> > +                     /* can't be forced idle without a running task */
> > +             } else {
> > +                     delta *= nr_forced_idle;
> > +                     delta /= nr_running;
> > +             }
>
> Is it possible to use (smt_count - core_occupation) / core_occupation
> to evaluate this delta ?

Good point, we can use the pre-computed info here. This will help
simplify this logic.

Thanks,
Josh

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] sched/core: forced idle accounting
  2021-10-11 17:33 ` Hao Luo
@ 2021-10-12  0:31   ` Josh Don
  2021-10-14 17:57     ` Hao Luo
  0 siblings, 1 reply; 18+ messages in thread
From: Josh Don @ 2021-10-12  0:31 UTC (permalink / raw)
  To: Hao Luo
  Cc: Ingo Molnar, Peter Zijlstra, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Joel Fernandes, Vineeth Pillai,
	linux-kernel

On Mon, Oct 11, 2021 at 10:33 AM Hao Luo <haoluo@google.com> wrote:
>
> On Thu, Oct 7, 2021 at 5:08 PM Josh Don <joshdon@google.com> wrote:
> > -void sched_core_dequeue(struct rq *rq, struct task_struct *p)
> > +void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags)
> >  {
> >         rq->core->core_task_seq++;
> >
> > -       if (!sched_core_enqueued(p))
> > -               return;
> > +       if (sched_core_enqueued(p)) {
> > +               rb_erase(&p->core_node, &rq->core_tree);
> > +               RB_CLEAR_NODE(&p->core_node);
> > +       }
> >
> > -       rb_erase(&p->core_node, &rq->core_tree);
> > -       RB_CLEAR_NODE(&p->core_node);
> > +       /*
> > +        * Migrating the last task off the cpu, with the cpu in forced idle
> > +        * state. Reschedule to create an accounting edge for forced idle,
> > +        * and re-examine whether the core is still in forced idle state.
> > +        */
> > +       if (!(flags & DEQUEUE_SAVE) && rq->nr_running == 1 &&
> > +           rq->core->core_forceidle && rq->curr == rq->idle)
> > +               resched_curr(rq);
>
> Resched_curr is probably an unwanted side effect of dequeue. Maybe we
> could extract the check and resched_curr out into a function, and call
> the function outside of sched_core_dequeue(). In that way, the
> interface of dequeue doesn't need to change.

This resched is an atypical case; normal load balancing won't steal
the last runnable task off a cpu. The main reasons this resched could
trigger are: migration due to affinity change, and migration due to
sched core doing a cookie_steal. Could bubble this up to
deactivate_task(), but seems less brittle to keep this in dequeue()
with the check against DEQUEUE_SAVE (since this creates an important
accounting edge). Thoughts?

> >         /*
> > @@ -5765,7 +5782,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> >         for_each_cpu_wrap(i, smt_mask, cpu) {
> >                 rq_i = cpu_rq(i);
> >
> > -               if (i != cpu)
> > +               if (i != cpu && (rq_i != rq->core || !core_clock_updated))
> >                         update_rq_clock(rq_i);
>
> Do you mean (rq_i != rq->core && !core_clock_updated)? I thought
> rq->core has core_clock updated always.

rq->clock is updated on entry to pick_next_task(). rq->core is only
updated if rq == rq->core, or if we've done the clock update for
rq->core above.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] sched/core: forced idle accounting
  2021-10-12  0:12   ` Josh Don
@ 2021-10-12 12:27     ` Peter Zijlstra
  2021-10-12 19:45       ` Josh Don
  0 siblings, 1 reply; 18+ messages in thread
From: Peter Zijlstra @ 2021-10-12 12:27 UTC (permalink / raw)
  To: Josh Don
  Cc: Ingo Molnar, Juri Lelli, Vincent Guittot, Dietmar Eggemann,
	Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Joel Fernandes, Vineeth Pillai,
	Hao Luo, linux-kernel

On Mon, Oct 11, 2021 at 05:12:43PM -0700, Josh Don wrote:

> > > +             if (WARN_ON_ONCE(!nr_running)) {
> > > +                     /* can't be forced idle without a running task */
> > > +             } else {
> > > +                     delta *= nr_forced_idle;
> > > +                     delta /= nr_running;
> > > +             }
> >
> > Now the comment sayeth:
> >
> > > +     /*
> > > +      * For larger SMT configurations, we need to scale the charged
> > > +      * forced idle amount since there can be more than one forced idle
> > > +      * sibling and more than one running cookied task.
> > > +      */
> >
> > But why?
> 
> We scale by the number of cpus actually forced idle, since we don't
> want to falsely over or under charge forced idle time (defined
> strictly as time where we have a runnable task but idle the cpu). The
> more important scaling here though is the division over the number of
> running entities. This is done so that the aggregate amount of forced
> idle over some group of threads makes sense. Ie if we have a cpu with
> SMT8, and a group of 7 threads sharing a cookie, we don't want to
> accrue 7 units of forced idle time per unit time while the 8th SMT is
> forced idle.

So why not simply compute the strict per-cpu force-idle time and let
userspace sort out the rest?


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] sched/core: forced idle accounting
  2021-10-12 12:27     ` Peter Zijlstra
@ 2021-10-12 19:45       ` Josh Don
  2021-10-14 14:24         ` Peter Zijlstra
  0 siblings, 1 reply; 18+ messages in thread
From: Josh Don @ 2021-10-12 19:45 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, Juri Lelli, Vincent Guittot, Dietmar Eggemann,
	Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Joel Fernandes, Vineeth Pillai,
	Hao Luo, linux-kernel

On Tue, Oct 12, 2021 at 5:27 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Mon, Oct 11, 2021 at 05:12:43PM -0700, Josh Don wrote:
>
> > > > +             if (WARN_ON_ONCE(!nr_running)) {
> > > > +                     /* can't be forced idle without a running task */
> > > > +             } else {
> > > > +                     delta *= nr_forced_idle;
> > > > +                     delta /= nr_running;
> > > > +             }
> > >
> > > Now the comment sayeth:
> > >
> > > > +     /*
> > > > +      * For larger SMT configurations, we need to scale the charged
> > > > +      * forced idle amount since there can be more than one forced idle
> > > > +      * sibling and more than one running cookied task.
> > > > +      */
> > >
> > > But why?
> >
> > We scale by the number of cpus actually forced idle, since we don't
> > want to falsely over or under charge forced idle time (defined
> > strictly as time where we have a runnable task but idle the cpu). The
> > more important scaling here though is the division over the number of
> > running entities. This is done so that the aggregate amount of forced
> > idle over some group of threads makes sense. Ie if we have a cpu with
> > SMT8, and a group of 7 threads sharing a cookie, we don't want to
> > accrue 7 units of forced idle time per unit time while the 8th SMT is
> > forced idle.
>
> So why not simply compute the strict per-cpu force-idle time and let
> userspace sort out the rest?

Do you mean to compute force idle solely as a per-cpu value? I think
that would be fine in addition to the per-thread field, but a
desirable property here is proper attribution to the cause of the
force idle. That lets system management understand which jobs are the
most antagonistic from a coresched perspective, and is a signal
(albeit noisy, due to system state and load balancing decisions) for
scaling their capacity requirements.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] sched/core: forced idle accounting
  2021-10-12 19:45       ` Josh Don
@ 2021-10-14 14:24         ` Peter Zijlstra
  2021-10-14 23:18           ` Josh Don
  0 siblings, 1 reply; 18+ messages in thread
From: Peter Zijlstra @ 2021-10-14 14:24 UTC (permalink / raw)
  To: Josh Don
  Cc: Ingo Molnar, Juri Lelli, Vincent Guittot, Dietmar Eggemann,
	Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Joel Fernandes, Vineeth Pillai,
	Hao Luo, linux-kernel

On Tue, Oct 12, 2021 at 12:45:28PM -0700, Josh Don wrote:
> On Tue, Oct 12, 2021 at 5:27 AM Peter Zijlstra <peterz@infradead.org> wrote:

> > > We scale by the number of cpus actually forced idle, since we don't
> > > want to falsely over or under charge forced idle time (defined
> > > strictly as time where we have a runnable task but idle the cpu). The
> > > more important scaling here though is the division over the number of
> > > running entities. This is done so that the aggregate amount of forced
> > > idle over some group of threads makes sense. Ie if we have a cpu with
> > > SMT8, and a group of 7 threads sharing a cookie, we don't want to
> > > accrue 7 units of forced idle time per unit time while the 8th SMT is
> > > forced idle.
> >
> > So why not simply compute the strict per-cpu force-idle time and let
> > userspace sort out the rest?
> 
> Do you mean to compute force idle solely as a per-cpu value? I think
> that would be fine in addition to the per-thread field, but a
> desirable property here is proper attribution to the cause of the
> force idle. That lets system management understand which jobs are the
> most antagonistic from a coresched perspective, and is a signal
> (albeit noisy, due to system state and load balancing decisions) for
> scaling their capacity requirements.

Urgh, reading is hard. I hadn't noticed you did per-task accounting (and
the original changelog doesn't clarify this either).

Also, should all this be undef SCHED_DEBUG ? Or be part of SCHEDSTATS ?

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] sched/core: forced idle accounting
  2021-10-12  0:31   ` Josh Don
@ 2021-10-14 17:57     ` Hao Luo
  2021-10-14 23:29       ` Josh Don
  0 siblings, 1 reply; 18+ messages in thread
From: Hao Luo @ 2021-10-14 17:57 UTC (permalink / raw)
  To: Josh Don
  Cc: Ingo Molnar, Peter Zijlstra, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Joel Fernandes, Vineeth Pillai,
	linux-kernel

On Mon, Oct 11, 2021 at 5:31 PM Josh Don <joshdon@google.com> wrote:
>
> On Mon, Oct 11, 2021 at 10:33 AM Hao Luo <haoluo@google.com> wrote:
> >
> > On Thu, Oct 7, 2021 at 5:08 PM Josh Don <joshdon@google.com> wrote:
> > > -void sched_core_dequeue(struct rq *rq, struct task_struct *p)
> > > +void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags)
> > >  {
> > >         rq->core->core_task_seq++;
> > >
> > > -       if (!sched_core_enqueued(p))
> > > -               return;
> > > +       if (sched_core_enqueued(p)) {
> > > +               rb_erase(&p->core_node, &rq->core_tree);
> > > +               RB_CLEAR_NODE(&p->core_node);
> > > +       }
> > >
> > > -       rb_erase(&p->core_node, &rq->core_tree);
> > > -       RB_CLEAR_NODE(&p->core_node);
> > > +       /*
> > > +        * Migrating the last task off the cpu, with the cpu in forced idle
> > > +        * state. Reschedule to create an accounting edge for forced idle,
> > > +        * and re-examine whether the core is still in forced idle state.
> > > +        */
> > > +       if (!(flags & DEQUEUE_SAVE) && rq->nr_running == 1 &&
> > > +           rq->core->core_forceidle && rq->curr == rq->idle)
> > > +               resched_curr(rq);
> >
> > Resched_curr is probably an unwanted side effect of dequeue. Maybe we
> > could extract the check and resched_curr out into a function, and call
> > the function outside of sched_core_dequeue(). In that way, the
> > interface of dequeue doesn't need to change.
>
> This resched is an atypical case; normal load balancing won't steal
> the last runnable task off a cpu. The main reasons this resched could
> trigger are: migration due to affinity change, and migration due to
> sched core doing a cookie_steal. Could bubble this up to
> deactivate_task(), but seems less brittle to keep this in dequeue()
> with the check against DEQUEUE_SAVE (since this creates an important
> accounting edge). Thoughts?
>

I prefer bubbling it up to deactivate_task(). Depending on how many
callers of deactivate_task() need this resched, IMHO it is even fine
to put it in deactivate_task's caller. Wrapping it in a function may
help clarify its purpose.

> > >         /*
> > > @@ -5765,7 +5782,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> > >         for_each_cpu_wrap(i, smt_mask, cpu) {
> > >                 rq_i = cpu_rq(i);
> > >
> > > -               if (i != cpu)
> > > +               if (i != cpu && (rq_i != rq->core || !core_clock_updated))
> > >                         update_rq_clock(rq_i);
> >
> > Do you mean (rq_i != rq->core && !core_clock_updated)? I thought
> > rq->core has core_clock updated always.
>
> rq->clock is updated on entry to pick_next_task(). rq->core is only
> updated if rq == rq->core, or if we've done the clock update for
> rq->core above.

I meant 'if (i != cpu && rq_i != rq->core)'. Because at this point,
core_clock should already have been updated, is that not the case?
Anyway, the tracking of clock updates here is too confusing to me.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] sched/core: forced idle accounting
  2021-10-14 14:24         ` Peter Zijlstra
@ 2021-10-14 23:18           ` Josh Don
  0 siblings, 0 replies; 18+ messages in thread
From: Josh Don @ 2021-10-14 23:18 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, Juri Lelli, Vincent Guittot, Dietmar Eggemann,
	Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Joel Fernandes, Vineeth Pillai,
	Hao Luo, linux-kernel

On Thu, Oct 14, 2021 at 7:24 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Tue, Oct 12, 2021 at 12:45:28PM -0700, Josh Don wrote:
> > On Tue, Oct 12, 2021 at 5:27 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> > > > We scale by the number of cpus actually forced idle, since we don't
> > > > want to falsely over or under charge forced idle time (defined
> > > > strictly as time where we have a runnable task but idle the cpu). The
> > > > more important scaling here though is the division over the number of
> > > > running entities. This is done so that the aggregate amount of forced
> > > > idle over some group of threads makes sense. Ie if we have a cpu with
> > > > SMT8, and a group of 7 threads sharing a cookie, we don't want to
> > > > accrue 7 units of forced idle time per unit time while the 8th SMT is
> > > > forced idle.
> > >
> > > So why not simply compute the strict per-cpu force-idle time and let
> > > userspace sort out the rest?
> >
> > Do you mean to compute force idle solely as a per-cpu value? I think
> > that would be fine in addition to the per-thread field, but a
> > desirable property here is proper attribution to the cause of the
> > force idle. That lets system management understand which jobs are the
> > most antagonistic from a coresched perspective, and is a signal
> > (albeit noisy, due to system state and load balancing decisions) for
> > scaling their capacity requirements.
>
> Urgh, reading is hard. I hadn't noticed you did per-task accounting (and
> the original changelog doesn't clarify this either).

Yea, I'll add that to the description, along with a few other
implementation details.

> Also, should all this be undef SCHED_DEBUG ? Or be part of SCHEDSTATS ?

schedstats seems like a good home, that way users can avoid most of
the extra overhead if schedstats is disabled.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] sched/core: forced idle accounting
  2021-10-14 17:57     ` Hao Luo
@ 2021-10-14 23:29       ` Josh Don
  2021-10-14 23:58         ` Hao Luo
  0 siblings, 1 reply; 18+ messages in thread
From: Josh Don @ 2021-10-14 23:29 UTC (permalink / raw)
  To: Hao Luo
  Cc: Ingo Molnar, Peter Zijlstra, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Joel Fernandes, Vineeth Pillai,
	linux-kernel

On Thu, Oct 14, 2021 at 10:58 AM Hao Luo <haoluo@google.com> wrote:
>
> On Mon, Oct 11, 2021 at 5:31 PM Josh Don <joshdon@google.com> wrote:
> >
> > On Mon, Oct 11, 2021 at 10:33 AM Hao Luo <haoluo@google.com> wrote:
> > >
> > > On Thu, Oct 7, 2021 at 5:08 PM Josh Don <joshdon@google.com> wrote:
> > > > -void sched_core_dequeue(struct rq *rq, struct task_struct *p)
> > > > +void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags)
> > > >  {
> > > >         rq->core->core_task_seq++;
> > > >
> > > > -       if (!sched_core_enqueued(p))
> > > > -               return;
> > > > +       if (sched_core_enqueued(p)) {
> > > > +               rb_erase(&p->core_node, &rq->core_tree);
> > > > +               RB_CLEAR_NODE(&p->core_node);
> > > > +       }
> > > >
> > > > -       rb_erase(&p->core_node, &rq->core_tree);
> > > > -       RB_CLEAR_NODE(&p->core_node);
> > > > +       /*
> > > > +        * Migrating the last task off the cpu, with the cpu in forced idle
> > > > +        * state. Reschedule to create an accounting edge for forced idle,
> > > > +        * and re-examine whether the core is still in forced idle state.
> > > > +        */
> > > > +       if (!(flags & DEQUEUE_SAVE) && rq->nr_running == 1 &&
> > > > +           rq->core->core_forceidle && rq->curr == rq->idle)
> > > > +               resched_curr(rq);
> > >
> > > Resched_curr is probably an unwanted side effect of dequeue. Maybe we
> > > could extract the check and resched_curr out into a function, and call
> > > the function outside of sched_core_dequeue(). In that way, the
> > > interface of dequeue doesn't need to change.
> >
> > This resched is an atypical case; normal load balancing won't steal
> > the last runnable task off a cpu. The main reasons this resched could
> > trigger are: migration due to affinity change, and migration due to
> > sched core doing a cookie_steal. Could bubble this up to
> > deactivate_task(), but seems less brittle to keep this in dequeue()
> > with the check against DEQUEUE_SAVE (since this creates an important
> > accounting edge). Thoughts?
> >
>
> I prefer bubbling it up to deactivate_task(). Depending on how many
> callers of deactivate_task() need this resched, IMHO it is even fine
> to put it in deactivate_task's caller. Wrapping it in a function may
> help clarify its purpose.

I'd argue against bubbling up above deactivate_task(); makes things
much more brittle if a new use for deactivate_task() is added in the
future.

Tried both ways; IMO it seems slightly better to leave in dequeue() vs
deactivate(); less confusing to one hook instead of two for coresched
to handle dequeuing a task.

> > > >         /*
> > > > @@ -5765,7 +5782,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> > > >         for_each_cpu_wrap(i, smt_mask, cpu) {
> > > >                 rq_i = cpu_rq(i);
> > > >
> > > > -               if (i != cpu)
> > > > +               if (i != cpu && (rq_i != rq->core || !core_clock_updated))
> > > >                         update_rq_clock(rq_i);
> > >
> > > Do you mean (rq_i != rq->core && !core_clock_updated)? I thought
> > > rq->core has core_clock updated always.
> >
> > rq->clock is updated on entry to pick_next_task(). rq->core is only
> > updated if rq == rq->core, or if we've done the clock update for
> > rq->core above.
>
> I meant 'if (i != cpu && rq_i != rq->core)'. Because at this point,
> core_clock should already have been updated, is that not the case?
> Anyway, the tracking of clock updates here is too confusing to me.

Added a comment here, but the logic flow is:
- rq->clock is always updated on entry to pick_next_task()
- rq->core->clock _may_ be updated by the time we get to this part of
pick_next_task(). We have to be careful to avoid a double update,
hence the need for the core_clock_updated var.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] sched/core: forced idle accounting
  2021-10-14 23:29       ` Josh Don
@ 2021-10-14 23:58         ` Hao Luo
  0 siblings, 0 replies; 18+ messages in thread
From: Hao Luo @ 2021-10-14 23:58 UTC (permalink / raw)
  To: Josh Don
  Cc: Ingo Molnar, Peter Zijlstra, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Joel Fernandes, Vineeth Pillai,
	linux-kernel

On Thu, Oct 14, 2021 at 4:29 PM Josh Don <joshdon@google.com> wrote:
>
> On Thu, Oct 14, 2021 at 10:58 AM Hao Luo <haoluo@google.com> wrote:
> >
> > On Mon, Oct 11, 2021 at 5:31 PM Josh Don <joshdon@google.com> wrote:
> > >
> > > On Mon, Oct 11, 2021 at 10:33 AM Hao Luo <haoluo@google.com> wrote:
> > > >
> > > > On Thu, Oct 7, 2021 at 5:08 PM Josh Don <joshdon@google.com> wrote:
> > > > > -void sched_core_dequeue(struct rq *rq, struct task_struct *p)
> > > > > +void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags)
> > > > >  {
> > > > >         rq->core->core_task_seq++;
> > > > >
> > > > > -       if (!sched_core_enqueued(p))
> > > > > -               return;
> > > > > +       if (sched_core_enqueued(p)) {
> > > > > +               rb_erase(&p->core_node, &rq->core_tree);
> > > > > +               RB_CLEAR_NODE(&p->core_node);
> > > > > +       }
> > > > >
> > > > > -       rb_erase(&p->core_node, &rq->core_tree);
> > > > > -       RB_CLEAR_NODE(&p->core_node);
> > > > > +       /*
> > > > > +        * Migrating the last task off the cpu, with the cpu in forced idle
> > > > > +        * state. Reschedule to create an accounting edge for forced idle,
> > > > > +        * and re-examine whether the core is still in forced idle state.
> > > > > +        */
> > > > > +       if (!(flags & DEQUEUE_SAVE) && rq->nr_running == 1 &&
> > > > > +           rq->core->core_forceidle && rq->curr == rq->idle)
> > > > > +               resched_curr(rq);
> > > >
> > > > Resched_curr is probably an unwanted side effect of dequeue. Maybe we
> > > > could extract the check and resched_curr out into a function, and call
> > > > the function outside of sched_core_dequeue(). In that way, the
> > > > interface of dequeue doesn't need to change.
> > >
> > > This resched is an atypical case; normal load balancing won't steal
> > > the last runnable task off a cpu. The main reasons this resched could
> > > trigger are: migration due to affinity change, and migration due to
> > > sched core doing a cookie_steal. Could bubble this up to
> > > deactivate_task(), but seems less brittle to keep this in dequeue()
> > > with the check against DEQUEUE_SAVE (since this creates an important
> > > accounting edge). Thoughts?
> > >
> >
> > I prefer bubbling it up to deactivate_task(). Depending on how many
> > callers of deactivate_task() need this resched, IMHO it is even fine
> > to put it in deactivate_task's caller. Wrapping it in a function may
> > help clarify its purpose.
>
> I'd argue against bubbling up above deactivate_task(); makes things
> much more brittle if a new use for deactivate_task() is added in the
> future.
>
> Tried both ways; IMO it seems slightly better to leave in dequeue() vs
> deactivate(); less confusing to one hook instead of two for coresched
> to handle dequeuing a task.
>

Ack. No problem. I don't have strong objections here.

> > > > >         /*
> > > > > @@ -5765,7 +5782,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> > > > >         for_each_cpu_wrap(i, smt_mask, cpu) {
> > > > >                 rq_i = cpu_rq(i);
> > > > >
> > > > > -               if (i != cpu)
> > > > > +               if (i != cpu && (rq_i != rq->core || !core_clock_updated))
> > > > >                         update_rq_clock(rq_i);
> > > >
> > > > Do you mean (rq_i != rq->core && !core_clock_updated)? I thought
> > > > rq->core has core_clock updated always.
> > >
> > > rq->clock is updated on entry to pick_next_task(). rq->core is only
> > > updated if rq == rq->core, or if we've done the clock update for
> > > rq->core above.
> >
> > I meant 'if (i != cpu && rq_i != rq->core)'. Because at this point,
> > core_clock should already have been updated, is that not the case?
> > Anyway, the tracking of clock updates here is too confusing to me.
>
> Added a comment here, but the logic flow is:
> - rq->clock is always updated on entry to pick_next_task()
> - rq->core->clock _may_ be updated by the time we get to this part of
> pick_next_task(). We have to be careful to avoid a double update,
> hence the need for the core_clock_updated var.

Yeah. Sync'ed offline and that cleared my confusion. Thanks.

Hao

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2021-10-14 23:58 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-10-08  0:08 [PATCH] sched/core: forced idle accounting Josh Don
2021-10-08 21:04 ` Josh Don
2021-10-09 15:54 ` Peter Zijlstra
2021-10-12  0:12   ` Josh Don
2021-10-12 12:27     ` Peter Zijlstra
2021-10-12 19:45       ` Josh Don
2021-10-14 14:24         ` Peter Zijlstra
2021-10-14 23:18           ` Josh Don
2021-10-09 18:11 ` Tao Zhou
2021-10-12  0:14   ` Josh Don
2021-10-11 14:33 ` kernel test robot
2021-10-11 17:33 ` Hao Luo
2021-10-12  0:31   ` Josh Don
2021-10-14 17:57     ` Hao Luo
2021-10-14 23:29       ` Josh Don
2021-10-14 23:58         ` Hao Luo
2021-10-11 18:08 ` kernel test robot
2021-10-11 18:08   ` kernel test robot

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.