linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v3 0/4] SCHED_IDLE extensions
@ 2021-08-20  1:03 Josh Don
  2021-08-20  1:04 ` [PATCH v3 2/4] sched: account number of SCHED_IDLE entities on each cfs_rq Josh Don
                   ` (3 more replies)
  0 siblings, 4 replies; 28+ messages in thread
From: Josh Don @ 2021-08-20  1:03 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra, Juri Lelli, Vincent Guittot
  Cc: Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Paul Turner, Oleg Rombakh,
	Viresh Kumar, Steve Sistare, Tejun Heo, Rik van Riel,
	linux-kernel, Josh Don

This patch series contains improvements/extensions for SCHED_IDLE.

The first patch of the series is the previously mailed patch to add
cgroup support for SCHED_IDLE.

The second patch adds some additional idle accounting.

The third and fourth patches change some idle interactions.

Josh Don (4):
  sched: cgroup SCHED_IDLE support
  sched: account number of SCHED_IDLE entities on each cfs_rq
  sched: reduce sched slice for SCHED_IDLE entities
  sched: adjust sleeper credit for SCHED_IDLE entities

 kernel/sched/core.c  |  25 +++++
 kernel/sched/debug.c |   7 ++
 kernel/sched/fair.c  | 256 +++++++++++++++++++++++++++++++++++++------
 kernel/sched/sched.h |  10 ++
 4 files changed, 267 insertions(+), 31 deletions(-)

-- 
2.33.0.rc2.250.ged5fa647cd-goog


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH v3 2/4] sched: account number of SCHED_IDLE entities on each cfs_rq
  2021-08-20  1:03 [PATCH v3 0/4] SCHED_IDLE extensions Josh Don
@ 2021-08-20  1:04 ` Josh Don
  2021-08-24  7:57   ` Vincent Guittot
                     ` (2 more replies)
  2021-08-20  1:04 ` [PATCH v3 3/4] sched: reduce sched slice for SCHED_IDLE entities Josh Don
                   ` (2 subsequent siblings)
  3 siblings, 3 replies; 28+ messages in thread
From: Josh Don @ 2021-08-20  1:04 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra, Juri Lelli, Vincent Guittot
  Cc: Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Paul Turner, Oleg Rombakh,
	Viresh Kumar, Steve Sistare, Tejun Heo, Rik van Riel,
	linux-kernel, Josh Don

Add cfs_rq->idle_nr_running, which accounts for the number of idle
entities directly enqueued on the cfs_rq.

Signed-off-by: Josh Don <joshdon@google.com>
---
 kernel/sched/debug.c |  2 ++
 kernel/sched/fair.c  | 25 ++++++++++++++++++++++++-
 kernel/sched/sched.h |  1 +
 3 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 49716228efb4..33538579db9a 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -608,6 +608,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 			cfs_rq->nr_spread_over);
 	SEQ_printf(m, "  .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
 	SEQ_printf(m, "  .%-30s: %d\n", "h_nr_running", cfs_rq->h_nr_running);
+	SEQ_printf(m, "  .%-30s: %d\n", "idle_nr_running",
+			cfs_rq->idle_nr_running);
 	SEQ_printf(m, "  .%-30s: %d\n", "idle_h_nr_running",
 			cfs_rq->idle_h_nr_running);
 	SEQ_printf(m, "  .%-30s: %ld\n", "load", cfs_rq->load.weight);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5aa3cfd15a2e..19a9244c140f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2995,6 +2995,8 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	}
 #endif
 	cfs_rq->nr_running++;
+	if (se_is_idle(se))
+		cfs_rq->idle_nr_running++;
 }
 
 static void
@@ -3008,6 +3010,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	}
 #endif
 	cfs_rq->nr_running--;
+	if (se_is_idle(se))
+		cfs_rq->idle_nr_running--;
 }
 
 /*
@@ -5573,6 +5577,17 @@ static int sched_idle_rq(struct rq *rq)
 			rq->nr_running);
 }
 
+/*
+ * Returns true if cfs_rq only has SCHED_IDLE entities enqueued. Note the use
+ * of idle_nr_running, which does not consider idle descendants of normal
+ * entities.
+ */
+static bool sched_idle_cfs_rq(struct cfs_rq *cfs_rq)
+{
+	return cfs_rq->nr_running &&
+		cfs_rq->nr_running == cfs_rq->idle_nr_running;
+}
+
 #ifdef CONFIG_SMP
 static int sched_idle_cpu(int cpu)
 {
@@ -11556,7 +11571,7 @@ int sched_group_set_idle(struct task_group *tg, long idle)
 	for_each_possible_cpu(i) {
 		struct rq *rq = cpu_rq(i);
 		struct sched_entity *se = tg->se[i];
-		struct cfs_rq *grp_cfs_rq = tg->cfs_rq[i];
+		struct cfs_rq *parent_cfs_rq, *grp_cfs_rq = tg->cfs_rq[i];
 		bool was_idle = cfs_rq_is_idle(grp_cfs_rq);
 		long idle_task_delta;
 		struct rq_flags rf;
@@ -11567,6 +11582,14 @@ int sched_group_set_idle(struct task_group *tg, long idle)
 		if (WARN_ON_ONCE(was_idle == cfs_rq_is_idle(grp_cfs_rq)))
 			goto next_cpu;
 
+		if (se->on_rq) {
+			parent_cfs_rq = cfs_rq_of(se);
+			if (cfs_rq_is_idle(grp_cfs_rq))
+				parent_cfs_rq->idle_nr_running++;
+			else
+				parent_cfs_rq->idle_nr_running--;
+		}
+
 		idle_task_delta = grp_cfs_rq->h_nr_running -
 				  grp_cfs_rq->idle_h_nr_running;
 		if (!cfs_rq_is_idle(grp_cfs_rq))
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 8dfad8fb756c..6af039e433fb 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -530,6 +530,7 @@ struct cfs_rq {
 	struct load_weight	load;
 	unsigned int		nr_running;
 	unsigned int		h_nr_running;      /* SCHED_{NORMAL,BATCH,IDLE} */
+	unsigned int		idle_nr_running;   /* SCHED_IDLE */
 	unsigned int		idle_h_nr_running; /* SCHED_IDLE */
 
 	u64			exec_clock;
-- 
2.33.0.rc2.250.ged5fa647cd-goog


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH v3 3/4] sched: reduce sched slice for SCHED_IDLE entities
  2021-08-20  1:03 [PATCH v3 0/4] SCHED_IDLE extensions Josh Don
  2021-08-20  1:04 ` [PATCH v3 2/4] sched: account number of SCHED_IDLE entities on each cfs_rq Josh Don
@ 2021-08-20  1:04 ` Josh Don
  2021-08-23 10:08   ` Vincent Guittot
                     ` (3 more replies)
  2021-08-20  1:04 ` [PATCH v3 4/4] sched: adjust sleeper credit " Josh Don
       [not found] ` <20210906124702.Q6G0oOWwFOmQSl_jmRms3XQgfz4ROzfE71r3SNgWSf0@z>
  3 siblings, 4 replies; 28+ messages in thread
From: Josh Don @ 2021-08-20  1:04 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra, Juri Lelli, Vincent Guittot
  Cc: Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Paul Turner, Oleg Rombakh,
	Viresh Kumar, Steve Sistare, Tejun Heo, Rik van Riel,
	linux-kernel, Josh Don

Use a small, non-scaled min granularity for SCHED_IDLE entities, when
competing with normal entities. This reduces the latency of getting
a normal entity back on cpu, at the expense of increased context
switch frequency of SCHED_IDLE entities.

The benefit of this change is to reduce the round-robin latency for
normal entities when competing with a SCHED_IDLE entity.

Example: on a machine with HZ=1000, two threads were spawned and affined
to the same cpu, one of which is SCHED_IDLE. Without this patch, the
SCHED_IDLE thread runs for 4ms then waits for 1.4s. With this patch, it
runs for 1ms and waits 340ms (as it round-robins with the other thread).

Signed-off-by: Josh Don <joshdon@google.com>
---
 kernel/sched/debug.c |  2 ++
 kernel/sched/fair.c  | 29 ++++++++++++++++++++++++-----
 kernel/sched/sched.h |  1 +
 3 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 33538579db9a..317ef560aa63 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -305,6 +305,7 @@ static __init int sched_init_debug(void)
 
 	debugfs_create_u32("latency_ns", 0644, debugfs_sched, &sysctl_sched_latency);
 	debugfs_create_u32("min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_min_granularity);
+	debugfs_create_u32("idle_min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_idle_min_granularity);
 	debugfs_create_u32("wakeup_granularity_ns", 0644, debugfs_sched, &sysctl_sched_wakeup_granularity);
 
 	debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms);
@@ -806,6 +807,7 @@ static void sched_debug_header(struct seq_file *m)
 	SEQ_printf(m, "  .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
 	PN(sysctl_sched_latency);
 	PN(sysctl_sched_min_granularity);
+	PN(sysctl_sched_idle_min_granularity);
 	PN(sysctl_sched_wakeup_granularity);
 	P(sysctl_sched_child_runs_first);
 	P(sysctl_sched_features);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 19a9244c140f..31f40aa005b9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -59,6 +59,14 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
 unsigned int sysctl_sched_min_granularity			= 750000ULL;
 static unsigned int normalized_sysctl_sched_min_granularity	= 750000ULL;
 
+/*
+ * Minimal preemption granularity for CPU-bound SCHED_IDLE tasks.
+ * Applies only when SCHED_IDLE tasks compete with normal tasks.
+ *
+ * (default: 0.75 msec)
+ */
+unsigned int sysctl_sched_idle_min_granularity			= 750000ULL;
+
 /*
  * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
  */
@@ -665,6 +673,8 @@ static u64 __sched_period(unsigned long nr_running)
 		return sysctl_sched_latency;
 }
 
+static bool sched_idle_cfs_rq(struct cfs_rq *cfs_rq);
+
 /*
  * We calculate the wall-time slice from the period by taking a part
  * proportional to the weight.
@@ -674,6 +684,8 @@ static u64 __sched_period(unsigned long nr_running)
 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	unsigned int nr_running = cfs_rq->nr_running;
+	struct sched_entity *init_se = se;
+	unsigned int min_gran;
 	u64 slice;
 
 	if (sched_feat(ALT_PERIOD))
@@ -684,12 +696,13 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	for_each_sched_entity(se) {
 		struct load_weight *load;
 		struct load_weight lw;
+		struct cfs_rq *qcfs_rq;
 
-		cfs_rq = cfs_rq_of(se);
-		load = &cfs_rq->load;
+		qcfs_rq = cfs_rq_of(se);
+		load = &qcfs_rq->load;
 
 		if (unlikely(!se->on_rq)) {
-			lw = cfs_rq->load;
+			lw = qcfs_rq->load;
 
 			update_load_add(&lw, se->load.weight);
 			load = &lw;
@@ -697,8 +710,14 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		slice = __calc_delta(slice, se->load.weight, load);
 	}
 
-	if (sched_feat(BASE_SLICE))
-		slice = max(slice, (u64)sysctl_sched_min_granularity);
+	if (sched_feat(BASE_SLICE)) {
+		if (se_is_idle(init_se) && !sched_idle_cfs_rq(cfs_rq))
+			min_gran = sysctl_sched_idle_min_granularity;
+		else
+			min_gran = sysctl_sched_min_granularity;
+
+		slice = max_t(u64, slice, min_gran);
+	}
 
 	return slice;
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 6af039e433fb..29846da35861 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2399,6 +2399,7 @@ extern const_debug unsigned int sysctl_sched_migration_cost;
 #ifdef CONFIG_SCHED_DEBUG
 extern unsigned int sysctl_sched_latency;
 extern unsigned int sysctl_sched_min_granularity;
+extern unsigned int sysctl_sched_idle_min_granularity;
 extern unsigned int sysctl_sched_wakeup_granularity;
 extern int sysctl_resched_latency_warn_ms;
 extern int sysctl_resched_latency_warn_once;
-- 
2.33.0.rc2.250.ged5fa647cd-goog


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH v3 4/4] sched: adjust sleeper credit for SCHED_IDLE entities
  2021-08-20  1:03 [PATCH v3 0/4] SCHED_IDLE extensions Josh Don
  2021-08-20  1:04 ` [PATCH v3 2/4] sched: account number of SCHED_IDLE entities on each cfs_rq Josh Don
  2021-08-20  1:04 ` [PATCH v3 3/4] sched: reduce sched slice for SCHED_IDLE entities Josh Don
@ 2021-08-20  1:04 ` Josh Don
  2021-08-23 10:09   ` Vincent Guittot
                     ` (3 more replies)
       [not found] ` <20210906124702.Q6G0oOWwFOmQSl_jmRms3XQgfz4ROzfE71r3SNgWSf0@z>
  3 siblings, 4 replies; 28+ messages in thread
From: Josh Don @ 2021-08-20  1:04 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra, Juri Lelli, Vincent Guittot
  Cc: Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Paul Turner, Oleg Rombakh,
	Viresh Kumar, Steve Sistare, Tejun Heo, Rik van Riel,
	linux-kernel, Josh Don

Give reduced sleeper credit to SCHED_IDLE entities. As a result, woken
SCHED_IDLE entities will take longer to preempt normal entities.

The benefit of this change is to make it less likely that a newly woken
SCHED_IDLE entity will preempt a short-running normal entity before it
blocks.

We still give a small sleeper credit to SCHED_IDLE entities, so that
idle<->idle competition retains some fairness.

Example: with HZ=1000, four threads were spawned and affined to one cpu,
one of which was set to SCHED_IDLE. Without this patch, the wakeup latency
for the SCHED_IDLE thread was ~1-2ms; with the patch, it was ~5ms.

Signed-off-by: Josh Don <joshdon@google.com>
---
 kernel/sched/fair.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 31f40aa005b9..aa9c046d2aab 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4230,7 +4230,12 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 
 	/* sleeps up to a single latency don't count. */
 	if (!initial) {
-		unsigned long thresh = sysctl_sched_latency;
+		unsigned long thresh;
+
+		if (se_is_idle(se))
+			thresh = sysctl_sched_min_granularity;
+		else
+			thresh = sysctl_sched_latency;
 
 		/*
 		 * Halve their sleep time's effect, to allow
-- 
2.33.0.rc2.250.ged5fa647cd-goog


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 1/4] sched: cgroup SCHED_IDLE support
  2021-09-06 12:47   ` [PATCH v3 1/4] sched: cgroup SCHED_IDLE support alexs
@ 2021-08-20  8:39     ` Tao Zhou
  2021-08-23 17:29       ` Josh Don
  2021-09-02  1:22     ` Daniel Jordan
  2021-09-08 18:36     ` Josh Don
  2 siblings, 1 reply; 28+ messages in thread
From: Tao Zhou @ 2021-08-20  8:39 UTC (permalink / raw)
  To: Josh Don
  Cc: Ingo Molnar, Peter Zijlstra, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Paul Turner, Oleg Rombakh,
	Viresh Kumar, Steve Sistare, Tejun Heo, Rik van Riel,
	linux-kernel, tao.zhou

Hi Josh,

On Thu, Aug 19, 2021 at 06:04:00PM -0700, Josh Don wrote:
> This extends SCHED_IDLE to cgroups.
> 
> Interface: cgroup/cpu.idle.
>  0: default behavior
>  1: SCHED_IDLE
> 
> Extending SCHED_IDLE to cgroups means that we incorporate the existing
> aspects of SCHED_IDLE; a SCHED_IDLE cgroup will count all of its
> descendant threads towards the idle_h_nr_running count of all of its
> ancestor cgroups. Thus, sched_idle_rq() will work properly.
> Additionally, SCHED_IDLE cgroups are configured with minimum weight.
> 
> There are two key differences between the per-task and per-cgroup
> SCHED_IDLE interface:
> 
> - The cgroup interface allows tasks within a SCHED_IDLE hierarchy to
> maintain their relative weights. The entity that is "idle" is the
> cgroup, not the tasks themselves.
> 
> - Since the idle entity is the cgroup, our SCHED_IDLE wakeup preemption
> decision is not made by comparing the current task with the woken task,
> but rather by comparing their matching sched_entity.
> 
> A typical use-case for this is a user that creates an idle and a
> non-idle subtree. The non-idle subtree will dominate competition vs
> the idle subtree, but the idle subtree will still be high priority
> vs other users on the system. The latter is accomplished via comparing
> matching sched_entity in the wakeup preemption path (this could also be
> improved by making the sched_idle_rq() decision dependent on the
> perspective of a specific task).
> 
> For now, we maintain the existing SCHED_IDLE semantics. Future patches
> may make improvements that extend how we treat SCHED_IDLE entities.
> 
> The per-task_group idle field is an integer that currently only holds
> either a 0 or a 1. This is explicitly typed as an integer to allow for
> further extensions to this API. For example, a negative value may
> indicate a highly latency-sensitive cgroup that should be preferred for
> preemption/placement/etc.
> 
> Signed-off-by: Josh Don <joshdon@google.com>
> ---
> v3:
> - no change from v2
> v2:
> - Use WEIGHT_IDLEPRIO for the idle cgroup weight
> - Add cgroup-v1 support
> 
>  kernel/sched/core.c  |  25 ++++++
>  kernel/sched/debug.c |   3 +
>  kernel/sched/fair.c  | 197 +++++++++++++++++++++++++++++++++++++------
>  kernel/sched/sched.h |   8 ++
>  4 files changed, 208 insertions(+), 25 deletions(-)
> 
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 3431939699dc..c1d2227be7c1 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -10195,6 +10195,20 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
>  }
>  #endif /* CONFIG_RT_GROUP_SCHED */
>  
> +#ifdef CONFIG_FAIR_GROUP_SCHED
> +static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css,
> +			       struct cftype *cft)
> +{
> +	return css_tg(css)->idle;
> +}
> +
> +static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
> +				struct cftype *cft, s64 idle)
> +{
> +	return sched_group_set_idle(css_tg(css), idle);

	return sched_group_set_idle(css_tg(css), (int)idle);

> +}
> +#endif
> +
>  static struct cftype cpu_legacy_files[] = {
>  #ifdef CONFIG_FAIR_GROUP_SCHED
>  	{
> @@ -10202,6 +10216,11 @@ static struct cftype cpu_legacy_files[] = {
>  		.read_u64 = cpu_shares_read_u64,
>  		.write_u64 = cpu_shares_write_u64,
>  	},
> +	{
> +		.name = "idle",
> +		.read_s64 = cpu_idle_read_s64,
> +		.write_s64 = cpu_idle_write_s64,
> +	},
>  #endif
>  #ifdef CONFIG_CFS_BANDWIDTH
>  	{
> @@ -10409,6 +10428,12 @@ static struct cftype cpu_files[] = {
>  		.read_s64 = cpu_weight_nice_read_s64,
>  		.write_s64 = cpu_weight_nice_write_s64,
>  	},
> +	{
> +		.name = "idle",
> +		.flags = CFTYPE_NOT_ON_ROOT,
> +		.read_s64 = cpu_idle_read_s64,
> +		.write_s64 = cpu_idle_write_s64,
> +	},
>  #endif
>  #ifdef CONFIG_CFS_BANDWIDTH
>  	{
> diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
> index 7e08e3d947c2..49716228efb4 100644
> --- a/kernel/sched/debug.c
> +++ b/kernel/sched/debug.c
> @@ -607,6 +607,9 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
>  	SEQ_printf(m, "  .%-30s: %d\n", "nr_spread_over",
>  			cfs_rq->nr_spread_over);
>  	SEQ_printf(m, "  .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
> +	SEQ_printf(m, "  .%-30s: %d\n", "h_nr_running", cfs_rq->h_nr_running);
> +	SEQ_printf(m, "  .%-30s: %d\n", "idle_h_nr_running",
> +			cfs_rq->idle_h_nr_running);
>  	SEQ_printf(m, "  .%-30s: %ld\n", "load", cfs_rq->load.weight);
>  #ifdef CONFIG_SMP
>  	SEQ_printf(m, "  .%-30s: %lu\n", "load_avg",
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index d425d11aa2b8..5aa3cfd15a2e 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -431,6 +431,23 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
>  	}
>  }
>  
> +static int tg_is_idle(struct task_group *tg)
> +{
> +	return tg->idle > 0;
> +}
> +
> +static int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
> +{
> +	return cfs_rq->idle > 0;
> +}
> +
> +static int se_is_idle(struct sched_entity *se)
> +{
> +	if (entity_is_task(se))
> +		return task_has_idle_policy(task_of(se));
> +	return cfs_rq_is_idle(group_cfs_rq(se));
> +}
> +
>  #else	/* !CONFIG_FAIR_GROUP_SCHED */
>  
>  #define for_each_sched_entity(se) \
> @@ -468,6 +485,21 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
>  {
>  }
>  
> +static int tg_is_idle(struct task_group *tg)
> +{
> +	return 0;
> +}
> +
> +static int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
> +{
> +	return 0;
> +}
> +
> +static int se_is_idle(struct sched_entity *se)
> +{
> +	return 0;
> +}
> +
>  #endif	/* CONFIG_FAIR_GROUP_SCHED */
>  
>  static __always_inline
> @@ -4841,6 +4873,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
>  
>  		dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
>  
> +		if (cfs_rq_is_idle(group_cfs_rq(se)))
> +			idle_task_delta = cfs_rq->h_nr_running;
> +
>  		qcfs_rq->h_nr_running -= task_delta;
>  		qcfs_rq->idle_h_nr_running -= idle_task_delta;
>  
> @@ -4860,6 +4895,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
>  		update_load_avg(qcfs_rq, se, 0);
>  		se_update_runnable(se);
>  
> +		if (cfs_rq_is_idle(group_cfs_rq(se)))
> +			idle_task_delta = cfs_rq->h_nr_running;
> +
>  		qcfs_rq->h_nr_running -= task_delta;
>  		qcfs_rq->idle_h_nr_running -= idle_task_delta;
>  	}
> @@ -4904,39 +4942,45 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
>  	task_delta = cfs_rq->h_nr_running;
>  	idle_task_delta = cfs_rq->idle_h_nr_running;
>  	for_each_sched_entity(se) {
> +		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
> +
>  		if (se->on_rq)
>  			break;
> -		cfs_rq = cfs_rq_of(se);
> -		enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
> +		enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP);
> +
> +		if (cfs_rq_is_idle(group_cfs_rq(se)))
> +			idle_task_delta = cfs_rq->h_nr_running;
>  
> -		cfs_rq->h_nr_running += task_delta;
> -		cfs_rq->idle_h_nr_running += idle_task_delta;
> +		qcfs_rq->h_nr_running += task_delta;
> +		qcfs_rq->idle_h_nr_running += idle_task_delta;
>  
>  		/* end evaluation on encountering a throttled cfs_rq */
> -		if (cfs_rq_throttled(cfs_rq))
> +		if (cfs_rq_throttled(qcfs_rq))
>  			goto unthrottle_throttle;
>  	}
>  
>  	for_each_sched_entity(se) {
> -		cfs_rq = cfs_rq_of(se);
> +		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
>  
> -		update_load_avg(cfs_rq, se, UPDATE_TG);
> +		update_load_avg(qcfs_rq, se, UPDATE_TG);
>  		se_update_runnable(se);
>  
> -		cfs_rq->h_nr_running += task_delta;
> -		cfs_rq->idle_h_nr_running += idle_task_delta;
> +		if (cfs_rq_is_idle(group_cfs_rq(se)))
> +			idle_task_delta = cfs_rq->h_nr_running;
>  
> +		qcfs_rq->h_nr_running += task_delta;
> +		qcfs_rq->idle_h_nr_running += idle_task_delta;
>  
>  		/* end evaluation on encountering a throttled cfs_rq */
> -		if (cfs_rq_throttled(cfs_rq))
> +		if (cfs_rq_throttled(qcfs_rq))
>  			goto unthrottle_throttle;
>  
>  		/*
>  		 * One parent has been throttled and cfs_rq removed from the
>  		 * list. Add it back to not break the leaf list.
>  		 */
> -		if (throttled_hierarchy(cfs_rq))
> -			list_add_leaf_cfs_rq(cfs_rq);
> +		if (throttled_hierarchy(qcfs_rq))
> +			list_add_leaf_cfs_rq(qcfs_rq);
>  	}
>  
>  	/* At this point se is NULL and we are at root level*/
> @@ -4949,9 +4993,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
>  	 * assertion below.
>  	 */
>  	for_each_sched_entity(se) {
> -		cfs_rq = cfs_rq_of(se);
> +		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
>  
> -		if (list_add_leaf_cfs_rq(cfs_rq))
> +		if (list_add_leaf_cfs_rq(qcfs_rq))
>  			break;
>  	}
>  
> @@ -5574,6 +5618,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>  		cfs_rq->h_nr_running++;
>  		cfs_rq->idle_h_nr_running += idle_h_nr_running;
>  
> +		if (cfs_rq_is_idle(cfs_rq))
> +			idle_h_nr_running = 1;
> +
>  		/* end evaluation on encountering a throttled cfs_rq */
>  		if (cfs_rq_throttled(cfs_rq))
>  			goto enqueue_throttle;
> @@ -5591,6 +5638,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>  		cfs_rq->h_nr_running++;
>  		cfs_rq->idle_h_nr_running += idle_h_nr_running;
>  
> +		if (cfs_rq_is_idle(cfs_rq))
> +			idle_h_nr_running = 1;
> +
>  		/* end evaluation on encountering a throttled cfs_rq */
>  		if (cfs_rq_throttled(cfs_rq))
>  			goto enqueue_throttle;
> @@ -5668,6 +5718,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>  		cfs_rq->h_nr_running--;
>  		cfs_rq->idle_h_nr_running -= idle_h_nr_running;
>  
> +		if (cfs_rq_is_idle(cfs_rq))
> +			idle_h_nr_running = 1;
> +
>  		/* end evaluation on encountering a throttled cfs_rq */
>  		if (cfs_rq_throttled(cfs_rq))
>  			goto dequeue_throttle;
> @@ -5697,6 +5750,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>  		cfs_rq->h_nr_running--;
>  		cfs_rq->idle_h_nr_running -= idle_h_nr_running;
>  
> +		if (cfs_rq_is_idle(cfs_rq))
> +			idle_h_nr_running = 1;
> +
>  		/* end evaluation on encountering a throttled cfs_rq */
>  		if (cfs_rq_throttled(cfs_rq))
>  			goto dequeue_throttle;
> @@ -7039,24 +7095,22 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
>  
>  static void set_last_buddy(struct sched_entity *se)
>  {
> -	if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
> -		return;
> -
>  	for_each_sched_entity(se) {
>  		if (SCHED_WARN_ON(!se->on_rq))
>  			return;
> +		if (se_is_idle(se))
> +			return;
>  		cfs_rq_of(se)->last = se;
>  	}
>  }
>  
>  static void set_next_buddy(struct sched_entity *se)
>  {
> -	if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
> -		return;
> -
>  	for_each_sched_entity(se) {
>  		if (SCHED_WARN_ON(!se->on_rq))
>  			return;
> +		if (se_is_idle(se))
> +			return;
>  		cfs_rq_of(se)->next = se;
>  	}
>  }
> @@ -7077,6 +7131,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
>  	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
>  	int scale = cfs_rq->nr_running >= sched_nr_latency;
>  	int next_buddy_marked = 0;
> +	int cse_is_idle, pse_is_idle;
>  
>  	if (unlikely(se == pse))
>  		return;
> @@ -7121,8 +7176,21 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
>  		return;
>  
>  	find_matching_se(&se, &pse);
> -	update_curr(cfs_rq_of(se));
>  	BUG_ON(!pse);
> +
> +	cse_is_idle = se_is_idle(se);
> +	pse_is_idle = se_is_idle(pse);
> +
> +	/*
> +	 * Preempt an idle group in favor of a non-idle group (and don't preempt
> +	 * in the inverse case).
> +	 */
> +	if (cse_is_idle && !pse_is_idle)
> +		goto preempt;
> +	if (cse_is_idle != pse_is_idle)
> +		return;
> +
> +	update_curr(cfs_rq_of(se));
>  	if (wakeup_preempt_entity(se, pse) == 1) {
>  		/*
>  		 * Bias pick_next to pick the sched entity that is
> @@ -11416,10 +11484,12 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
>  
>  static DEFINE_MUTEX(shares_mutex);
>  
> -int sched_group_set_shares(struct task_group *tg, unsigned long shares)
> +static int __sched_group_set_shares(struct task_group *tg, unsigned long shares)
>  {
>  	int i;
>  
> +	lockdep_assert_held(&shares_mutex);
> +
>  	/*
>  	 * We can't change the weight of the root cgroup.
>  	 */
> @@ -11428,9 +11498,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
>  
>  	shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
>  
> -	mutex_lock(&shares_mutex);
>  	if (tg->shares == shares)
> -		goto done;
> +		return 0;
>  
>  	tg->shares = shares;
>  	for_each_possible_cpu(i) {
> @@ -11448,10 +11517,88 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
>  		rq_unlock_irqrestore(rq, &rf);
>  	}
>  
> -done:
> +	return 0;
> +}
> +
> +int sched_group_set_shares(struct task_group *tg, unsigned long shares)
> +{
> +	int ret;
> +
> +	mutex_lock(&shares_mutex);
> +	if (tg_is_idle(tg))
> +		ret = -EINVAL;
> +	else
> +		ret = __sched_group_set_shares(tg, shares);
> +	mutex_unlock(&shares_mutex);
> +
> +	return ret;
> +}
> +
> +int sched_group_set_idle(struct task_group *tg, long idle)

   int sched_group_set_idle(struct task_group *tg, int idle)

> +{
> +	int i;
> +
> +	if (tg == &root_task_group)
> +		return -EINVAL;
> +
> +	if (idle < 0 || idle > 1)
> +		return -EINVAL;
> +
> +	mutex_lock(&shares_mutex);
> +
> +	if (tg->idle == idle) {
> +		mutex_unlock(&shares_mutex);
> +		return 0;
> +	}
> +
> +	tg->idle = idle;
> +
> +	for_each_possible_cpu(i) {
> +		struct rq *rq = cpu_rq(i);
> +		struct sched_entity *se = tg->se[i];
> +		struct cfs_rq *grp_cfs_rq = tg->cfs_rq[i];
> +		bool was_idle = cfs_rq_is_idle(grp_cfs_rq);
> +		long idle_task_delta;
> +		struct rq_flags rf;
> +
> +		rq_lock_irqsave(rq, &rf);
> +
> +		grp_cfs_rq->idle = idle;
> +		if (WARN_ON_ONCE(was_idle == cfs_rq_is_idle(grp_cfs_rq)))
> +			goto next_cpu;
> +
> +		idle_task_delta = grp_cfs_rq->h_nr_running -
> +				  grp_cfs_rq->idle_h_nr_running;
> +		if (!cfs_rq_is_idle(grp_cfs_rq))
> +			idle_task_delta *= -1;
> +
> +		for_each_sched_entity(se) {
> +			struct cfs_rq *cfs_rq = cfs_rq_of(se);
> +
> +			if (!se->on_rq)
> +				break;
> +
> +			cfs_rq->idle_h_nr_running += idle_task_delta;
> +
> +			/* Already accounted at parent level and above. */
> +			if (cfs_rq_is_idle(cfs_rq))
> +				break;
> +		}
> +
> +next_cpu:
> +		rq_unlock_irqrestore(rq, &rf);
> +	}
> +
> +	/* Idle groups have minimum weight. */
> +	if (tg_is_idle(tg))
> +		__sched_group_set_shares(tg, scale_load(WEIGHT_IDLEPRIO));
> +	else
> +		__sched_group_set_shares(tg, NICE_0_LOAD);
> +
>  	mutex_unlock(&shares_mutex);
>  	return 0;
>  }
> +
>  #else /* CONFIG_FAIR_GROUP_SCHED */
>  
>  void free_fair_sched_group(struct task_group *tg) { }
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index d9f8d73a1d84..8dfad8fb756c 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -396,6 +396,9 @@ struct task_group {
>  	struct cfs_rq		**cfs_rq;
>  	unsigned long		shares;
>  
> +	/* A positive value indicates that this is a SCHED_IDLE group. */
> +	int			idle;
> +
>  #ifdef	CONFIG_SMP
>  	/*
>  	 * load_avg can be heavily contended at clock tick time, so put
> @@ -505,6 +508,8 @@ extern void sched_move_task(struct task_struct *tsk);
>  #ifdef CONFIG_FAIR_GROUP_SCHED
>  extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
>  
> +extern int sched_group_set_idle(struct task_group *tg, long idle);

   extern int sched_group_set_idle(struct task_group *tg, int idle);

> +
>  #ifdef CONFIG_SMP
>  extern void set_task_rq_fair(struct sched_entity *se,
>  			     struct cfs_rq *prev, struct cfs_rq *next);
> @@ -601,6 +606,9 @@ struct cfs_rq {
>  	struct list_head	leaf_cfs_rq_list;
>  	struct task_group	*tg;	/* group that "owns" this runqueue */
>  
> +	/* Locally cached copy of our task_group's idle value */
> +	int			idle;
> +
>  #ifdef CONFIG_CFS_BANDWIDTH
>  	int			runtime_enabled;
>  	s64			runtime_remaining;
> -- 
> 2.33.0.rc2.250.ged5fa647cd-goog
> 

Both cfs_rq and task_group declare @idle as an int, but
sched_group_set_idle(..., long idle) takes @idle as a long.
Use int there instead.

That said, you already filter the idle value:

    if (idle < 0 || idle > 1)
    	return -EINVAL;

So there is no functional effect here; it is just that @idle only needs
4 bytes (an int).



Thanks,
Tao

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 3/4] sched: reduce sched slice for SCHED_IDLE entities
  2021-08-20  1:04 ` [PATCH v3 3/4] sched: reduce sched slice for SCHED_IDLE entities Josh Don
@ 2021-08-23 10:08   ` Vincent Guittot
  2021-08-23 17:40     ` Josh Don
  2021-08-24 10:24   ` Jiang Biao
                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 28+ messages in thread
From: Vincent Guittot @ 2021-08-23 10:08 UTC (permalink / raw)
  To: Josh Don
  Cc: Ingo Molnar, Peter Zijlstra, Juri Lelli, Dietmar Eggemann,
	Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Paul Turner, Oleg Rombakh,
	Viresh Kumar, Steve Sistare, Tejun Heo, Rik van Riel,
	linux-kernel

On Fri, 20 Aug 2021 at 03:04, Josh Don <joshdon@google.com> wrote:
>
> Use a small, non-scaled min granularity for SCHED_IDLE entities, when
> competing with normal entities. This reduces the latency of getting
> a normal entity back on cpu, at the expense of increased context
> switch frequency of SCHED_IDLE entities.
>
> The benefit of this change is to reduce the round-robin latency for
> normal entities when competing with a SCHED_IDLE entity.
>
> Example: on a machine with HZ=1000, spawned two threads, one of which is
> SCHED_IDLE, and affined to one cpu. Without this patch, the SCHED_IDLE
> thread runs for 4ms then waits for 1.4s. With this patch, it runs for
> 1ms and waits 340ms (as it round-robins with the other thread).
>
> Signed-off-by: Josh Don <joshdon@google.com>
> ---
>  kernel/sched/debug.c |  2 ++
>  kernel/sched/fair.c  | 29 ++++++++++++++++++++++++-----
>  kernel/sched/sched.h |  1 +
>  3 files changed, 27 insertions(+), 5 deletions(-)
>
> diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
> index 33538579db9a..317ef560aa63 100644
> --- a/kernel/sched/debug.c
> +++ b/kernel/sched/debug.c
> @@ -305,6 +305,7 @@ static __init int sched_init_debug(void)
>
>         debugfs_create_u32("latency_ns", 0644, debugfs_sched, &sysctl_sched_latency);
>         debugfs_create_u32("min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_min_granularity);
> +       debugfs_create_u32("idle_min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_idle_min_granularity);
>         debugfs_create_u32("wakeup_granularity_ns", 0644, debugfs_sched, &sysctl_sched_wakeup_granularity);
>
>         debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms);
> @@ -806,6 +807,7 @@ static void sched_debug_header(struct seq_file *m)
>         SEQ_printf(m, "  .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
>         PN(sysctl_sched_latency);
>         PN(sysctl_sched_min_granularity);
> +       PN(sysctl_sched_idle_min_granularity);
>         PN(sysctl_sched_wakeup_granularity);
>         P(sysctl_sched_child_runs_first);
>         P(sysctl_sched_features);
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 19a9244c140f..31f40aa005b9 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -59,6 +59,14 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
>  unsigned int sysctl_sched_min_granularity                      = 750000ULL;
>  static unsigned int normalized_sysctl_sched_min_granularity    = 750000ULL;
>
> +/*
> + * Minimal preemption granularity for CPU-bound SCHED_IDLE tasks.
> + * Applies only when SCHED_IDLE tasks compete with normal tasks.
> + *
> + * (default: 0.75 msec)
> + */
> +unsigned int sysctl_sched_idle_min_granularity                 = 750000ULL;
> +
>  /*
>   * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
>   */
> @@ -665,6 +673,8 @@ static u64 __sched_period(unsigned long nr_running)
>                 return sysctl_sched_latency;
>  }
>
> +static bool sched_idle_cfs_rq(struct cfs_rq *cfs_rq);
> +
>  /*
>   * We calculate the wall-time slice from the period by taking a part
>   * proportional to the weight.
> @@ -674,6 +684,8 @@ static u64 __sched_period(unsigned long nr_running)
>  static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
>  {
>         unsigned int nr_running = cfs_rq->nr_running;
> +       struct sched_entity *init_se = se;
> +       unsigned int min_gran;
>         u64 slice;
>
>         if (sched_feat(ALT_PERIOD))
> @@ -684,12 +696,13 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
>         for_each_sched_entity(se) {
>                 struct load_weight *load;
>                 struct load_weight lw;
> +               struct cfs_rq *qcfs_rq;
>
> -               cfs_rq = cfs_rq_of(se);
> -               load = &cfs_rq->load;
> +               qcfs_rq = cfs_rq_of(se);
> +               load = &qcfs_rq->load;
>
>                 if (unlikely(!se->on_rq)) {
> -                       lw = cfs_rq->load;
> +                       lw = qcfs_rq->load;
>
>                         update_load_add(&lw, se->load.weight);
>                         load = &lw;
> @@ -697,8 +710,14 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
>                 slice = __calc_delta(slice, se->load.weight, load);
>         }
>
> -       if (sched_feat(BASE_SLICE))
> -               slice = max(slice, (u64)sysctl_sched_min_granularity);
> +       if (sched_feat(BASE_SLICE)) {
> +               if (se_is_idle(init_se) && !sched_idle_cfs_rq(cfs_rq))

Like for place_entity, we should probably not dynamically switch
between the 2 values below depending on whether non-SCHED_IDLE tasks
are present, and instead always use sysctl_sched_idle_min_granularity.


> +                       min_gran = sysctl_sched_idle_min_granularity;
> +               else
> +                       min_gran = sysctl_sched_min_granularity;
> +
> +               slice = max_t(u64, slice, min_gran);
> +       }
>
>         return slice;
>  }
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 6af039e433fb..29846da35861 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -2399,6 +2399,7 @@ extern const_debug unsigned int sysctl_sched_migration_cost;
>  #ifdef CONFIG_SCHED_DEBUG
>  extern unsigned int sysctl_sched_latency;
>  extern unsigned int sysctl_sched_min_granularity;
> +extern unsigned int sysctl_sched_idle_min_granularity;
>  extern unsigned int sysctl_sched_wakeup_granularity;
>  extern int sysctl_resched_latency_warn_ms;
>  extern int sysctl_resched_latency_warn_once;
> --
> 2.33.0.rc2.250.ged5fa647cd-goog
>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 4/4] sched: adjust sleeper credit for SCHED_IDLE entities
  2021-08-20  1:04 ` [PATCH v3 4/4] sched: adjust sleeper credit " Josh Don
@ 2021-08-23 10:09   ` Vincent Guittot
  2021-08-24  8:16   ` Jiang Biao
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 28+ messages in thread
From: Vincent Guittot @ 2021-08-23 10:09 UTC (permalink / raw)
  To: Josh Don
  Cc: Ingo Molnar, Peter Zijlstra, Juri Lelli, Dietmar Eggemann,
	Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Paul Turner, Oleg Rombakh,
	Viresh Kumar, Steve Sistare, Tejun Heo, Rik van Riel,
	linux-kernel

On Fri, 20 Aug 2021 at 03:04, Josh Don <joshdon@google.com> wrote:
>
> Give reduced sleeper credit to SCHED_IDLE entities. As a result, woken
> SCHED_IDLE entities will take longer to preempt normal entities.
>
> The benefit of this change is to make it less likely that a newly woken
> SCHED_IDLE entity will preempt a short-running normal entity before it
> blocks.
>
> We still give a small sleeper credit to SCHED_IDLE entities, so that
> idle<->idle competition retains some fairness.
>
> Example: With HZ=1000, spawned four threads affined to one cpu, one of
> which was set to SCHED_IDLE. Without this patch, wakeup latency for the
> SCHED_IDLE thread was ~1-2ms, with the patch the wakeup latency was
> ~5ms.
>
> Signed-off-by: Josh Don <joshdon@google.com>

Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>

> ---
>  kernel/sched/fair.c | 7 ++++++-
>  1 file changed, 6 insertions(+), 1 deletion(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 31f40aa005b9..aa9c046d2aab 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4230,7 +4230,12 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
>
>         /* sleeps up to a single latency don't count. */
>         if (!initial) {
> -               unsigned long thresh = sysctl_sched_latency;
> +               unsigned long thresh;
> +
> +               if (se_is_idle(se))
> +                       thresh = sysctl_sched_min_granularity;
> +               else
> +                       thresh = sysctl_sched_latency;
>
>                 /*
>                  * Halve their sleep time's effect, to allow
> --
> 2.33.0.rc2.250.ged5fa647cd-goog
>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 1/4] sched: cgroup SCHED_IDLE support
  2021-08-20  8:39     ` Tao Zhou
@ 2021-08-23 17:29       ` Josh Don
  2021-08-23 22:45         ` Tao Zhou
  0 siblings, 1 reply; 28+ messages in thread
From: Josh Don @ 2021-08-23 17:29 UTC (permalink / raw)
  To: Tao Zhou
  Cc: Ingo Molnar, Peter Zijlstra, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Paul Turner, Oleg Rombakh,
	Viresh Kumar, Steve Sistare, Tejun Heo, Rik van Riel,
	linux-kernel

Hi Tao,

On Fri, Aug 20, 2021 at 1:38 AM Tao Zhou <tao.zhou@linux.dev> wrote:
[snip]
> >  #ifdef CONFIG_SMP
> >  extern void set_task_rq_fair(struct sched_entity *se,
> >                            struct cfs_rq *prev, struct cfs_rq *next);
> > @@ -601,6 +606,9 @@ struct cfs_rq {
> >       struct list_head        leaf_cfs_rq_list;
> >       struct task_group       *tg;    /* group that "owns" this runqueue */
> >
> > +     /* Locally cached copy of our task_group's idle value */
> > +     int                     idle;
> > +
> >  #ifdef CONFIG_CFS_BANDWIDTH
> >       int                     runtime_enabled;
> >       s64                     runtime_remaining;
> > --
> > 2.33.0.rc2.250.ged5fa647cd-goog
> >
>
> Cfs_rq and tg define @idle with int type.
> In sched_group_set_idle(..., long idle), @idle is long type.
> Use int instead.
>
> But, you filter idle value:
>
>     if (idle < 0 || idle > 1)
>         return -EINVAL;
>
> So, no effect here.. Just @idle can use 4 bytes.
>
>
>
> Thanks,
> Tao

'long' is used there because the input from the cgroup
interface is a 64-bit value.

- Josh

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 3/4] sched: reduce sched slice for SCHED_IDLE entities
  2021-08-23 10:08   ` Vincent Guittot
@ 2021-08-23 17:40     ` Josh Don
  2021-08-24  7:56       ` Vincent Guittot
  0 siblings, 1 reply; 28+ messages in thread
From: Josh Don @ 2021-08-23 17:40 UTC (permalink / raw)
  To: Vincent Guittot
  Cc: Ingo Molnar, Peter Zijlstra, Juri Lelli, Dietmar Eggemann,
	Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Paul Turner, Oleg Rombakh,
	Viresh Kumar, Steve Sistare, Tejun Heo, Rik van Riel,
	linux-kernel

On Mon, Aug 23, 2021 at 3:08 AM Vincent Guittot
<vincent.guittot@linaro.org> wrote:
>
> On Fri, 20 Aug 2021 at 03:04, Josh Don <joshdon@google.com> wrote:
> >
> > @@ -684,12 +696,13 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
> >         for_each_sched_entity(se) {
> >                 struct load_weight *load;
> >                 struct load_weight lw;
> > +               struct cfs_rq *qcfs_rq;
> >
> > -               cfs_rq = cfs_rq_of(se);
> > -               load = &cfs_rq->load;
> > +               qcfs_rq = cfs_rq_of(se);
> > +               load = &qcfs_rq->load;
> >
> >                 if (unlikely(!se->on_rq)) {
> > -                       lw = cfs_rq->load;
> > +                       lw = qcfs_rq->load;
> >
> >                         update_load_add(&lw, se->load.weight);
> >                         load = &lw;
> > @@ -697,8 +710,14 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
> >                 slice = __calc_delta(slice, se->load.weight, load);
> >         }
> >
> > -       if (sched_feat(BASE_SLICE))
> > -               slice = max(slice, (u64)sysctl_sched_min_granularity);
> > +       if (sched_feat(BASE_SLICE)) {
> > +               if (se_is_idle(init_se) && !sched_idle_cfs_rq(cfs_rq))
>
> Like for place_entity, we should probably not dynamically switch
> between the 2 values below depending on the presence or not of non
> sched idle tasks and always use sysctl_sched_idle_min_granularity

My reasoning here is that sched_slice is something we reasonably
expect to change as tasks enqueue/dequeue, and unlike place_entity()
it does not create fairness issues by messing with vruntime.
Additionally, it would be preferable to use the larger min granularity
on a cpu running only idle tasks.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 1/4] sched: cgroup SCHED_IDLE support
  2021-08-23 17:29       ` Josh Don
@ 2021-08-23 22:45         ` Tao Zhou
  0 siblings, 0 replies; 28+ messages in thread
From: Tao Zhou @ 2021-08-23 22:45 UTC (permalink / raw)
  To: Josh Don
  Cc: Ingo Molnar, Peter Zijlstra, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Paul Turner, Oleg Rombakh,
	Viresh Kumar, Steve Sistare, Tejun Heo, Rik van Riel,
	linux-kernel, tao.zhou

Hi Josh,

On Mon, Aug 23, 2021 at 10:29:53AM -0700, Josh Don wrote:
> Hi Tao,
> 
> On Fri, Aug 20, 2021 at 1:38 AM Tao Zhou <tao.zhou@linux.dev> wrote:
> [snip]
> > >  #ifdef CONFIG_SMP
> > >  extern void set_task_rq_fair(struct sched_entity *se,
> > >                            struct cfs_rq *prev, struct cfs_rq *next);
> > > @@ -601,6 +606,9 @@ struct cfs_rq {
> > >       struct list_head        leaf_cfs_rq_list;
> > >       struct task_group       *tg;    /* group that "owns" this runqueue */
> > >
> > > +     /* Locally cached copy of our task_group's idle value */
> > > +     int                     idle;
> > > +
> > >  #ifdef CONFIG_CFS_BANDWIDTH
> > >       int                     runtime_enabled;
> > >       s64                     runtime_remaining;
> > > --
> > > 2.33.0.rc2.250.ged5fa647cd-goog
> > >
> >
> > Cfs_rq and tg define @idle with int type.
> > In sched_group_set_idle(..., long idle), @idle is long type.
> > Use int instead.
> >
> > But, you filter idle value:
> >
> >     if (idle < 0 || idle > 1)
> >         return -EINVAL;
> >
> > So, no effect here.. Just @idle can use 4 bytes.
> >
> >
> >
> > Thanks,
> > Tao
> 
> The use of 'long'  there is because the input from the cgroup
> interface is a 64 bit value.

Yes. If the compiler aligns the stack to 8 bytes (or some other
boundary), this will have no effect. (I have not checked this, and I
do not know much about gcc's stack alignment.) I just presumed that
4 bytes of stack could be saved, but that may not be right.

> - Josh



Thanks,
Tao

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 3/4] sched: reduce sched slice for SCHED_IDLE entities
  2021-08-23 17:40     ` Josh Don
@ 2021-08-24  7:56       ` Vincent Guittot
  0 siblings, 0 replies; 28+ messages in thread
From: Vincent Guittot @ 2021-08-24  7:56 UTC (permalink / raw)
  To: Josh Don
  Cc: Ingo Molnar, Peter Zijlstra, Juri Lelli, Dietmar Eggemann,
	Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Paul Turner, Oleg Rombakh,
	Viresh Kumar, Steve Sistare, Tejun Heo, Rik van Riel,
	linux-kernel

On Mon, 23 Aug 2021 at 19:40, Josh Don <joshdon@google.com> wrote:
>
> On Mon, Aug 23, 2021 at 3:08 AM Vincent Guittot
> <vincent.guittot@linaro.org> wrote:
> >
> > On Fri, 20 Aug 2021 at 03:04, Josh Don <joshdon@google.com> wrote:
> > >
> > > @@ -684,12 +696,13 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
> > >         for_each_sched_entity(se) {
> > >                 struct load_weight *load;
> > >                 struct load_weight lw;
> > > +               struct cfs_rq *qcfs_rq;
> > >
> > > -               cfs_rq = cfs_rq_of(se);
> > > -               load = &cfs_rq->load;
> > > +               qcfs_rq = cfs_rq_of(se);
> > > +               load = &qcfs_rq->load;
> > >
> > >                 if (unlikely(!se->on_rq)) {
> > > -                       lw = cfs_rq->load;
> > > +                       lw = qcfs_rq->load;
> > >
> > >                         update_load_add(&lw, se->load.weight);
> > >                         load = &lw;
> > > @@ -697,8 +710,14 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
> > >                 slice = __calc_delta(slice, se->load.weight, load);
> > >         }
> > >
> > > -       if (sched_feat(BASE_SLICE))
> > > -               slice = max(slice, (u64)sysctl_sched_min_granularity);
> > > +       if (sched_feat(BASE_SLICE)) {
> > > +               if (se_is_idle(init_se) && !sched_idle_cfs_rq(cfs_rq))
> >
> > Like for place_entity, we should probably not dynamically switch
> > between the 2 values below depending on the presence or not of non
> > sched idle tasks and always use sysctl_sched_idle_min_granularity
>
> My reasoning here is that sched_slice is something we reasonably
> expect to change as tasks enqueue/dequeue, and unlike place_entity()
> it does not create fairness issues by messing with vruntime.
> Additionally, it would be preferable to use the larger min granularity
> on a cpu running only idle tasks.

Fair enough

Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 2/4] sched: account number of SCHED_IDLE entities on each cfs_rq
  2021-08-20  1:04 ` [PATCH v3 2/4] sched: account number of SCHED_IDLE entities on each cfs_rq Josh Don
@ 2021-08-24  7:57   ` Vincent Guittot
  2021-09-09 11:18   ` [tip: sched/core] sched: Account " tip-bot2 for Josh Don
  2021-10-05 14:12   ` tip-bot2 for Josh Don
  2 siblings, 0 replies; 28+ messages in thread
From: Vincent Guittot @ 2021-08-24  7:57 UTC (permalink / raw)
  To: Josh Don
  Cc: Ingo Molnar, Peter Zijlstra, Juri Lelli, Dietmar Eggemann,
	Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Paul Turner, Oleg Rombakh,
	Viresh Kumar, Steve Sistare, Tejun Heo, Rik van Riel,
	linux-kernel

On Fri, 20 Aug 2021 at 03:04, Josh Don <joshdon@google.com> wrote:
>
> Adds cfs_rq->idle_nr_running, which accounts the number of idle entities
> directly enqueued on the cfs_rq.
>
> Signed-off-by: Josh Don <joshdon@google.com>

Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>

> ---
>  kernel/sched/debug.c |  2 ++
>  kernel/sched/fair.c  | 25 ++++++++++++++++++++++++-
>  kernel/sched/sched.h |  1 +
>  3 files changed, 27 insertions(+), 1 deletion(-)
>
> diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
> index 49716228efb4..33538579db9a 100644
> --- a/kernel/sched/debug.c
> +++ b/kernel/sched/debug.c
> @@ -608,6 +608,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
>                         cfs_rq->nr_spread_over);
>         SEQ_printf(m, "  .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
>         SEQ_printf(m, "  .%-30s: %d\n", "h_nr_running", cfs_rq->h_nr_running);
> +       SEQ_printf(m, "  .%-30s: %d\n", "idle_nr_running",
> +                       cfs_rq->idle_nr_running);
>         SEQ_printf(m, "  .%-30s: %d\n", "idle_h_nr_running",
>                         cfs_rq->idle_h_nr_running);
>         SEQ_printf(m, "  .%-30s: %ld\n", "load", cfs_rq->load.weight);
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 5aa3cfd15a2e..19a9244c140f 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -2995,6 +2995,8 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
>         }
>  #endif
>         cfs_rq->nr_running++;
> +       if (se_is_idle(se))
> +               cfs_rq->idle_nr_running++;
>  }
>
>  static void
> @@ -3008,6 +3010,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
>         }
>  #endif
>         cfs_rq->nr_running--;
> +       if (se_is_idle(se))
> +               cfs_rq->idle_nr_running--;
>  }
>
>  /*
> @@ -5573,6 +5577,17 @@ static int sched_idle_rq(struct rq *rq)
>                         rq->nr_running);
>  }
>
> +/*
> + * Returns true if cfs_rq only has SCHED_IDLE entities enqueued. Note the use
> + * of idle_nr_running, which does not consider idle descendants of normal
> + * entities.
> + */
> +static bool sched_idle_cfs_rq(struct cfs_rq *cfs_rq)
> +{
> +       return cfs_rq->nr_running &&
> +               cfs_rq->nr_running == cfs_rq->idle_nr_running;
> +}
> +
>  #ifdef CONFIG_SMP
>  static int sched_idle_cpu(int cpu)
>  {
> @@ -11556,7 +11571,7 @@ int sched_group_set_idle(struct task_group *tg, long idle)
>         for_each_possible_cpu(i) {
>                 struct rq *rq = cpu_rq(i);
>                 struct sched_entity *se = tg->se[i];
> -               struct cfs_rq *grp_cfs_rq = tg->cfs_rq[i];
> +               struct cfs_rq *parent_cfs_rq, *grp_cfs_rq = tg->cfs_rq[i];
>                 bool was_idle = cfs_rq_is_idle(grp_cfs_rq);
>                 long idle_task_delta;
>                 struct rq_flags rf;
> @@ -11567,6 +11582,14 @@ int sched_group_set_idle(struct task_group *tg, long idle)
>                 if (WARN_ON_ONCE(was_idle == cfs_rq_is_idle(grp_cfs_rq)))
>                         goto next_cpu;
>
> +               if (se->on_rq) {
> +                       parent_cfs_rq = cfs_rq_of(se);
> +                       if (cfs_rq_is_idle(grp_cfs_rq))
> +                               parent_cfs_rq->idle_nr_running++;
> +                       else
> +                               parent_cfs_rq->idle_nr_running--;
> +               }
> +
>                 idle_task_delta = grp_cfs_rq->h_nr_running -
>                                   grp_cfs_rq->idle_h_nr_running;
>                 if (!cfs_rq_is_idle(grp_cfs_rq))
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 8dfad8fb756c..6af039e433fb 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -530,6 +530,7 @@ struct cfs_rq {
>         struct load_weight      load;
>         unsigned int            nr_running;
>         unsigned int            h_nr_running;      /* SCHED_{NORMAL,BATCH,IDLE} */
> +       unsigned int            idle_nr_running;   /* SCHED_IDLE */
>         unsigned int            idle_h_nr_running; /* SCHED_IDLE */
>
>         u64                     exec_clock;
> --
> 2.33.0.rc2.250.ged5fa647cd-goog
>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 4/4] sched: adjust sleeper credit for SCHED_IDLE entities
  2021-08-20  1:04 ` [PATCH v3 4/4] sched: adjust sleeper credit " Josh Don
  2021-08-23 10:09   ` Vincent Guittot
@ 2021-08-24  8:16   ` Jiang Biao
  2021-08-24 17:12     ` Josh Don
  2021-09-09 11:18   ` [tip: sched/core] " tip-bot2 for Josh Don
  2021-10-05 14:12   ` tip-bot2 for Josh Don
  3 siblings, 1 reply; 28+ messages in thread
From: Jiang Biao @ 2021-08-24  8:16 UTC (permalink / raw)
  To: Josh Don
  Cc: Ingo Molnar, Peter Zijlstra, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Paul Turner, Oleg Rombakh,
	Viresh Kumar, Steve Sistare, Tejun Heo, Rik van Riel,
	linux-kernel

Hi,

On Fri, 20 Aug 2021 at 09:06, Josh Don <joshdon@google.com> wrote:
>
> Give reduced sleeper credit to SCHED_IDLE entities. As a result, woken
> SCHED_IDLE entities will take longer to preempt normal entities.
>
> The benefit of this change is to make it less likely that a newly woken
> SCHED_IDLE entity will preempt a short-running normal entity before it
> blocks.
>
> We still give a small sleeper credit to SCHED_IDLE entities, so that
> idle<->idle competition retains some fairness.
>
> Example: With HZ=1000, spawned four threads affined to one cpu, one of
> which was set to SCHED_IDLE. Without this patch, wakeup latency for the
> SCHED_IDLE thread was ~1-2ms, with the patch the wakeup latency was
> ~5ms.
>
> Signed-off-by: Josh Don <joshdon@google.com>
Tried to push a similar patch before, but failed. :)
https://lkml.org/lkml/2020/8/20/1773
Please pick my Reviewed-by if you don't mind,
Reviewed-by: Jiang Biao <benbjiang@tencent.com>

> ---
>  kernel/sched/fair.c | 7 ++++++-
>  1 file changed, 6 insertions(+), 1 deletion(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 31f40aa005b9..aa9c046d2aab 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4230,7 +4230,12 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
>
>         /* sleeps up to a single latency don't count. */
>         if (!initial) {
> -               unsigned long thresh = sysctl_sched_latency;
> +               unsigned long thresh;
> +
> +               if (se_is_idle(se))
> +                       thresh = sysctl_sched_min_granularity;
> +               else
> +                       thresh = sysctl_sched_latency;
>
>                 /*
>                  * Halve their sleep time's effect, to allow
> --
> 2.33.0.rc2.250.ged5fa647cd-goog
>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 3/4] sched: reduce sched slice for SCHED_IDLE entities
  2021-08-20  1:04 ` [PATCH v3 3/4] sched: reduce sched slice for SCHED_IDLE entities Josh Don
  2021-08-23 10:08   ` Vincent Guittot
@ 2021-08-24 10:24   ` Jiang Biao
  2021-08-24 17:04     ` Josh Don
  2021-09-09 11:18   ` [tip: sched/core] " tip-bot2 for Josh Don
  2021-10-05 14:12   ` tip-bot2 for Josh Don
  3 siblings, 1 reply; 28+ messages in thread
From: Jiang Biao @ 2021-08-24 10:24 UTC (permalink / raw)
  To: Josh Don
  Cc: Ingo Molnar, Peter Zijlstra, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Paul Turner, Oleg Rombakh,
	Viresh Kumar, Steve Sistare, Tejun Heo, Rik van Riel,
	linux-kernel

Hi,

On Fri, 20 Aug 2021 at 09:08, Josh Don <joshdon@google.com> wrote:
>
> Use a small, non-scaled min granularity for SCHED_IDLE entities, when
> competing with normal entities. This reduces the latency of getting
> a normal entity back on cpu, at the expense of increased context
> switch frequency of SCHED_IDLE entities.
>
> The benefit of this change is to reduce the round-robin latency for
> normal entities when competing with a SCHED_IDLE entity.
Why not just ignore min granularity when normal entities compete with
a SCHED_IDLE entity? Something like this:

@@ -697,8 +710,14 @@ static u64 sched_slice(struct cfs_rq *cfs_rq,
struct sched_entity *se)
                 slice = __calc_delta(slice, se->load.weight, load);
         }

 -       if (sched_feat(BASE_SLICE))
 -               slice = max(slice, (u64)sysctl_sched_min_granularity);
 +       if (sched_feat(BASE_SLICE)
 +          && (!se_is_idle(init_se) || sched_idle_cfs_rq(cfs_rq)))
 +               slice = max(slice, (u64)sysctl_sched_min_granularity);

         return slice;
  }
If so, there seems to be no need to introduce sysctl_sched_idle_min_granularity? :)

>
> Example: on a machine with HZ=1000, spawned two threads, one of which is
> SCHED_IDLE, and affined to one cpu. Without this patch, the SCHED_IDLE
> thread runs for 4ms then waits for 1.4s. With this patch, it runs for
> 1ms and waits 340ms (as it round-robins with the other thread).
That way, the SCHED_IDLE task would be more likely to be preempted by
the normal task, because its ideal_runtime should be less than 750us
(the non-scaled sysctl_sched_idle_min_granularity) in this case. And
the scaled sysctl_sched_min_granularity would still be guaranteed
between SCHED_IDLE tasks when only SCHED_IDLE tasks compete with each
other.

>
> Signed-off-by: Josh Don <joshdon@google.com>
> ---
>  kernel/sched/debug.c |  2 ++
>  kernel/sched/fair.c  | 29 ++++++++++++++++++++++++-----
>  kernel/sched/sched.h |  1 +
>  3 files changed, 27 insertions(+), 5 deletions(-)
>
> diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
> index 33538579db9a..317ef560aa63 100644
> --- a/kernel/sched/debug.c
> +++ b/kernel/sched/debug.c
> @@ -305,6 +305,7 @@ static __init int sched_init_debug(void)
>
>         debugfs_create_u32("latency_ns", 0644, debugfs_sched, &sysctl_sched_latency);
>         debugfs_create_u32("min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_min_granularity);
> +       debugfs_create_u32("idle_min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_idle_min_granularity);
>         debugfs_create_u32("wakeup_granularity_ns", 0644, debugfs_sched, &sysctl_sched_wakeup_granularity);
>
>         debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms);
> @@ -806,6 +807,7 @@ static void sched_debug_header(struct seq_file *m)
>         SEQ_printf(m, "  .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
>         PN(sysctl_sched_latency);
>         PN(sysctl_sched_min_granularity);
> +       PN(sysctl_sched_idle_min_granularity);
>         PN(sysctl_sched_wakeup_granularity);
>         P(sysctl_sched_child_runs_first);
>         P(sysctl_sched_features);
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 19a9244c140f..31f40aa005b9 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -59,6 +59,14 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
>  unsigned int sysctl_sched_min_granularity                      = 750000ULL;
>  static unsigned int normalized_sysctl_sched_min_granularity    = 750000ULL;
>
> +/*
> + * Minimal preemption granularity for CPU-bound SCHED_IDLE tasks.
> + * Applies only when SCHED_IDLE tasks compete with normal tasks.
> + *
> + * (default: 0.75 msec)
> + */
> +unsigned int sysctl_sched_idle_min_granularity                 = 750000ULL;
> +
>  /*
>   * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
>   */
> @@ -665,6 +673,8 @@ static u64 __sched_period(unsigned long nr_running)
>                 return sysctl_sched_latency;
>  }
>
> +static bool sched_idle_cfs_rq(struct cfs_rq *cfs_rq);
> +
>  /*
>   * We calculate the wall-time slice from the period by taking a part
>   * proportional to the weight.
> @@ -674,6 +684,8 @@ static u64 __sched_period(unsigned long nr_running)
>  static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
>  {
>         unsigned int nr_running = cfs_rq->nr_running;
> +       struct sched_entity *init_se = se;
> +       unsigned int min_gran;
>         u64 slice;
>
>         if (sched_feat(ALT_PERIOD))
> @@ -684,12 +696,13 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
>         for_each_sched_entity(se) {
>                 struct load_weight *load;
>                 struct load_weight lw;
> +               struct cfs_rq *qcfs_rq;
>
> -               cfs_rq = cfs_rq_of(se);
> -               load = &cfs_rq->load;
> +               qcfs_rq = cfs_rq_of(se);
> +               load = &qcfs_rq->load;
>
>                 if (unlikely(!se->on_rq)) {
> -                       lw = cfs_rq->load;
> +                       lw = qcfs_rq->load;
>
>                         update_load_add(&lw, se->load.weight);
>                         load = &lw;
> @@ -697,8 +710,14 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
>                 slice = __calc_delta(slice, se->load.weight, load);
>         }
>
> -       if (sched_feat(BASE_SLICE))
> -               slice = max(slice, (u64)sysctl_sched_min_granularity);
> +       if (sched_feat(BASE_SLICE)) {
> +               if (se_is_idle(init_se) && !sched_idle_cfs_rq(cfs_rq))
> +                       min_gran = sysctl_sched_idle_min_granularity;
> +               else
> +                       min_gran = sysctl_sched_min_granularity;
> +
> +               slice = max_t(u64, slice, min_gran);
> +       }
>
>         return slice;
>  }
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 6af039e433fb..29846da35861 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -2399,6 +2399,7 @@ extern const_debug unsigned int sysctl_sched_migration_cost;
>  #ifdef CONFIG_SCHED_DEBUG
>  extern unsigned int sysctl_sched_latency;
>  extern unsigned int sysctl_sched_min_granularity;
> +extern unsigned int sysctl_sched_idle_min_granularity;
>  extern unsigned int sysctl_sched_wakeup_granularity;
>  extern int sysctl_resched_latency_warn_ms;
>  extern int sysctl_resched_latency_warn_once;
> --
> 2.33.0.rc2.250.ged5fa647cd-goog
>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 3/4] sched: reduce sched slice for SCHED_IDLE entities
  2021-08-24 10:24   ` Jiang Biao
@ 2021-08-24 17:04     ` Josh Don
  2021-08-25  2:43       ` Jiang Biao
  0 siblings, 1 reply; 28+ messages in thread
From: Josh Don @ 2021-08-24 17:04 UTC (permalink / raw)
  To: Jiang Biao
  Cc: Ingo Molnar, Peter Zijlstra, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Paul Turner, Oleg Rombakh,
	Viresh Kumar, Steve Sistare, Tejun Heo, Rik van Riel,
	linux-kernel

Hi Jiang,

On Tue, Aug 24, 2021 at 3:25 AM Jiang Biao <benbjiang@gmail.com> wrote:
>
> Why not just ignore min granularity when normal entities compete with
> a SCHED_IDLE entity? something like this,
>
> @@ -697,8 +710,14 @@ static u64 sched_slice(struct cfs_rq *cfs_rq,
> struct sched_entity *se)
>                  slice = __calc_delta(slice, se->load.weight, load);
>          }
>
>  -       if (sched_feat(BASE_SLICE))
>  -               slice = max(slice, (u64)sysctl_sched_min_granularity);
>  +       if (sched_feat(BASE_SLICE)
>  +          && (!se_is_idle(init_se) || sched_idle_cfs_rq(cfs_rq)))
>  +               slice = max(slice, (u64)sysctl_sched_min_granularity);
>
>          return slice;
>   }
> If so, there seems to be no need to introduce sysctl_sched_idle_min_granularity? :)

Ignoring min_gran entirely could lead to some really tiny slices; see
discussion at https://lkml.org/lkml/2021/8/12/651.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 4/4] sched: adjust sleeper credit for SCHED_IDLE entities
  2021-08-24  8:16   ` Jiang Biao
@ 2021-08-24 17:12     ` Josh Don
  0 siblings, 0 replies; 28+ messages in thread
From: Josh Don @ 2021-08-24 17:12 UTC (permalink / raw)
  To: Jiang Biao
  Cc: Ingo Molnar, Peter Zijlstra, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Paul Turner, Oleg Rombakh,
	Viresh Kumar, Steve Sistare, Tejun Heo, Rik van Riel,
	linux-kernel

On Tue, Aug 24, 2021 at 1:16 AM Jiang Biao <benbjiang@gmail.com> wrote:
>
> Hi,
>
> On Fri, 20 Aug 2021 at 09:06, Josh Don <joshdon@google.com> wrote:
> >
> > Give reduced sleeper credit to SCHED_IDLE entities. As a result, woken
> > SCHED_IDLE entities will take longer to preempt normal entities.
> >
> > The benefit of this change is to make it less likely that a newly woken
> > SCHED_IDLE entity will preempt a short-running normal entity before it
> > blocks.
> >
> > We still give a small sleeper credit to SCHED_IDLE entities, so that
> > idle<->idle competition retains some fairness.
> >
> > Example: With HZ=1000, spawned four threads affined to one cpu, one of
> > which was set to SCHED_IDLE. Without this patch, wakeup latency for the
> > SCHED_IDLE thread was ~1-2ms, with the patch the wakeup latency was
> > ~5ms.
> >
> > Signed-off-by: Josh Don <joshdon@google.com>
> Tried to push a similar patch before, but failed. :)
> https://lkml.org/lkml/2020/8/20/1773
> Please pick my Reviewed-by if you don't mind,
> Reviewed-by: Jiang Biao <benbjiang@tencent.com>

Done, thanks :)

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 3/4] sched: reduce sched slice for SCHED_IDLE entities
  2021-08-24 17:04     ` Josh Don
@ 2021-08-25  2:43       ` Jiang Biao
  0 siblings, 0 replies; 28+ messages in thread
From: Jiang Biao @ 2021-08-25  2:43 UTC (permalink / raw)
  To: Josh Don
  Cc: Ingo Molnar, Peter Zijlstra, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Paul Turner, Oleg Rombakh,
	Viresh Kumar, Steve Sistare, Tejun Heo, Rik van Riel,
	linux-kernel

On Wed, 25 Aug 2021 at 01:04, Josh Don <joshdon@google.com> wrote:
>
> Hi Jiang,
>
> On Tue, Aug 24, 2021 at 3:25 AM Jiang Biao <benbjiang@gmail.com> wrote:
> >
> > Why not just ignore min granularity when normal entities compete with
> > a SCHED_IDLE entity? something like this,
> >
> > @@ -697,8 +710,14 @@ static u64 sched_slice(struct cfs_rq *cfs_rq,
> > struct sched_entity *se)
> >                  slice = __calc_delta(slice, se->load.weight, load);
> >          }
> >
> >  -       if (sched_feat(BASE_SLICE))
> >  -               slice = max(slice, (u64)sysctl_sched_min_granularity);
> >  +       if (sched_feat(BASE_SLICE)
> >  +          && (!se_is_idle(init_se) || sched_idle_cfs_rq(cfs_rq)))
> >  +               slice = max(slice, (u64)sysctl_sched_min_granularity);
> >
> >          return slice;
> >   }
> > If so, there seems to be no need to introduce sysctl_sched_idle_min_granularity? :)
>
> Ignoring min_gran entirely could lead to some really tiny slices; see
> discussion at https://lkml.org/lkml/2021/8/12/651.
Got it, tiny slices could be a problem in the SCHED_HRTICK case.
But the sysctl_sched_idle_min_granularity used in sched_slice() and
the sysctl_sched_min_granularity used in check_preempt_tick() would have
different semantics for a SCHED_IDLE task, which could be functionally OK
but a little confusing.

Regards,
Jiang

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 1/4] sched: cgroup SCHED_IDLE support
  2021-09-06 12:47   ` [PATCH v3 1/4] sched: cgroup SCHED_IDLE support alexs
  2021-08-20  8:39     ` Tao Zhou
@ 2021-09-02  1:22     ` Daniel Jordan
  2021-09-03  1:21       ` Josh Don
  2021-09-08 18:36     ` Josh Don
  2 siblings, 1 reply; 28+ messages in thread
From: Daniel Jordan @ 2021-09-02  1:22 UTC (permalink / raw)
  To: Josh Don
  Cc: Ingo Molnar, Peter Zijlstra, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Paul Turner, Oleg Rombakh,
	Viresh Kumar, Steve Sistare, Tejun Heo, Rik van Riel,
	linux-kernel

On Thu, Aug 19, 2021 at 06:04:00PM -0700, Josh Don wrote:
> +static int se_is_idle(struct sched_entity *se)
> +{
> +	return 0;
> +}

I'm thinking !FAIR_GROUP_SCHED is a rare thing to behold?  So not a big
deal, but I think this wants to be

        return task_has_idle_policy(task_of(se));

so buddies aren't set for SCHED_IDLE.

>  static void set_last_buddy(struct sched_entity *se)
>  {
> -	if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
> -		return;
> -
>  	for_each_sched_entity(se) {
>  		if (SCHED_WARN_ON(!se->on_rq))
>  			return;
> +		if (se_is_idle(se))
> +			return;
>  		cfs_rq_of(se)->last = se;
>  	}
>  }
>  
>  static void set_next_buddy(struct sched_entity *se)
>  {
> -	if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
> -		return;
> -
>  	for_each_sched_entity(se) {
>  		if (SCHED_WARN_ON(!se->on_rq))
>  			return;
> +		if (se_is_idle(se))
> +			return;
>  		cfs_rq_of(se)->next = se;
>  	}
>  }

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 1/4] sched: cgroup SCHED_IDLE support
  2021-09-02  1:22     ` Daniel Jordan
@ 2021-09-03  1:21       ` Josh Don
  2021-09-08 20:38         ` Daniel Jordan
  0 siblings, 1 reply; 28+ messages in thread
From: Josh Don @ 2021-09-03  1:21 UTC (permalink / raw)
  To: Daniel Jordan
  Cc: Ingo Molnar, Peter Zijlstra, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Paul Turner, Oleg Rombakh,
	Viresh Kumar, Steve Sistare, Tejun Heo, Rik van Riel,
	linux-kernel

On Wed, Sep 1, 2021 at 6:22 PM Daniel Jordan <daniel.m.jordan@oracle.com> wrote:
>
> On Thu, Aug 19, 2021 at 06:04:00PM -0700, Josh Don wrote:
> > +static int se_is_idle(struct sched_entity *se)
> > +{
> > +     return 0;
> > +}
>
> I'm thinking !FAIR_GROUP_SCHED is a rare thing to behold?  So not a big
> deal, but I think this wants to be
>
>         return task_has_idle_policy(task_of(se));
>
> so buddies aren't set for SCHED_IDLE.

Good point. Peter has merged this one already, do you want to send a patch?

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 1/4] sched: cgroup SCHED_IDLE support
       [not found] ` <20210906124702.Q6G0oOWwFOmQSl_jmRms3XQgfz4ROzfE71r3SNgWSf0@z>
@ 2021-09-06 12:47   ` alexs
  2021-08-20  8:39     ` Tao Zhou
                       ` (2 more replies)
  0 siblings, 3 replies; 28+ messages in thread
From: alexs @ 2021-09-06 12:47 UTC (permalink / raw)
  To: alexs, Ingo Molnar, Peter Zijlstra, Juri Lelli, Vincent Guittot
  Cc: Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Paul Turner, Oleg Rombakh,
	Viresh Kumar, Steve Sistare, Tejun Heo, Rik van Riel,
	linux-kernel, Josh Don

...
 
> For now, we maintain the existing SCHED_IDLE semantics. Future patches
> may make improvements that extend how we treat SCHED_IDLE entities.
> 
> The per-task_group idle field is an integer that currently only holds
> either a 0 or a 1. This is explicitly typed as an integer to allow for
> further extensions to this API. For example, a negative value may
> indicate a highly latency-sensitive cgroup that should be preferred for
> preemption/placement/etc.

Hi Josh,

Sounds like there is a ready solution for the colocation problem, isn't there?
I'd like to evaluate its effects if it could be sent out.

Thanks
Alex
> 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 1/4] sched: cgroup SCHED_IDLE support
  2021-09-06 12:47   ` [PATCH v3 1/4] sched: cgroup SCHED_IDLE support alexs
  2021-08-20  8:39     ` Tao Zhou
  2021-09-02  1:22     ` Daniel Jordan
@ 2021-09-08 18:36     ` Josh Don
  2 siblings, 0 replies; 28+ messages in thread
From: Josh Don @ 2021-09-08 18:36 UTC (permalink / raw)
  To: alexs
  Cc: Ingo Molnar, Peter Zijlstra, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Paul Turner, Oleg Rombakh,
	Viresh Kumar, Steve Sistare, Tejun Heo, Rik van Riel,
	linux-kernel

On Mon, Sep 6, 2021 at 5:47 AM <alexs@kernel.org> wrote:
>
> ...
>
> > For now, we maintain the existing SCHED_IDLE semantics. Future patches
> > may make improvements that extend how we treat SCHED_IDLE entities.
> >
> > The per-task_group idle field is an integer that currently only holds
> > either a 0 or a 1. This is explicitly typed as an integer to allow for
> > further extensions to this API. For example, a negative value may
> > indicate a highly latency-sensitive cgroup that should be preferred for
> > preemption/placement/etc.
>
> Hi Josh,
>
> Sounds like there is a ready solution for the colocation problem, isn't there?
> I'd like to evaluate its effects if it could be sent out.
>
> Thanks
> Alex
> >

Hi Alex,

I don't have a patch written just yet. The basic idea is to utilize
the (negative) idle values to make relative decisions about task
preemption and wakeup vruntime placement.

Could you describe your use case?

Best,
Josh

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 1/4] sched: cgroup SCHED_IDLE support
  2021-09-03  1:21       ` Josh Don
@ 2021-09-08 20:38         ` Daniel Jordan
  0 siblings, 0 replies; 28+ messages in thread
From: Daniel Jordan @ 2021-09-08 20:38 UTC (permalink / raw)
  To: Josh Don
  Cc: Ingo Molnar, Peter Zijlstra, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Paul Turner, Oleg Rombakh,
	Viresh Kumar, Steve Sistare, Tejun Heo, Rik van Riel,
	linux-kernel

On Thu, Sep 02, 2021 at 06:21:09PM -0700, Josh Don wrote:
> On Wed, Sep 1, 2021 at 6:22 PM Daniel Jordan <daniel.m.jordan@oracle.com> wrote:
> >
> > On Thu, Aug 19, 2021 at 06:04:00PM -0700, Josh Don wrote:
> > > +static int se_is_idle(struct sched_entity *se)
> > > +{
> > > +     return 0;
> > > +}
> >
> > I'm thinking !FAIR_GROUP_SCHED is a rare thing to behold?  So not a big
> > deal, but I think this wants to be
> >
> >         return task_has_idle_policy(task_of(se));
> >
> > so buddies aren't set for SCHED_IDLE.
> 
> Good point. Peter has merged this one already, do you want to send a patch?

Sure, here it comes.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [tip: sched/core] sched: adjust sleeper credit for SCHED_IDLE entities
  2021-08-20  1:04 ` [PATCH v3 4/4] sched: adjust sleeper credit " Josh Don
  2021-08-23 10:09   ` Vincent Guittot
  2021-08-24  8:16   ` Jiang Biao
@ 2021-09-09 11:18   ` tip-bot2 for Josh Don
  2021-10-05 14:12   ` tip-bot2 for Josh Don
  3 siblings, 0 replies; 28+ messages in thread
From: tip-bot2 for Josh Don @ 2021-09-09 11:18 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Josh Don, Peter Zijlstra (Intel),
	Vincent Guittot, Jiang Biao, x86, linux-kernel

The following commit has been merged into the sched/core branch of tip:

Commit-ID:     bb1fc3bc521782d018902143c8301ab4a5e53557
Gitweb:        https://git.kernel.org/tip/bb1fc3bc521782d018902143c8301ab4a5e53557
Author:        Josh Don <joshdon@google.com>
AuthorDate:    Thu, 19 Aug 2021 18:04:03 -07:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Thu, 09 Sep 2021 11:27:31 +02:00

sched: adjust sleeper credit for SCHED_IDLE entities

Give reduced sleeper credit to SCHED_IDLE entities. As a result, woken
SCHED_IDLE entities will take longer to preempt normal entities.

The benefit of this change is to make it less likely that a newly woken
SCHED_IDLE entity will preempt a short-running normal entity before it
blocks.

We still give a small sleeper credit to SCHED_IDLE entities, so that
idle<->idle competition retains some fairness.

Example: With HZ=1000, spawned four threads affined to one cpu, one of
which was set to SCHED_IDLE. Without this patch, wakeup latency for the
SCHED_IDLE thread was ~1-2ms, with the patch the wakeup latency was
~5ms.

Signed-off-by: Josh Don <joshdon@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Reviewed-by: Jiang Biao <benbjiang@tencent.com>
Link: https://lore.kernel.org/r/20210820010403.946838-5-joshdon@google.com
---
 kernel/sched/fair.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7330a77..b27ed8b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4201,7 +4201,12 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 
 	/* sleeps up to a single latency don't count. */
 	if (!initial) {
-		unsigned long thresh = sysctl_sched_latency;
+		unsigned long thresh;
+
+		if (se_is_idle(se))
+			thresh = sysctl_sched_min_granularity;
+		else
+			thresh = sysctl_sched_latency;
 
 		/*
 		 * Halve their sleep time's effect, to allow

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [tip: sched/core] sched: reduce sched slice for SCHED_IDLE entities
  2021-08-20  1:04 ` [PATCH v3 3/4] sched: reduce sched slice for SCHED_IDLE entities Josh Don
  2021-08-23 10:08   ` Vincent Guittot
  2021-08-24 10:24   ` Jiang Biao
@ 2021-09-09 11:18   ` tip-bot2 for Josh Don
  2021-10-05 14:12   ` tip-bot2 for Josh Don
  3 siblings, 0 replies; 28+ messages in thread
From: tip-bot2 for Josh Don @ 2021-09-09 11:18 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Josh Don, Peter Zijlstra (Intel), Vincent Guittot, x86, linux-kernel

The following commit has been merged into the sched/core branch of tip:

Commit-ID:     7e2ce158699bb7b6a489c7c1d89c0dde2d4ceef5
Gitweb:        https://git.kernel.org/tip/7e2ce158699bb7b6a489c7c1d89c0dde2d4ceef5
Author:        Josh Don <joshdon@google.com>
AuthorDate:    Thu, 19 Aug 2021 18:04:02 -07:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Thu, 09 Sep 2021 11:27:31 +02:00

sched: reduce sched slice for SCHED_IDLE entities

Use a small, non-scaled min granularity for SCHED_IDLE entities, when
competing with normal entities. This reduces the latency of getting
a normal entity back on cpu, at the expense of increased context
switch frequency of SCHED_IDLE entities.

The benefit of this change is to reduce the round-robin latency for
normal entities when competing with a SCHED_IDLE entity.

Example: on a machine with HZ=1000, spawned two threads, one of which is
SCHED_IDLE, and affined to one cpu. Without this patch, the SCHED_IDLE
thread runs for 4ms then waits for 1.4s. With this patch, it runs for
1ms and waits 340ms (as it round-robins with the other thread).

Signed-off-by: Josh Don <joshdon@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Link: https://lore.kernel.org/r/20210820010403.946838-4-joshdon@google.com
---
 kernel/sched/debug.c |  2 ++
 kernel/sched/fair.c  | 29 ++++++++++++++++++++++++-----
 kernel/sched/sched.h |  1 +
 3 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 3353857..317ef56 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -305,6 +305,7 @@ static __init int sched_init_debug(void)
 
 	debugfs_create_u32("latency_ns", 0644, debugfs_sched, &sysctl_sched_latency);
 	debugfs_create_u32("min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_min_granularity);
+	debugfs_create_u32("idle_min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_idle_min_granularity);
 	debugfs_create_u32("wakeup_granularity_ns", 0644, debugfs_sched, &sysctl_sched_wakeup_granularity);
 
 	debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms);
@@ -806,6 +807,7 @@ static void sched_debug_header(struct seq_file *m)
 	SEQ_printf(m, "  .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
 	PN(sysctl_sched_latency);
 	PN(sysctl_sched_min_granularity);
+	PN(sysctl_sched_idle_min_granularity);
 	PN(sysctl_sched_wakeup_granularity);
 	P(sysctl_sched_child_runs_first);
 	P(sysctl_sched_features);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d7c0b9d..7330a77 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -60,6 +60,14 @@ unsigned int sysctl_sched_min_granularity			= 750000ULL;
 static unsigned int normalized_sysctl_sched_min_granularity	= 750000ULL;
 
 /*
+ * Minimal preemption granularity for CPU-bound SCHED_IDLE tasks.
+ * Applies only when SCHED_IDLE tasks compete with normal tasks.
+ *
+ * (default: 0.75 msec)
+ */
+unsigned int sysctl_sched_idle_min_granularity			= 750000ULL;
+
+/*
  * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
  */
 static unsigned int sched_nr_latency = 8;
@@ -665,6 +673,8 @@ static u64 __sched_period(unsigned long nr_running)
 		return sysctl_sched_latency;
 }
 
+static bool sched_idle_cfs_rq(struct cfs_rq *cfs_rq);
+
 /*
  * We calculate the wall-time slice from the period by taking a part
  * proportional to the weight.
@@ -674,6 +684,8 @@ static u64 __sched_period(unsigned long nr_running)
 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	unsigned int nr_running = cfs_rq->nr_running;
+	struct sched_entity *init_se = se;
+	unsigned int min_gran;
 	u64 slice;
 
 	if (sched_feat(ALT_PERIOD))
@@ -684,12 +696,13 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	for_each_sched_entity(se) {
 		struct load_weight *load;
 		struct load_weight lw;
+		struct cfs_rq *qcfs_rq;
 
-		cfs_rq = cfs_rq_of(se);
-		load = &cfs_rq->load;
+		qcfs_rq = cfs_rq_of(se);
+		load = &qcfs_rq->load;
 
 		if (unlikely(!se->on_rq)) {
-			lw = cfs_rq->load;
+			lw = qcfs_rq->load;
 
 			update_load_add(&lw, se->load.weight);
 			load = &lw;
@@ -697,8 +710,14 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		slice = __calc_delta(slice, se->load.weight, load);
 	}
 
-	if (sched_feat(BASE_SLICE))
-		slice = max(slice, (u64)sysctl_sched_min_granularity);
+	if (sched_feat(BASE_SLICE)) {
+		if (se_is_idle(init_se) && !sched_idle_cfs_rq(cfs_rq))
+			min_gran = sysctl_sched_idle_min_granularity;
+		else
+			min_gran = sysctl_sched_min_granularity;
+
+		slice = max_t(u64, slice, min_gran);
+	}
 
 	return slice;
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 413298d..6b2d8b7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2400,6 +2400,7 @@ extern const_debug unsigned int sysctl_sched_migration_cost;
 #ifdef CONFIG_SCHED_DEBUG
 extern unsigned int sysctl_sched_latency;
 extern unsigned int sysctl_sched_min_granularity;
+extern unsigned int sysctl_sched_idle_min_granularity;
 extern unsigned int sysctl_sched_wakeup_granularity;
 extern int sysctl_resched_latency_warn_ms;
 extern int sysctl_resched_latency_warn_once;

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [tip: sched/core] sched: Account number of SCHED_IDLE entities on each cfs_rq
  2021-08-20  1:04 ` [PATCH v3 2/4] sched: account number of SCHED_IDLE entities on each cfs_rq Josh Don
  2021-08-24  7:57   ` Vincent Guittot
@ 2021-09-09 11:18   ` tip-bot2 for Josh Don
  2021-10-05 14:12   ` tip-bot2 for Josh Don
  2 siblings, 0 replies; 28+ messages in thread
From: tip-bot2 for Josh Don @ 2021-09-09 11:18 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Josh Don, Peter Zijlstra (Intel), Vincent Guittot, x86, linux-kernel

The following commit has been merged into the sched/core branch of tip:

Commit-ID:     4b1e9afe8af5becd9b33640a7c996ccfce4ea310
Gitweb:        https://git.kernel.org/tip/4b1e9afe8af5becd9b33640a7c996ccfce4ea310
Author:        Josh Don <joshdon@google.com>
AuthorDate:    Thu, 19 Aug 2021 18:04:01 -07:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Thu, 09 Sep 2021 11:27:31 +02:00

sched: Account number of SCHED_IDLE entities on each cfs_rq

Adds cfs_rq->idle_nr_running, which accounts the number of idle entities
directly enqueued on the cfs_rq.

Signed-off-by: Josh Don <joshdon@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Link: https://lore.kernel.org/r/20210820010403.946838-3-joshdon@google.com
---
 kernel/sched/debug.c |  2 ++
 kernel/sched/fair.c  | 25 ++++++++++++++++++++++++-
 kernel/sched/sched.h |  1 +
 3 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 4971622..3353857 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -608,6 +608,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 			cfs_rq->nr_spread_over);
 	SEQ_printf(m, "  .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
 	SEQ_printf(m, "  .%-30s: %d\n", "h_nr_running", cfs_rq->h_nr_running);
+	SEQ_printf(m, "  .%-30s: %d\n", "idle_nr_running",
+			cfs_rq->idle_nr_running);
 	SEQ_printf(m, "  .%-30s: %d\n", "idle_h_nr_running",
 			cfs_rq->idle_h_nr_running);
 	SEQ_printf(m, "  .%-30s: %ld\n", "load", cfs_rq->load.weight);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2a5efde..d7c0b9d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2995,6 +2995,8 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	}
 #endif
 	cfs_rq->nr_running++;
+	if (se_is_idle(se))
+		cfs_rq->idle_nr_running++;
 }
 
 static void
@@ -3008,6 +3010,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	}
 #endif
 	cfs_rq->nr_running--;
+	if (se_is_idle(se))
+		cfs_rq->idle_nr_running--;
 }
 
 /*
@@ -5544,6 +5548,17 @@ static int sched_idle_rq(struct rq *rq)
 			rq->nr_running);
 }
 
+/*
+ * Returns true if cfs_rq only has SCHED_IDLE entities enqueued. Note the use
+ * of idle_nr_running, which does not consider idle descendants of normal
+ * entities.
+ */
+static bool sched_idle_cfs_rq(struct cfs_rq *cfs_rq)
+{
+	return cfs_rq->nr_running &&
+		cfs_rq->nr_running == cfs_rq->idle_nr_running;
+}
+
 #ifdef CONFIG_SMP
 static int sched_idle_cpu(int cpu)
 {
@@ -11542,7 +11557,7 @@ int sched_group_set_idle(struct task_group *tg, long idle)
 	for_each_possible_cpu(i) {
 		struct rq *rq = cpu_rq(i);
 		struct sched_entity *se = tg->se[i];
-		struct cfs_rq *grp_cfs_rq = tg->cfs_rq[i];
+		struct cfs_rq *parent_cfs_rq, *grp_cfs_rq = tg->cfs_rq[i];
 		bool was_idle = cfs_rq_is_idle(grp_cfs_rq);
 		long idle_task_delta;
 		struct rq_flags rf;
@@ -11553,6 +11568,14 @@ int sched_group_set_idle(struct task_group *tg, long idle)
 		if (WARN_ON_ONCE(was_idle == cfs_rq_is_idle(grp_cfs_rq)))
 			goto next_cpu;
 
+		if (se->on_rq) {
+			parent_cfs_rq = cfs_rq_of(se);
+			if (cfs_rq_is_idle(grp_cfs_rq))
+				parent_cfs_rq->idle_nr_running++;
+			else
+				parent_cfs_rq->idle_nr_running--;
+		}
+
 		idle_task_delta = grp_cfs_rq->h_nr_running -
 				  grp_cfs_rq->idle_h_nr_running;
 		if (!cfs_rq_is_idle(grp_cfs_rq))
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 30b7bd2..413298d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -530,6 +530,7 @@ struct cfs_rq {
 	struct load_weight	load;
 	unsigned int		nr_running;
 	unsigned int		h_nr_running;      /* SCHED_{NORMAL,BATCH,IDLE} */
+	unsigned int		idle_nr_running;   /* SCHED_IDLE */
 	unsigned int		idle_h_nr_running; /* SCHED_IDLE */
 
 	u64			exec_clock;

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [tip: sched/core] sched: adjust sleeper credit for SCHED_IDLE entities
  2021-08-20  1:04 ` [PATCH v3 4/4] sched: adjust sleeper credit " Josh Don
                     ` (2 preceding siblings ...)
  2021-09-09 11:18   ` [tip: sched/core] " tip-bot2 for Josh Don
@ 2021-10-05 14:12   ` tip-bot2 for Josh Don
  3 siblings, 0 replies; 28+ messages in thread
From: tip-bot2 for Josh Don @ 2021-10-05 14:12 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Josh Don, Peter Zijlstra (Intel),
	Vincent Guittot, Jiang Biao, x86, linux-kernel

The following commit has been merged into the sched/core branch of tip:

Commit-ID:     2cae3948edd488ebdef4deaf1d1043f92f47e665
Gitweb:        https://git.kernel.org/tip/2cae3948edd488ebdef4deaf1d1043f92f47e665
Author:        Josh Don <joshdon@google.com>
AuthorDate:    Thu, 19 Aug 2021 18:04:03 -07:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Tue, 05 Oct 2021 15:51:39 +02:00

sched: adjust sleeper credit for SCHED_IDLE entities

Give reduced sleeper credit to SCHED_IDLE entities. As a result, woken
SCHED_IDLE entities will take longer to preempt normal entities.

The benefit of this change is to make it less likely that a newly woken
SCHED_IDLE entity will preempt a short-running normal entity before it
blocks.

We still give a small sleeper credit to SCHED_IDLE entities, so that
idle<->idle competition retains some fairness.

Example: With HZ=1000, spawned four threads affined to one cpu, one of
which was set to SCHED_IDLE. Without this patch, wakeup latency for the
SCHED_IDLE thread was ~1-2ms, with the patch the wakeup latency was
~5ms.

Signed-off-by: Josh Don <joshdon@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Reviewed-by: Jiang Biao <benbjiang@tencent.com>
Link: https://lore.kernel.org/r/20210820010403.946838-5-joshdon@google.com
---
 kernel/sched/fair.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d835061..5457c80 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4230,7 +4230,12 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 
 	/* sleeps up to a single latency don't count. */
 	if (!initial) {
-		unsigned long thresh = sysctl_sched_latency;
+		unsigned long thresh;
+
+		if (se_is_idle(se))
+			thresh = sysctl_sched_min_granularity;
+		else
+			thresh = sysctl_sched_latency;
 
 		/*
 		 * Halve their sleep time's effect, to allow

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [tip: sched/core] sched: reduce sched slice for SCHED_IDLE entities
  2021-08-20  1:04 ` [PATCH v3 3/4] sched: reduce sched slice for SCHED_IDLE entities Josh Don
                     ` (2 preceding siblings ...)
  2021-09-09 11:18   ` [tip: sched/core] " tip-bot2 for Josh Don
@ 2021-10-05 14:12   ` tip-bot2 for Josh Don
  3 siblings, 0 replies; 28+ messages in thread
From: tip-bot2 for Josh Don @ 2021-10-05 14:12 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Josh Don, Peter Zijlstra (Intel), Vincent Guittot, x86, linux-kernel

The following commit has been merged into the sched/core branch of tip:

Commit-ID:     51ce83ed523b00d58f2937ec014b12daaad55185
Gitweb:        https://git.kernel.org/tip/51ce83ed523b00d58f2937ec014b12daaad55185
Author:        Josh Don <joshdon@google.com>
AuthorDate:    Thu, 19 Aug 2021 18:04:02 -07:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Tue, 05 Oct 2021 15:51:37 +02:00

sched: reduce sched slice for SCHED_IDLE entities

Use a small, non-scaled min granularity for SCHED_IDLE entities, when
competing with normal entities. This reduces the latency of getting
a normal entity back on cpu, at the expense of increased context
switch frequency of SCHED_IDLE entities.

The benefit of this change is to reduce the round-robin latency for
normal entities when competing with a SCHED_IDLE entity.

Example: on a machine with HZ=1000, spawned two threads, one of which is
SCHED_IDLE, and affined to one cpu. Without this patch, the SCHED_IDLE
thread runs for 4ms then waits for 1.4s. With this patch, it runs for
1ms and waits 340ms (as it round-robins with the other thread).

Signed-off-by: Josh Don <joshdon@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Link: https://lore.kernel.org/r/20210820010403.946838-4-joshdon@google.com
---
 kernel/sched/debug.c |  2 ++
 kernel/sched/fair.c  | 29 ++++++++++++++++++++++++-----
 kernel/sched/sched.h |  1 +
 3 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 2e5fdd9..34913a7 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -311,6 +311,7 @@ static __init int sched_init_debug(void)
 
 	debugfs_create_u32("latency_ns", 0644, debugfs_sched, &sysctl_sched_latency);
 	debugfs_create_u32("min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_min_granularity);
+	debugfs_create_u32("idle_min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_idle_min_granularity);
 	debugfs_create_u32("wakeup_granularity_ns", 0644, debugfs_sched, &sysctl_sched_wakeup_granularity);
 
 	debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms);
@@ -812,6 +813,7 @@ static void sched_debug_header(struct seq_file *m)
 	SEQ_printf(m, "  .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
 	PN(sysctl_sched_latency);
 	PN(sysctl_sched_min_granularity);
+	PN(sysctl_sched_idle_min_granularity);
 	PN(sysctl_sched_wakeup_granularity);
 	P(sysctl_sched_child_runs_first);
 	P(sysctl_sched_features);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9c78c16..d835061 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -60,6 +60,14 @@ unsigned int sysctl_sched_min_granularity			= 750000ULL;
 static unsigned int normalized_sysctl_sched_min_granularity	= 750000ULL;
 
 /*
+ * Minimal preemption granularity for CPU-bound SCHED_IDLE tasks.
+ * Applies only when SCHED_IDLE tasks compete with normal tasks.
+ *
+ * (default: 0.75 msec)
+ */
+unsigned int sysctl_sched_idle_min_granularity			= 750000ULL;
+
+/*
  * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
  */
 static unsigned int sched_nr_latency = 8;
@@ -665,6 +673,8 @@ static u64 __sched_period(unsigned long nr_running)
 		return sysctl_sched_latency;
 }
 
+static bool sched_idle_cfs_rq(struct cfs_rq *cfs_rq);
+
 /*
  * We calculate the wall-time slice from the period by taking a part
  * proportional to the weight.
@@ -674,6 +684,8 @@ static u64 __sched_period(unsigned long nr_running)
 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	unsigned int nr_running = cfs_rq->nr_running;
+	struct sched_entity *init_se = se;
+	unsigned int min_gran;
 	u64 slice;
 
 	if (sched_feat(ALT_PERIOD))
@@ -684,12 +696,13 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	for_each_sched_entity(se) {
 		struct load_weight *load;
 		struct load_weight lw;
+		struct cfs_rq *qcfs_rq;
 
-		cfs_rq = cfs_rq_of(se);
-		load = &cfs_rq->load;
+		qcfs_rq = cfs_rq_of(se);
+		load = &qcfs_rq->load;
 
 		if (unlikely(!se->on_rq)) {
-			lw = cfs_rq->load;
+			lw = qcfs_rq->load;
 
 			update_load_add(&lw, se->load.weight);
 			load = &lw;
@@ -697,8 +710,14 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		slice = __calc_delta(slice, se->load.weight, load);
 	}
 
-	if (sched_feat(BASE_SLICE))
-		slice = max(slice, (u64)sysctl_sched_min_granularity);
+	if (sched_feat(BASE_SLICE)) {
+		if (se_is_idle(init_se) && !sched_idle_cfs_rq(cfs_rq))
+			min_gran = sysctl_sched_idle_min_granularity;
+		else
+			min_gran = sysctl_sched_min_granularity;
+
+		slice = max_t(u64, slice, min_gran);
+	}
 
 	return slice;
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index f2965b5..15a8895 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2403,6 +2403,7 @@ extern const_debug unsigned int sysctl_sched_migration_cost;
 #ifdef CONFIG_SCHED_DEBUG
 extern unsigned int sysctl_sched_latency;
 extern unsigned int sysctl_sched_min_granularity;
+extern unsigned int sysctl_sched_idle_min_granularity;
 extern unsigned int sysctl_sched_wakeup_granularity;
 extern int sysctl_resched_latency_warn_ms;
 extern int sysctl_resched_latency_warn_once;

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [tip: sched/core] sched: Account number of SCHED_IDLE entities on each cfs_rq
  2021-08-20  1:04 ` [PATCH v3 2/4] sched: account number of SCHED_IDLE entities on each cfs_rq Josh Don
  2021-08-24  7:57   ` Vincent Guittot
  2021-09-09 11:18   ` [tip: sched/core] sched: Account " tip-bot2 for Josh Don
@ 2021-10-05 14:12   ` tip-bot2 for Josh Don
  2 siblings, 0 replies; 28+ messages in thread
From: tip-bot2 for Josh Don @ 2021-10-05 14:12 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Josh Don, Peter Zijlstra (Intel), Vincent Guittot, x86, linux-kernel

The following commit has been merged into the sched/core branch of tip:

Commit-ID:     a480addecc0d89c200ec0b41da62ae8ceddca8d7
Gitweb:        https://git.kernel.org/tip/a480addecc0d89c200ec0b41da62ae8ceddca8d7
Author:        Josh Don <joshdon@google.com>
AuthorDate:    Thu, 19 Aug 2021 18:04:01 -07:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Tue, 05 Oct 2021 15:51:36 +02:00

sched: Account number of SCHED_IDLE entities on each cfs_rq

Adds cfs_rq->idle_nr_running, which accounts the number of idle entities
directly enqueued on the cfs_rq.

Signed-off-by: Josh Don <joshdon@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Link: https://lore.kernel.org/r/20210820010403.946838-3-joshdon@google.com
---
 kernel/sched/debug.c |  2 ++
 kernel/sched/fair.c  | 25 ++++++++++++++++++++++++-
 kernel/sched/sched.h |  1 +
 3 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 17a653b..2e5fdd9 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -614,6 +614,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 			cfs_rq->nr_spread_over);
 	SEQ_printf(m, "  .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
 	SEQ_printf(m, "  .%-30s: %d\n", "h_nr_running", cfs_rq->h_nr_running);
+	SEQ_printf(m, "  .%-30s: %d\n", "idle_nr_running",
+			cfs_rq->idle_nr_running);
 	SEQ_printf(m, "  .%-30s: %d\n", "idle_h_nr_running",
 			cfs_rq->idle_h_nr_running);
 	SEQ_printf(m, "  .%-30s: %ld\n", "load", cfs_rq->load.weight);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6cc958e..9c78c16 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2995,6 +2995,8 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	}
 #endif
 	cfs_rq->nr_running++;
+	if (se_is_idle(se))
+		cfs_rq->idle_nr_running++;
 }
 
 static void
@@ -3008,6 +3010,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	}
 #endif
 	cfs_rq->nr_running--;
+	if (se_is_idle(se))
+		cfs_rq->idle_nr_running--;
 }
 
 /*
@@ -5577,6 +5581,17 @@ static int sched_idle_rq(struct rq *rq)
 			rq->nr_running);
 }
 
+/*
+ * Returns true if cfs_rq only has SCHED_IDLE entities enqueued. Note the use
+ * of idle_nr_running, which does not consider idle descendants of normal
+ * entities.
+ */
+static bool sched_idle_cfs_rq(struct cfs_rq *cfs_rq)
+{
+	return cfs_rq->nr_running &&
+		cfs_rq->nr_running == cfs_rq->idle_nr_running;
+}
+
 #ifdef CONFIG_SMP
 static int sched_idle_cpu(int cpu)
 {
@@ -11575,7 +11590,7 @@ int sched_group_set_idle(struct task_group *tg, long idle)
 	for_each_possible_cpu(i) {
 		struct rq *rq = cpu_rq(i);
 		struct sched_entity *se = tg->se[i];
-		struct cfs_rq *grp_cfs_rq = tg->cfs_rq[i];
+		struct cfs_rq *parent_cfs_rq, *grp_cfs_rq = tg->cfs_rq[i];
 		bool was_idle = cfs_rq_is_idle(grp_cfs_rq);
 		long idle_task_delta;
 		struct rq_flags rf;
@@ -11586,6 +11601,14 @@ int sched_group_set_idle(struct task_group *tg, long idle)
 		if (WARN_ON_ONCE(was_idle == cfs_rq_is_idle(grp_cfs_rq)))
 			goto next_cpu;
 
+		if (se->on_rq) {
+			parent_cfs_rq = cfs_rq_of(se);
+			if (cfs_rq_is_idle(grp_cfs_rq))
+				parent_cfs_rq->idle_nr_running++;
+			else
+				parent_cfs_rq->idle_nr_running--;
+		}
+
 		idle_task_delta = grp_cfs_rq->h_nr_running -
 				  grp_cfs_rq->idle_h_nr_running;
 		if (!cfs_rq_is_idle(grp_cfs_rq))
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 1fec313..f2965b5 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -530,6 +530,7 @@ struct cfs_rq {
 	struct load_weight	load;
 	unsigned int		nr_running;
 	unsigned int		h_nr_running;      /* SCHED_{NORMAL,BATCH,IDLE} */
+	unsigned int		idle_nr_running;   /* SCHED_IDLE */
 	unsigned int		idle_h_nr_running; /* SCHED_IDLE */
 
 	u64			exec_clock;

^ permalink raw reply	[flat|nested] 28+ messages in thread

end of thread, other threads:[~2021-10-05 14:13 UTC | newest]

Thread overview: 28+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-08-20  1:03 [PATCH v3 0/4] SCHED_IDLE extensions Josh Don
2021-08-20  1:04 ` [PATCH v3 2/4] sched: account number of SCHED_IDLE entities on each cfs_rq Josh Don
2021-08-24  7:57   ` Vincent Guittot
2021-09-09 11:18   ` [tip: sched/core] sched: Account " tip-bot2 for Josh Don
2021-10-05 14:12   ` tip-bot2 for Josh Don
2021-08-20  1:04 ` [PATCH v3 3/4] sched: reduce sched slice for SCHED_IDLE entities Josh Don
2021-08-23 10:08   ` Vincent Guittot
2021-08-23 17:40     ` Josh Don
2021-08-24  7:56       ` Vincent Guittot
2021-08-24 10:24   ` Jiang Biao
2021-08-24 17:04     ` Josh Don
2021-08-25  2:43       ` Jiang Biao
2021-09-09 11:18   ` [tip: sched/core] " tip-bot2 for Josh Don
2021-10-05 14:12   ` tip-bot2 for Josh Don
2021-08-20  1:04 ` [PATCH v3 4/4] sched: adjust sleeper credit " Josh Don
2021-08-23 10:09   ` Vincent Guittot
2021-08-24  8:16   ` Jiang Biao
2021-08-24 17:12     ` Josh Don
2021-09-09 11:18   ` [tip: sched/core] " tip-bot2 for Josh Don
2021-10-05 14:12   ` tip-bot2 for Josh Don
     [not found] ` <20210906124702.Q6G0oOWwFOmQSl_jmRms3XQgfz4ROzfE71r3SNgWSf0@z>
2021-09-06 12:47   ` [PATCH v3 1/4] sched: cgroup SCHED_IDLE support alexs
2021-08-20  8:39     ` Tao Zhou
2021-08-23 17:29       ` Josh Don
2021-08-23 22:45         ` Tao Zhou
2021-09-02  1:22     ` Daniel Jordan
2021-09-03  1:21       ` Josh Don
2021-09-08 20:38         ` Daniel Jordan
2021-09-08 18:36     ` Josh Don

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for NNTP newsgroup(s).