linux-kernel.vger.kernel.org archive mirror
* [PATCH v3] sched/deadline: Fix sched_dl_global_validate()
@ 2020-09-19  1:42 Peng Liu
  2020-09-24 10:57 ` Peter Zijlstra
  0 siblings, 1 reply; 3+ messages in thread
From: Peng Liu @ 2020-09-19  1:42 UTC (permalink / raw)
  To: linux-kernel
  Cc: mingo, peterz, juri.lelli, vincent.guittot, dietmar.eggemann,
	rostedt, bsegall, mgorman, bristot, valentin.schneider, raistlin,
	iwtbavbm

When user changes sched_rt_{runtime, period}_us, then

  sched_rt_handler()
    -->	sched_dl_bandwidth_validate()
	{
		new_bw = global_rt_runtime()/global_rt_period();

		for_each_possible_cpu(cpu) {
			dl_b = dl_bw_of(cpu);
			if (new_bw < dl_b->total_bw)
				ret = -EBUSY;
		}
	}

Under CONFIG_SMP, dl_bw is per root domain, not per CPU, so
dl_b->total_bw is the allocated bandwidth of the whole root domain.
We should therefore compare dl_b->total_bw against cpus * new_bw,
where 'cpus' is the number of CPUs in the root domain.
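
A minimal, standalone illustration (ordinary userspace C, not kernel
code; it only mimics the kernel's to_ratio()/BW_SHIFT fixed-point
convention where 1 << 20 means 100%) of why the old check rejects a
valid configuration: a 2-CPU root domain already holds a 25%
-deadline task while the global limit is lowered to 20% (the scenario
described in the reply below).

  #include <stdint.h>
  #include <stdio.h>

  #define BW_SHIFT	20

  static uint64_t to_ratio(uint64_t period, uint64_t runtime)
  {
  	return (runtime << BW_SHIFT) / period;
  }

  int main(void)
  {
  	uint64_t total_bw = to_ratio(1000000, 250000); /* 25% allocated in the rd */
  	uint64_t new_bw   = to_ratio(1000000, 200000); /* new 20% global limit    */
  	int cpus = 2;                                  /* CPUs in the root domain */

  	/* Old check: compares a per-CPU limit with a per-rd total -> -EBUSY. */
  	printf("old check: %s\n", new_bw < total_bw ? "-EBUSY" : "ok");

  	/* Fixed check: scale the limit by the rd's CPU count -> ok. */
  	printf("new check: %s\n", new_bw * cpus < total_bw ? "-EBUSY" : "ok");

  	return 0;
  }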

Also, the annotation below (in kernel/sched/sched.h) describes an
implementation that only existed in SCHED_DEADLINE v2[1]. The deadline
scheduler kept evolving until it was merged (v9), but the annotation
was never updated, so it is now meaningless and misleading. Correct it.

* With respect to SMP, the bandwidth is given on a per-CPU basis,
* meaning that:
*  - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU;
*  - dl_total_bw array contains, in the i-eth element, the currently
*    allocated bandwidth on the i-eth CPU.

[1] https://lkml.org/lkml/2010/2/28/119

[!CONFIG_SMP build error]
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Peng Liu <iwtbavbm@gmail.com>
---
v3 <-- v2:
 Fix build error for !CONFIG_SMP, reported by kernel test robot.

v2 <-- v1:
 Replace cpumask_weight(cpu_rq(cpu)->rd->span) with dl_bw_cpus(cpu),
 suggested by Juri.

 kernel/sched/deadline.c | 43 ++++++++++++++++++++++++++++-------------
 kernel/sched/sched.h    | 17 +++++-----------
 2 files changed, 35 insertions(+), 25 deletions(-)

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 3862a28cd05d..39ec0be574e8 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2511,33 +2511,45 @@ const struct sched_class dl_sched_class
 	.update_curr		= update_curr_dl,
 };
 
+#ifdef CONFIG_SMP
+static struct cpumask dl_local_possible_mask;
+#endif /* CONFIG_SMP */
+
 int sched_dl_global_validate(void)
 {
 	u64 runtime = global_rt_runtime();
 	u64 period = global_rt_period();
 	u64 new_bw = to_ratio(period, runtime);
 	struct dl_bw *dl_b;
-	int cpu, ret = 0;
+	int cpu, cpus, ret = 0;
 	unsigned long flags;
 
 	/*
 	 * Here we want to check the bandwidth not being set to some
 	 * value smaller than the currently allocated bandwidth in
 	 * any of the root_domains.
-	 *
-	 * FIXME: Cycling on all the CPUs is overdoing, but simpler than
-	 * cycling on root_domains... Discussion on different/better
-	 * solutions is welcome!
 	 */
+#ifdef CONFIG_SMP
+	cpumask_t *possible_mask = &dl_local_possible_mask;
+
+	cpumask_copy(possible_mask, cpu_possible_mask);
+	for_each_cpu(cpu, possible_mask) {
+#else
 	for_each_possible_cpu(cpu) {
+#endif /* CONFIG_SMP */
 		rcu_read_lock_sched();
 		dl_b = dl_bw_of(cpu);
-
+		cpus = dl_bw_cpus(cpu);
+#ifdef CONFIG_SMP
+		/* Do the "andnot" operation iff it's necessary. */
+		if (cpus > 1)
+			cpumask_andnot(possible_mask, possible_mask,
+				       cpu_rq(cpu)->rd->span);
+#endif /* CONFIG_SMP */
 		raw_spin_lock_irqsave(&dl_b->lock, flags);
-		if (new_bw < dl_b->total_bw)
+		if (new_bw * cpus < dl_b->total_bw)
 			ret = -EBUSY;
 		raw_spin_unlock_irqrestore(&dl_b->lock, flags);
-
 		rcu_read_unlock_sched();
 
 		if (ret)
@@ -2566,6 +2578,7 @@ void sched_dl_do_global(void)
 	struct dl_bw *dl_b;
 	int cpu;
 	unsigned long flags;
+	cpumask_t *possible_mask = NULL;
 
 	def_dl_bandwidth.dl_period = global_rt_period();
 	def_dl_bandwidth.dl_runtime = global_rt_runtime();
@@ -2573,17 +2586,21 @@ void sched_dl_do_global(void)
 	if (global_rt_runtime() != RUNTIME_INF)
 		new_bw = to_ratio(global_rt_period(), global_rt_runtime());
 
-	/*
-	 * FIXME: As above...
-	 */
-	for_each_possible_cpu(cpu) {
+#ifdef CONFIG_SMP
+	possible_mask = &dl_local_possible_mask;
+	cpumask_copy(possible_mask, cpu_possible_mask);
+#endif /* CONFIG_SMP */
+	for_each_cpu(cpu, possible_mask) {
 		rcu_read_lock_sched();
 		dl_b = dl_bw_of(cpu);
 
 		raw_spin_lock_irqsave(&dl_b->lock, flags);
 		dl_b->bw = new_bw;
 		raw_spin_unlock_irqrestore(&dl_b->lock, flags);
-
+#ifdef CONFIG_SMP
+		cpumask_andnot(possible_mask, possible_mask,
+			       cpu_rq(cpu)->rd->span);
+#endif /* CONFIG_SMP */
 		rcu_read_unlock_sched();
 		init_dl_rq_bw_ratio(&cpu_rq(cpu)->dl);
 	}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 28709f6b0975..2602544e06ff 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -258,9 +258,9 @@ struct rt_bandwidth {
 void __dl_clear_params(struct task_struct *p);
 
 /*
- * To keep the bandwidth of -deadline tasks and groups under control
+ * To keep the bandwidth of -deadline tasks under control
  * we need some place where:
- *  - store the maximum -deadline bandwidth of the system (the group);
+ *  - store the maximum -deadline bandwidth of each root domain;
  *  - cache the fraction of that bandwidth that is currently allocated.
  *
  * This is all done in the data structure below. It is similar to the
@@ -269,17 +269,10 @@ void __dl_clear_params(struct task_struct *p);
  * do not decrease any runtime while the group "executes", neither we
  * need a timer to replenish it.
  *
- * With respect to SMP, the bandwidth is given on a per-CPU basis,
+ * With respect to SMP, the bandwidth is given on a per root domain basis,
  * meaning that:
- *  - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU;
- *  - dl_total_bw array contains, in the i-eth element, the currently
- *    allocated bandwidth on the i-eth CPU.
- * Moreover, groups consume bandwidth on each CPU, while tasks only
- * consume bandwidth on the CPU they're running on.
- * Finally, dl_total_bw_cpu is used to cache the index of dl_total_bw
- * that will be shown the next time the proc or cgroup controls will
- * be red. It on its turn can be changed by writing on its own
- * control.
+ *  - bw (< 100%) is the bandwidth of the system on each CPU;
+ *  - total_bw is the currently allocated bandwidth on each root domain.
  */
 struct dl_bandwidth {
 	raw_spinlock_t		dl_runtime_lock;
-- 
2.20.1



* Re: [PATCH v3] sched/deadline: Fix sched_dl_global_validate()
  2020-09-19  1:42 [PATCH v3] sched/deadline: Fix sched_dl_global_validate() Peng Liu
@ 2020-09-24 10:57 ` Peter Zijlstra
  2020-09-24 15:59   ` Peng Liu
  0 siblings, 1 reply; 3+ messages in thread
From: Peter Zijlstra @ 2020-09-24 10:57 UTC (permalink / raw)
  To: Peng Liu
  Cc: linux-kernel, mingo, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, bristot,
	valentin.schneider, raistlin

On Sat, Sep 19, 2020 at 09:42:49AM +0800, Peng Liu wrote:
> When user changes sched_rt_{runtime, period}_us, then
> 
>   sched_rt_handler()
>     -->	sched_dl_bandwidth_validate()
> 	{
> 		new_bw = global_rt_runtime()/global_rt_period();
> 
> 		for_each_possible_cpu(cpu) {
> 			dl_b = dl_bw_of(cpu);
> 			if (new_bw < dl_b->total_bw)
> 				ret = -EBUSY;
> 		}
> 	}
> 
> Under CONFIG_SMP, dl_bw is per root domain, not per CPU, so
> dl_b->total_bw is the allocated bandwidth of the whole root domain.
> We should therefore compare dl_b->total_bw against cpus * new_bw,
> where 'cpus' is the number of CPUs in the root domain.

Is there an actual problem there? Spell it out.

> Also, the annotation below (in kernel/sched/sched.h) describes an
> implementation that only existed in SCHED_DEADLINE v2[1]. The deadline
> scheduler kept evolving until it was merged (v9), but the annotation
> was never updated, so it is now meaningless and misleading. Correct it.
> 
> * With respect to SMP, the bandwidth is given on a per-CPU basis,
> * meaning that:
> *  - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU;
> *  - dl_total_bw array contains, in the i-eth element, the currently
> *    allocated bandwidth on the i-eth CPU.
> 
> [1] https://lkml.org/lkml/2010/2/28/119

Don't use lkml.org links, use lkml.kernel.org/r/$MsgID instead.

> [!CONFIG_SMP build error]
> Reported-by: kernel test robot <lkp@intel.com>
> Signed-off-by: Peng Liu <iwtbavbm@gmail.com>

Quite frankly this patch is horrible #ifdef soup.

Can't you make something like the below work?

---
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 3862a28cd05d..3f309e0f69f5 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -97,6 +97,17 @@ static inline unsigned long dl_bw_capacity(int i)
 		return __dl_bw_capacity(i);
 	}
 }
+
+static inline bool dl_bw_visited(int cpu, u64 gen)
+{
+	struct root_domain *rd = cpu_rq(cpu)->rd;
+
+	if (rd->visit_gen == gen)
+		return true;
+
+	rd->visit_gen = gen;
+	return false;
+}
 #else
 static inline struct dl_bw *dl_bw_of(int i)
 {
@@ -112,6 +123,11 @@ static inline unsigned long dl_bw_capacity(int i)
 {
 	return SCHED_CAPACITY_SCALE;
 }
+
+static inline bool dl_bw_visited(int cpu, u64 gen)
+{
+	return false;
+}
 #endif
 
 static inline
@@ -2513,31 +2529,35 @@ const struct sched_class dl_sched_class
 
 int sched_dl_global_validate(void)
 {
+	static u64 generation = 0;
 	u64 runtime = global_rt_runtime();
 	u64 period = global_rt_period();
 	u64 new_bw = to_ratio(period, runtime);
-	struct dl_bw *dl_b;
-	int cpu, ret = 0;
+	int cpu, cpus, ret = 0;
 	unsigned long flags;
+	struct dl_bw *dl_b;
+	u64 gen = ++generation;
 
 	/*
 	 * Here we want to check the bandwidth not being set to some
 	 * value smaller than the currently allocated bandwidth in
 	 * any of the root_domains.
-	 *
-	 * FIXME: Cycling on all the CPUs is overdoing, but simpler than
-	 * cycling on root_domains... Discussion on different/better
-	 * solutions is welcome!
 	 */
 	for_each_possible_cpu(cpu) {
+
 		rcu_read_lock_sched();
+		if (dl_bw_visited(cpu, gen))
+			goto next;
+
 		dl_b = dl_bw_of(cpu);
+		cpus = dl_bw_cpus(cpu);
 
 		raw_spin_lock_irqsave(&dl_b->lock, flags);
-		if (new_bw < dl_b->total_bw)
+		if (new_bw * cpus < dl_b->total_bw)
 			ret = -EBUSY;
 		raw_spin_unlock_irqrestore(&dl_b->lock, flags);
 
+	next:
 		rcu_read_unlock_sched();
 
 		if (ret)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 28709f6b0975..7f0947db6e2c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -801,6 +801,8 @@ struct root_domain {
 	struct dl_bw		dl_bw;
 	struct cpudl		cpudl;
 
+	u64			visit_gen;
+
 #ifdef HAVE_RT_PUSH_IPI
 	/*
 	 * For IPI pull requests, loop across the rto_mask.
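
For reference, a minimal userspace sketch of the generation-counter
idiom suggested above (the root-domain layout and cpu_rd[] mapping
here are invented purely for illustration): each write bumps the
generation, the first CPU of a root domain stamps it, and the
remaining CPUs of that domain are skipped, so each root domain is
checked exactly once per write.

  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  #define NR_CPUS 4

  struct root_domain { uint64_t visit_gen; };

  /* Two mock root domains: CPUs 0-1 in rd[0], CPUs 2-3 in rd[1]. */
  static struct root_domain rd[2];
  static struct root_domain *cpu_rd[NR_CPUS] = { &rd[0], &rd[0], &rd[1], &rd[1] };

  static bool dl_bw_visited(int cpu, uint64_t gen)
  {
  	struct root_domain *r = cpu_rd[cpu];

  	if (r->visit_gen == gen)
  		return true;

  	r->visit_gen = gen;
  	return false;
  }

  int main(void)
  {
  	static uint64_t generation;
  	uint64_t gen = ++generation;	/* one bump per sysctl write */
  	int cpu;

  	for (cpu = 0; cpu < NR_CPUS; cpu++) {
  		if (dl_bw_visited(cpu, gen))
  			continue;
  		printf("validating root domain of CPU %d\n", cpu);
  	}
  	/* Prints only for CPU 0 and CPU 2: one check per root domain. */
  	return 0;
  }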


* Re: [PATCH v3] sched/deadline: Fix sched_dl_global_validate()
  2020-09-24 10:57 ` Peter Zijlstra
@ 2020-09-24 15:59   ` Peng Liu
  0 siblings, 0 replies; 3+ messages in thread
From: Peng Liu @ 2020-09-24 15:59 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, juri.lelli, vincent.guittot, dietmar.eggemann, rostedt,
	bsegall, mgorman, bristot, valentin.schneider, raistlin,
	iwtbavbm, linux-kernel

Hi Peter,

Thanks for looking at this.

On Thu, Sep 24, 2020 at 12:57:14PM +0200, Peter Zijlstra wrote:
> On Sat, Sep 19, 2020 at 09:42:49AM +0800, Peng Liu wrote:
> > When user changes sched_rt_{runtime, period}_us, then
> > 
> >   sched_rt_handler()
> >     -->	sched_dl_bandwidth_validate()
> > 	{
> > 		new_bw = global_rt_runtime()/global_rt_period();
> > 
> > 		for_each_possible_cpu(cpu) {
> > 			dl_b = dl_bw_of(cpu);
> > 			if (new_bw < dl_b->total_bw)
> > 				ret = -EBUSY;
> > 		}
> > 	}
> > 
> > Under CONFIG_SMP, dl_bw is per root domain, not per CPU, so
> > dl_b->total_bw is the allocated bandwidth of the whole root domain.
> > We should therefore compare dl_b->total_bw against cpus * new_bw,
> > where 'cpus' is the number of CPUs in the root domain.
> 
> Is there an actual problem there? Spell it out.

I created another root domain (containing 2 CPUs) besides the default
one, with the global default rt bandwidth at 95%. Then I launched a
DL process which needs 25% bandwidth and moved it to the new root
domain; so far so good.

Then I tried to change the global rt bandwidth to 20% with:
	echo 200000 > /proc/sys/kernel/sched_rt_runtime_us
but it failed with a "device busy" error. Only values greater than
250000 would work.

The new root domain contains two CPUs, so it should be able to provide
2 * 20% (> 25%) of bandwidth in total. The error is therefore strange.

Finally I found that it is sched_dl_global_validate() that does the
validation wrong: it doesn't multiply new_bw by the root domain weight.

way to reproduce:
cd /sys/fs/cgroup/cpuset/
mkdir cluster
echo 0 > cpuset.sched_load_balance
cd cluster
echo 10-11 > cpuset.cpus
echo 0 > cpuset.mems
echo 1 > cpuset.cpu_exclusive
echo pid-of-dl25 > tasks

> > 
> > [1] https://lkml.org/lkml/2010/2/28/119
> 
> Don't use lkml.org links, use lkml.kernel.org/r/$MsgID instead.

OK, I will.

> 
> > [!CONFIG_SMP build error]
> > Reported-by: kernel test robot <lkp@intel.com>
> > Signed-off-by: Peng Liu <iwtbavbm@gmail.com>
> 
> Quite frankly this patch is horrible #ifdef soup.

Frankly speaking, I also hate the ugly #ifdefs, but I had no idea how
to eliminate them until I saw your method. Indeed, it is quite clear.
I will refine the patch according to your suggestion. Thanks.

> 
> Can't you make something like the below work?
> 
> ---
> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> index 3862a28cd05d..3f309e0f69f5 100644
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -97,6 +97,17 @@ static inline unsigned long dl_bw_capacity(int i)
>  		return __dl_bw_capacity(i);
>  	}
>  }
> +
> +static inline bool dl_bw_visited(int cpu, u64 gen)
> +{
> +	struct root_domain *rd = cpu_rq(cpu)->rd;
> +
> +	if (rd->visit_gen == gen)
> +		return true;
> +
> +	rd->visit_gen = gen;
> +	return false;
> +}
>  #else
>  static inline struct dl_bw *dl_bw_of(int i)
>  {
> @@ -112,6 +123,11 @@ static inline unsigned long dl_bw_capacity(int i)
>  {
>  	return SCHED_CAPACITY_SCALE;
>  }
> +
> +static inline bool dl_bw_visited(int cpu, u64 gen)
> +{
> +	return false;
> +}
>  #endif
>  
>  static inline
> @@ -2513,31 +2529,35 @@ const struct sched_class dl_sched_class
>  
>  int sched_dl_global_validate(void)
>  {
> +	static u64 generation = 0;
>  	u64 runtime = global_rt_runtime();
>  	u64 period = global_rt_period();
>  	u64 new_bw = to_ratio(period, runtime);
> -	struct dl_bw *dl_b;
> -	int cpu, ret = 0;
> +	int cpu, cpus, ret = 0;
>  	unsigned long flags;
> +	struct dl_bw *dl_b;
> +	u64 gen = ++generation;
>  
>  	/*
>  	 * Here we want to check the bandwidth not being set to some
>  	 * value smaller than the currently allocated bandwidth in
>  	 * any of the root_domains.
> -	 *
> -	 * FIXME: Cycling on all the CPUs is overdoing, but simpler than
> -	 * cycling on root_domains... Discussion on different/better
> -	 * solutions is welcome!
>  	 */
>  	for_each_possible_cpu(cpu) {
> +
>  		rcu_read_lock_sched();
> +		if (dl_bw_visited(cpu, gen))
> +			goto next;
> +
>  		dl_b = dl_bw_of(cpu);
> +		cpus = dl_bw_cpus(cpu);
>  
>  		raw_spin_lock_irqsave(&dl_b->lock, flags);
> -		if (new_bw < dl_b->total_bw)
> +		if (new_bw * cpus < dl_b->total_bw)
>  			ret = -EBUSY;
>  		raw_spin_unlock_irqrestore(&dl_b->lock, flags);
>  
> +	next:
>  		rcu_read_unlock_sched();
>  
>  		if (ret)
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 28709f6b0975..7f0947db6e2c 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -801,6 +801,8 @@ struct root_domain {
>  	struct dl_bw		dl_bw;
>  	struct cpudl		cpudl;
>  
> +	u64			visit_gen;
> +
>  #ifdef HAVE_RT_PUSH_IPI
>  	/*
>  	 * For IPI pull requests, loop across the rto_mask.

