* [PATCH] sched/rt: optimize checking group rt scheduler constraints
@ 2020-01-25 14:50 Konstantin Khlebnikov
  2020-01-25 15:32 ` Cyrill Gorcunov
                   ` (2 more replies)
  0 siblings, 3 replies; 7+ messages in thread
From: Konstantin Khlebnikov @ 2020-01-25 14:50 UTC (permalink / raw)
  To: linux-kernel
  Cc: Juri Lelli, Vincent Guittot, Peter Zijlstra, Steven Rostedt,
	Ben Segall, Ingo Molnar, Mel Gorman, Dietmar Eggemann

The group RT scheduler contains protection against setting zero runtime for
a cgroup with rt tasks. Right now tg_set_rt_bandwidth() iterates over all
cpu cgroups and calls tg_has_rt_tasks() for every cgroup whose runtime is
zero (not only for the changed one). The default rt runtime is zero, so
tg_has_rt_tasks() is called for almost all cpu cgroups.

This protection is already slightly racy: the runtime limit could be changed
between cpu_cgroup_can_attach() and cpu_cgroup_attach() because changing a
cgroup attribute does not take cgroup_mutex while attach does not take
rt_constraints_mutex. Changing a task's scheduler class also races with
changing the rt runtime: the check in __sched_setscheduler() isn't protected.

Function tg_has_rt_tasks() iterates over all threads in the system.
This gives NR_CGROUPS * NR_TASKS operations under a single tasklist_lock
held for read in tg_set_rt_bandwidth(). Any concurrent attempt to take
tasklist_lock for write (for example, fork) will get stuck with irqs disabled.

This patch makes two optimizations:
1) Drop tasklist_lock and iterate only over the tasks in the cgroup
2) Call tg_has_rt_tasks() only when the rt runtime changes from non-zero to zero

All changed code is under CONFIG_RT_GROUP_SCHED.

Testcase:
# mkdir /sys/fs/cgroup/cpu/test{1..10000}
# echo 0 | tee /sys/fs/cgroup/cpu/test*/cpu.rt_runtime_us

At the same time, without the patch, fork time will be >100ms:
# perf trace -e clone --duration 100 stress-ng --fork 1

Also remote ping will show timings >100ms caused by irq latency.
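
To make the end state easier to see before reading the diff, here is a
condensed sketch (not a literal copy of the hunks below) of where the
checks end up:

	/* tg_rt_schedulable(): scan tasks only when runtime drops from non-zero to zero */
	if (rt_bandwidth_enabled() && !runtime &&
	    tg->rt_bandwidth.rt_runtime && tg_has_rt_tasks(tg))
		return -EBUSY;

	/* tg_has_rt_tasks(): walk only this cgroup's tasks, without tasklist_lock */
	css_task_iter_start(&tg->css, 0, &it);
	while (!ret && (task = css_task_iter_next(&it)))
		ret |= rt_task(task);
	css_task_iter_end(&it);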

Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
---
 kernel/sched/rt.c |   24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index e591d40fd645..95d1d7be84ef 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2396,10 +2396,11 @@ const struct sched_class rt_sched_class = {
  */
 static DEFINE_MUTEX(rt_constraints_mutex);
 
-/* Must be called with tasklist_lock held */
 static inline int tg_has_rt_tasks(struct task_group *tg)
 {
-	struct task_struct *g, *p;
+	struct task_struct *task;
+	struct css_task_iter it;
+	int ret = 0;
 
 	/*
 	 * Autogroups do not have RT tasks; see autogroup_create().
@@ -2407,12 +2408,12 @@ static inline int tg_has_rt_tasks(struct task_group *tg)
 	if (task_group_is_autogroup(tg))
 		return 0;
 
-	for_each_process_thread(g, p) {
-		if (rt_task(p) && task_group(p) == tg)
-			return 1;
-	}
+	css_task_iter_start(&tg->css, 0, &it);
+	while (!ret && (task = css_task_iter_next(&it)))
+		ret |= rt_task(task);
+	css_task_iter_end(&it);
 
-	return 0;
+	return ret;
 }
 
 struct rt_schedulable_data {
@@ -2443,9 +2444,10 @@ static int tg_rt_schedulable(struct task_group *tg, void *data)
 		return -EINVAL;
 
 	/*
-	 * Ensure we don't starve existing RT tasks.
+	 * Ensure we don't starve existing RT tasks if runtime turns zero.
 	 */
-	if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
+	if (rt_bandwidth_enabled() && !runtime &&
+	    tg->rt_bandwidth.rt_runtime && tg_has_rt_tasks(tg))
 		return -EBUSY;
 
 	total = to_ratio(period, runtime);
@@ -2511,7 +2513,6 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
 		return -EINVAL;
 
 	mutex_lock(&rt_constraints_mutex);
-	read_lock(&tasklist_lock);
 	err = __rt_schedulable(tg, rt_period, rt_runtime);
 	if (err)
 		goto unlock;
@@ -2529,7 +2530,6 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
 	}
 	raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
 unlock:
-	read_unlock(&tasklist_lock);
 	mutex_unlock(&rt_constraints_mutex);
 
 	return err;
@@ -2588,9 +2588,7 @@ static int sched_rt_global_constraints(void)
 	int ret = 0;
 
 	mutex_lock(&rt_constraints_mutex);
-	read_lock(&tasklist_lock);
 	ret = __rt_schedulable(NULL, 0, 0);
-	read_unlock(&tasklist_lock);
 	mutex_unlock(&rt_constraints_mutex);
 
 	return ret;



* Re: [PATCH] sched/rt: optimize checking group rt scheduler constraints
  2020-01-25 14:50 [PATCH] sched/rt: optimize checking group rt scheduler constraints Konstantin Khlebnikov
@ 2020-01-25 15:32 ` Cyrill Gorcunov
  2020-01-25 16:15   ` Konstantin Khlebnikov
  2020-01-27 17:47 ` Phil Auld
  2020-01-29 11:32 ` [tip: sched/core] sched/rt: Optimize checking group RT " tip-bot2 for Konstantin Khlebnikov
  2 siblings, 1 reply; 7+ messages in thread
From: Cyrill Gorcunov @ 2020-01-25 15:32 UTC (permalink / raw)
  To: Konstantin Khlebnikov
  Cc: linux-kernel, Juri Lelli, Vincent Guittot, Peter Zijlstra,
	Steven Rostedt, Ben Segall, Ingo Molnar, Mel Gorman,
	Dietmar Eggemann

On Sat, Jan 25, 2020 at 05:50:38PM +0300, Konstantin Khlebnikov wrote:
...
> -/* Must be called with tasklist_lock held */
>  static inline int tg_has_rt_tasks(struct task_group *tg)
>  {
> -	struct task_struct *g, *p;
> +	struct task_struct *task;
> +	struct css_task_iter it;
> +	int ret = 0;
>  
>  	/*
>  	 * Autogroups do not have RT tasks; see autogroup_create().
> @@ -2407,12 +2408,12 @@ static inline int tg_has_rt_tasks(struct task_group *tg)
>  	if (task_group_is_autogroup(tg))
>  		return 0;
>  
> -	for_each_process_thread(g, p) {
> -		if (rt_task(p) && task_group(p) == tg)
> -			return 1;
> -	}
> +	css_task_iter_start(&tg->css, 0, &it);
> +	while (!ret && (task = css_task_iter_next(&it)))
> +		ret |= rt_task(task);

Plain 'ret = rt_task(task);' won't work?

> +	css_task_iter_end(&it);
>  
> -	return 0;
> +	return ret;
>  }


* Re: [PATCH] sched/rt: optimize checking group rt scheduler constraints
  2020-01-25 15:32 ` Cyrill Gorcunov
@ 2020-01-25 16:15   ` Konstantin Khlebnikov
  2020-01-25 16:40     ` Cyrill Gorcunov
  0 siblings, 1 reply; 7+ messages in thread
From: Konstantin Khlebnikov @ 2020-01-25 16:15 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: linux-kernel, Juri Lelli, Vincent Guittot, Peter Zijlstra,
	Steven Rostedt, Ben Segall, Ingo Molnar, Mel Gorman,
	Dietmar Eggemann



On 25/01/2020 18.32, Cyrill Gorcunov wrote:
> On Sat, Jan 25, 2020 at 05:50:38PM +0300, Konstantin Khlebnikov wrote:
> ...
>> -/* Must be called with tasklist_lock held */
>>   static inline int tg_has_rt_tasks(struct task_group *tg)
>>   {
>> -	struct task_struct *g, *p;
>> +	struct task_struct *task;
>> +	struct css_task_iter it;
>> +	int ret = 0;
>>   
>>   	/*
>>   	 * Autogroups do not have RT tasks; see autogroup_create().
>> @@ -2407,12 +2408,12 @@ static inline int tg_has_rt_tasks(struct task_group *tg)
>>   	if (task_group_is_autogroup(tg))
>>   		return 0;
>>   
>> -	for_each_process_thread(g, p) {
>> -		if (rt_task(p) && task_group(p) == tg)
>> -			return 1;
>> -	}
>> +	css_task_iter_start(&tg->css, 0, &it);
>> +	while (!ret && (task = css_task_iter_next(&it)))
>> +		ret |= rt_task(task);
> 
> Plain 'ret = rt_task(task);' won't work?

Should work too =)

> 
>> +	css_task_iter_end(&it);
>>   
>> -	return 0;
>> +	return ret;
>>   }


* Re: [PATCH] sched/rt: optimize checking group rt scheduler constraints
  2020-01-25 16:15   ` Konstantin Khlebnikov
@ 2020-01-25 16:40     ` Cyrill Gorcunov
  0 siblings, 0 replies; 7+ messages in thread
From: Cyrill Gorcunov @ 2020-01-25 16:40 UTC (permalink / raw)
  To: Konstantin Khlebnikov
  Cc: linux-kernel, Juri Lelli, Vincent Guittot, Peter Zijlstra,
	Steven Rostedt, Ben Segall, Ingo Molnar, Mel Gorman,
	Dietmar Eggemann

On Sat, Jan 25, 2020 at 07:15:16PM +0300, Konstantin Khlebnikov wrote:
> > > +	css_task_iter_start(&tg->css, 0, &it);
> > > +	while (!ret && (task = css_task_iter_next(&it)))
> > > +		ret |= rt_task(task);
> > 
> > Plain 'ret = rt_task(task);' won't work?
> 
> Should work too =)

:-) No need for a new patch then; we could update it on top if needed,
but I guess the compiler would optimize it as is.
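
For the record, that simplification would look roughly like this (just a
sketch; behaviour is unchanged because the !ret condition already stops
the loop on the first rt task):

	css_task_iter_start(&tg->css, 0, &it);
	while (!ret && (task = css_task_iter_next(&it)))
		ret = rt_task(task);
	css_task_iter_end(&it);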


* Re: [PATCH] sched/rt: optimize checking group rt scheduler constraints
  2020-01-25 14:50 [PATCH] sched/rt: optimize checking group rt scheduler constraints Konstantin Khlebnikov
  2020-01-25 15:32 ` Cyrill Gorcunov
@ 2020-01-27 17:47 ` Phil Auld
  2020-01-29 11:32 ` [tip: sched/core] sched/rt: Optimize checking group RT " tip-bot2 for Konstantin Khlebnikov
  2 siblings, 0 replies; 7+ messages in thread
From: Phil Auld @ 2020-01-27 17:47 UTC (permalink / raw)
  To: Konstantin Khlebnikov
  Cc: linux-kernel, Juri Lelli, Vincent Guittot, Peter Zijlstra,
	Steven Rostedt, Ben Segall, Ingo Molnar, Mel Gorman,
	Dietmar Eggemann

Hi Konstantin,

On Sat, Jan 25, 2020 at 05:50:38PM +0300 Konstantin Khlebnikov wrote:
> The group RT scheduler contains protection against setting zero runtime
> for a cgroup with rt tasks. Right now tg_set_rt_bandwidth() iterates over
> all cpu cgroups and calls tg_has_rt_tasks() for every cgroup whose runtime
> is zero (not only for the changed one). The default rt runtime is zero, so
> tg_has_rt_tasks() is called for almost all cpu cgroups.
> 
> This protection is already slightly racy: the runtime limit could be changed
> between cpu_cgroup_can_attach() and cpu_cgroup_attach() because changing a
> cgroup attribute does not take cgroup_mutex while attach does not take
> rt_constraints_mutex. Changing a task's scheduler class also races with
> changing the rt runtime: the check in __sched_setscheduler() isn't protected.
> 
> Function tg_has_rt_tasks() iterates over all threads in the system.
> This gives NR_CGROUPS * NR_TASKS operations under a single tasklist_lock
> held for read in tg_set_rt_bandwidth(). Any concurrent attempt to take
> tasklist_lock for write (for example, fork) will get stuck with irqs disabled.
> 
> This patch makes two optimizations:
> 1) Drop tasklist_lock and iterate only over the tasks in the cgroup
> 2) Call tg_has_rt_tasks() only when the rt runtime changes from non-zero to zero
> 
> All changed code is under CONFIG_RT_GROUP_SCHED.
> 
> Testcase:
> # mkdir /sys/fs/cgroup/cpu/test{1..10000}
> # echo 0 | tee /sys/fs/cgroup/cpu/test*/cpu.rt_runtime_us
> 
> At the same time, without the patch, fork time will be >100ms:
> # perf trace -e clone --duration 100 stress-ng --fork 1
> 
> Also remote ping will show timings >100ms caused by irq latency.
> 
> Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
> ---
>  kernel/sched/rt.c |   24 +++++++++++-------------
>  1 file changed, 11 insertions(+), 13 deletions(-)
> 
> diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
> index e591d40fd645..95d1d7be84ef 100644
> --- a/kernel/sched/rt.c
> +++ b/kernel/sched/rt.c
> @@ -2396,10 +2396,11 @@ const struct sched_class rt_sched_class = {
>   */
>  static DEFINE_MUTEX(rt_constraints_mutex);
>  
> -/* Must be called with tasklist_lock held */
>  static inline int tg_has_rt_tasks(struct task_group *tg)
>  {
> -	struct task_struct *g, *p;
> +	struct task_struct *task;
> +	struct css_task_iter it;
> +	int ret = 0;
>  
>  	/*
>  	 * Autogroups do not have RT tasks; see autogroup_create().
> @@ -2407,12 +2408,12 @@ static inline int tg_has_rt_tasks(struct task_group *tg)
>  	if (task_group_is_autogroup(tg))
>  		return 0;
>  
> -	for_each_process_thread(g, p) {
> -		if (rt_task(p) && task_group(p) == tg)
> -			return 1;
> -	}
> +	css_task_iter_start(&tg->css, 0, &it);
> +	while (!ret && (task = css_task_iter_next(&it)))
> +		ret |= rt_task(task);
> +	css_task_iter_end(&it);
>  

Why not a break; in there along with the change to "="?
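
(For illustration, the variant with a break would be roughly the following
sketch; it is functionally equivalent to the posted loop:)

	css_task_iter_start(&tg->css, 0, &it);
	while ((task = css_task_iter_next(&it))) {
		if (rt_task(task)) {
			ret = 1;
			break;
		}
	}
	css_task_iter_end(&it);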


> -	return 0;
> +	return ret;
>  }
>  
>  struct rt_schedulable_data {
> @@ -2443,9 +2444,10 @@ static int tg_rt_schedulable(struct task_group *tg, void *data)
>  		return -EINVAL;
>  
>  	/*
> -	 * Ensure we don't starve existing RT tasks.
> +	 * Ensure we don't starve existing RT tasks if runtime turns zero.
>  	 */
> -	if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
> +	if (rt_bandwidth_enabled() && !runtime &&
> +	    tg->rt_bandwidth.rt_runtime && tg_has_rt_tasks(tg))
>  		return -EBUSY;
>  
>  	total = to_ratio(period, runtime);
> @@ -2511,7 +2513,6 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
>  		return -EINVAL;
>  
>  	mutex_lock(&rt_constraints_mutex);
> -	read_lock(&tasklist_lock);
>  	err = __rt_schedulable(tg, rt_period, rt_runtime);
>  	if (err)
>  		goto unlock;
> @@ -2529,7 +2530,6 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
>  	}
>  	raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
>  unlock:
> -	read_unlock(&tasklist_lock);
>  	mutex_unlock(&rt_constraints_mutex);
>  
>  	return err;
> @@ -2588,9 +2588,7 @@ static int sched_rt_global_constraints(void)
>  	int ret = 0;
>  
>  	mutex_lock(&rt_constraints_mutex);
> -	read_lock(&tasklist_lock);
>  	ret = __rt_schedulable(NULL, 0, 0);
> -	read_unlock(&tasklist_lock);
>  	mutex_unlock(&rt_constraints_mutex);
>  
>  	return ret;
> 


Thanks,
Phil

-- 



* [tip: sched/core] sched/rt: Optimize checking group RT scheduler constraints
  2020-01-25 14:50 [PATCH] sched/rt: optimize checking group rt scheduler constraints Konstantin Khlebnikov
  2020-01-25 15:32 ` Cyrill Gorcunov
  2020-01-27 17:47 ` Phil Auld
@ 2020-01-29 11:32 ` tip-bot2 for Konstantin Khlebnikov
  2020-01-30 19:10   ` Phil Auld
  2 siblings, 1 reply; 7+ messages in thread
From: tip-bot2 for Konstantin Khlebnikov @ 2020-01-29 11:32 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Konstantin Khlebnikov, Peter Zijlstra (Intel), Ingo Molnar, x86, LKML

The following commit has been merged into the sched/core branch of tip:

Commit-ID:     b4fb015eeff7f3e5518a7dbe8061169a3e2f2bc7
Gitweb:        https://git.kernel.org/tip/b4fb015eeff7f3e5518a7dbe8061169a3e2f2bc7
Author:        Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
AuthorDate:    Sat, 25 Jan 2020 17:50:38 +03:00
Committer:     Ingo Molnar <mingo@kernel.org>
CommitterDate: Tue, 28 Jan 2020 21:37:09 +01:00

sched/rt: Optimize checking group RT scheduler constraints

The group RT scheduler contains protection against setting zero runtime for
a cgroup with RT tasks. Right now tg_set_rt_bandwidth() iterates over all
CPU cgroups and calls tg_has_rt_tasks() for every cgroup whose runtime is
zero (not only for the changed one). The default RT runtime is zero, so
tg_has_rt_tasks() is called for almost all CPU cgroups.

This protection is already slightly racy: the runtime limit could be changed
between cpu_cgroup_can_attach() and cpu_cgroup_attach() because changing a
cgroup attribute does not take cgroup_mutex while attach does not take
rt_constraints_mutex. Changing a task's scheduler class also races with
changing the RT runtime: the check in __sched_setscheduler() isn't protected.

Function tg_has_rt_tasks() iterates over all threads in the system.
This gives NR_CGROUPS * NR_TASKS operations under a single tasklist_lock
held for read in tg_set_rt_bandwidth(). Any concurrent attempt to take
tasklist_lock for write (for example, fork) will get stuck with irqs disabled.

This patch makes two optimizations:
1) Drop tasklist_lock and iterate only over the tasks in the cgroup
2) Call tg_has_rt_tasks() only when the RT runtime changes from non-zero to zero

All changed code is under CONFIG_RT_GROUP_SCHED.

Testcase:

 # mkdir /sys/fs/cgroup/cpu/test{1..10000}
 # echo 0 | tee /sys/fs/cgroup/cpu/test*/cpu.rt_runtime_us

At the same time, without the patch, fork time will be >100ms:

 # perf trace -e clone --duration 100 stress-ng --fork 1

Also remote ping will show timings >100ms caused by irq latency.

Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lkml.kernel.org/r/157996383820.4651.11292439232549211693.stgit@buzz
---
 kernel/sched/rt.c | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 4043abe..55a4a50 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2449,10 +2449,11 @@ const struct sched_class rt_sched_class = {
  */
 static DEFINE_MUTEX(rt_constraints_mutex);
 
-/* Must be called with tasklist_lock held */
 static inline int tg_has_rt_tasks(struct task_group *tg)
 {
-	struct task_struct *g, *p;
+	struct task_struct *task;
+	struct css_task_iter it;
+	int ret = 0;
 
 	/*
 	 * Autogroups do not have RT tasks; see autogroup_create().
@@ -2460,12 +2461,12 @@ static inline int tg_has_rt_tasks(struct task_group *tg)
 	if (task_group_is_autogroup(tg))
 		return 0;
 
-	for_each_process_thread(g, p) {
-		if (rt_task(p) && task_group(p) == tg)
-			return 1;
-	}
+	css_task_iter_start(&tg->css, 0, &it);
+	while (!ret && (task = css_task_iter_next(&it)))
+		ret |= rt_task(task);
+	css_task_iter_end(&it);
 
-	return 0;
+	return ret;
 }
 
 struct rt_schedulable_data {
@@ -2496,9 +2497,10 @@ static int tg_rt_schedulable(struct task_group *tg, void *data)
 		return -EINVAL;
 
 	/*
-	 * Ensure we don't starve existing RT tasks.
+	 * Ensure we don't starve existing RT tasks if runtime turns zero.
 	 */
-	if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
+	if (rt_bandwidth_enabled() && !runtime &&
+	    tg->rt_bandwidth.rt_runtime && tg_has_rt_tasks(tg))
 		return -EBUSY;
 
 	total = to_ratio(period, runtime);
@@ -2564,7 +2566,6 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
 		return -EINVAL;
 
 	mutex_lock(&rt_constraints_mutex);
-	read_lock(&tasklist_lock);
 	err = __rt_schedulable(tg, rt_period, rt_runtime);
 	if (err)
 		goto unlock;
@@ -2582,7 +2583,6 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
 	}
 	raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
 unlock:
-	read_unlock(&tasklist_lock);
 	mutex_unlock(&rt_constraints_mutex);
 
 	return err;
@@ -2641,9 +2641,7 @@ static int sched_rt_global_constraints(void)
 	int ret = 0;
 
 	mutex_lock(&rt_constraints_mutex);
-	read_lock(&tasklist_lock);
 	ret = __rt_schedulable(NULL, 0, 0);
-	read_unlock(&tasklist_lock);
 	mutex_unlock(&rt_constraints_mutex);
 
 	return ret;


* Re: [tip: sched/core] sched/rt: Optimize checking group RT scheduler constraints
  2020-01-29 11:32 ` [tip: sched/core] sched/rt: Optimize checking group RT " tip-bot2 for Konstantin Khlebnikov
@ 2020-01-30 19:10   ` Phil Auld
  0 siblings, 0 replies; 7+ messages in thread
From: Phil Auld @ 2020-01-30 19:10 UTC (permalink / raw)
  To: linux-kernel
  Cc: linux-tip-commits, Konstantin Khlebnikov, Peter Zijlstra (Intel),
	Ingo Molnar, x86

On Wed, Jan 29, 2020 at 11:32:56AM -0000 tip-bot2 for Konstantin Khlebnikov wrote:
> The following commit has been merged into the sched/core branch of tip:
> 
> Commit-ID:     b4fb015eeff7f3e5518a7dbe8061169a3e2f2bc7
> Gitweb:        https://git.kernel.org/tip/b4fb015eeff7f3e5518a7dbe8061169a3e2f2bc7
> Author:        Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
> AuthorDate:    Sat, 25 Jan 2020 17:50:38 +03:00
> Committer:     Ingo Molnar <mingo@kernel.org>
> CommitterDate: Tue, 28 Jan 2020 21:37:09 +01:00
> 
> sched/rt: Optimize checking group RT scheduler constraints
> 
> The group RT scheduler contains protection against setting zero runtime
> for a cgroup with RT tasks. Right now tg_set_rt_bandwidth() iterates over
> all CPU cgroups and calls tg_has_rt_tasks() for every cgroup whose runtime
> is zero (not only for the changed one). The default RT runtime is zero, so
> tg_has_rt_tasks() is called for almost all CPU cgroups.
> 
> This protection is already slightly racy: the runtime limit could be changed
> between cpu_cgroup_can_attach() and cpu_cgroup_attach() because changing a
> cgroup attribute does not take cgroup_mutex while attach does not take
> rt_constraints_mutex. Changing a task's scheduler class also races with
> changing the RT runtime: the check in __sched_setscheduler() isn't protected.
> 
> Function tg_has_rt_tasks() iterates over all threads in the system.
> This gives NR_CGROUPS * NR_TASKS operations under a single tasklist_lock
> held for read in tg_set_rt_bandwidth(). Any concurrent attempt to take
> tasklist_lock for write (for example, fork) will get stuck with irqs disabled.
> 
> This patch makes two optimizations:
> 1) Drop tasklist_lock and iterate only over the tasks in the cgroup
> 2) Call tg_has_rt_tasks() only when the RT runtime changes from non-zero to zero
> 
> All changed code is under CONFIG_RT_GROUP_SCHED.
> 
> Testcase:
> 
>  # mkdir /sys/fs/cgroup/cpu/test{1..10000}
>  # echo 0 | tee /sys/fs/cgroup/cpu/test*/cpu.rt_runtime_us
> 
> At the same time, without the patch, fork time will be >100ms:
> 
>  # perf trace -e clone --duration 100 stress-ng --fork 1
> 
> Also remote ping will show timings >100ms caused by irq latency.
> 
> Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Signed-off-by: Ingo Molnar <mingo@kernel.org>
> Link: https://lkml.kernel.org/r/157996383820.4651.11292439232549211693.stgit@buzz
> ---
>  kernel/sched/rt.c | 24 +++++++++++-------------
>  1 file changed, 11 insertions(+), 13 deletions(-)
> 
> diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
> index 4043abe..55a4a50 100644
> --- a/kernel/sched/rt.c
> +++ b/kernel/sched/rt.c
> @@ -2449,10 +2449,11 @@ const struct sched_class rt_sched_class = {
>   */
>  static DEFINE_MUTEX(rt_constraints_mutex);
>  
> -/* Must be called with tasklist_lock held */
>  static inline int tg_has_rt_tasks(struct task_group *tg)
>  {
> -	struct task_struct *g, *p;
> +	struct task_struct *task;
> +	struct css_task_iter it;
> +	int ret = 0;
>  
>  	/*
>  	 * Autogroups do not have RT tasks; see autogroup_create().
> @@ -2460,12 +2461,12 @@ static inline int tg_has_rt_tasks(struct task_group *tg)
>  	if (task_group_is_autogroup(tg))
>  		return 0;
>  
> -	for_each_process_thread(g, p) {
> -		if (rt_task(p) && task_group(p) == tg)
> -			return 1;
> -	}
> +	css_task_iter_start(&tg->css, 0, &it);
> +	while (!ret && (task = css_task_iter_next(&it)))
> +		ret |= rt_task(task);
> +	css_task_iter_end(&it);
>  

Heh, I misread it the first time and didn't see the !ret condition. But I think 
it should be just "ret = ".

But thanks for getting this fixed.


Cheers,
Phil


> -	return 0;
> +	return ret;
>  }
>  
>  struct rt_schedulable_data {
> @@ -2496,9 +2497,10 @@ static int tg_rt_schedulable(struct task_group *tg, void *data)
>  		return -EINVAL;
>  
>  	/*
> -	 * Ensure we don't starve existing RT tasks.
> +	 * Ensure we don't starve existing RT tasks if runtime turns zero.
>  	 */
> -	if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
> +	if (rt_bandwidth_enabled() && !runtime &&
> +	    tg->rt_bandwidth.rt_runtime && tg_has_rt_tasks(tg))
>  		return -EBUSY;
>  
>  	total = to_ratio(period, runtime);
> @@ -2564,7 +2566,6 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
>  		return -EINVAL;
>  
>  	mutex_lock(&rt_constraints_mutex);
> -	read_lock(&tasklist_lock);
>  	err = __rt_schedulable(tg, rt_period, rt_runtime);
>  	if (err)
>  		goto unlock;
> @@ -2582,7 +2583,6 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
>  	}
>  	raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
>  unlock:
> -	read_unlock(&tasklist_lock);
>  	mutex_unlock(&rt_constraints_mutex);
>  
>  	return err;
> @@ -2641,9 +2641,7 @@ static int sched_rt_global_constraints(void)
>  	int ret = 0;
>  
>  	mutex_lock(&rt_constraints_mutex);
> -	read_lock(&tasklist_lock);
>  	ret = __rt_schedulable(NULL, 0, 0);
> -	read_unlock(&tasklist_lock);
>  	mutex_unlock(&rt_constraints_mutex);
>  
>  	return ret;
> 

-- 

