* [patch] sched,rt: non-isolated cores lift isolcpus throttle for CONFIG_RT_GROUP_SCHED
@ 2012-04-03  9:08 Mike Galbraith
  2012-04-03  9:19 ` [patch] sched,rt: let the user see rt queues in /proc/sched_debug Mike Galbraith
  0 siblings, 1 reply; 14+ messages in thread
From: Mike Galbraith @ 2012-04-03  9:08 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: LKML

s/patch/hack.  Better ideas?

When CONFIG_RT_GROUP_SCHED is enabled, isolcpus have no replenishment timer
running, and unlike !CONFIG_RT_GROUP_SCHED, are not in the rd->span of the cpu
running replenishment.  If you trigger the throttle, you're rewarded with a
dead box.  Should the user reassign the cpus to a domain, they become sane
again, and replenishment starts/stops as usual.
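
For reference, a minimal reproducer sketch (not from the original report;
assumes a CONFIG_RT_GROUP_SCHED kernel booted with e.g. isolcpus=3 and the
default budget of sched_rt_runtime_us=950000 per 1s period): pin a
SCHED_FIFO spinner to the isolated cpu, let it burn its 950ms, and the
throttle engages with nobody left to replenish it.

/* rt-hog.c - hypothetical reproducer, build with: gcc -o rt-hog rt-hog.c */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
	int cpu = argc > 1 ? atoi(argv[1]) : 3;	/* an isolcpus cpu */
	struct sched_param sp = { .sched_priority = 10 };
	cpu_set_t mask;

	CPU_ZERO(&mask);
	CPU_SET(cpu, &mask);
	if (sched_setaffinity(0, sizeof(mask), &mask))
		perror("sched_setaffinity");
	if (sched_setscheduler(0, SCHED_FIFO, &sp))
		perror("sched_setscheduler");

	for (;;)	/* spin; once throttled, this cpu is gone for good */
		;
}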

Signed-off-by: Mike Galbraith <efault@gmx.de>
---
 kernel/sched/core.c  |   15 ++++++++++++++-
 kernel/sched/rt.c    |   16 ++++++++++++++--
 kernel/sched/sched.h |    5 +++++
 3 files changed, 33 insertions(+), 3 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5877,6 +5877,14 @@ cpu_attach_domain(struct sched_domain *s
 
 	sched_domain_debug(sd, cpu);
 
+#ifdef CONFIG_RT_GROUP_SCHED
+	/* If the cpu was an isolcpu, it no longer is. */
+	if (sd) {
+		cpumask_clear_cpu(cpu, cpu_isolated_map);
+		nr_isolated_cpus = cpumask_weight(cpu_isolated_map);
+	}
+#endif
+
 	rq_attach_root(rq, rd);
 	tmp = rq->sd;
 	rcu_assign_pointer(rq->sd, sd);
@@ -5886,13 +5894,18 @@ cpu_attach_domain(struct sched_domain *s
 }
 
 /* cpus with isolated domains */
-static cpumask_var_t cpu_isolated_map;
+cpumask_var_t cpu_isolated_map;
+
+__read_mostly int nr_isolated_cpus;
 
 /* Setup the mask of cpus configured for isolated domains */
 static int __init isolated_cpu_setup(char *str)
 {
 	alloc_bootmem_cpumask_var(&cpu_isolated_map);
 	cpulist_parse(str, cpu_isolated_map);
+#ifdef CONFIG_RT_GROUP_SCHED
+	nr_isolated_cpus = cpumask_weight(cpu_isolated_map);
+#endif
 	return 1;
 }
 
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -778,10 +778,11 @@ static inline int balance_runtime(struct
 
 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
 {
-	int i, idle = 1, throttled = 0;
+	int i, idle = 1, throttled = 0, isol_cpus = nr_isolated_cpus;
 	const struct cpumask *span;
 
 	span = sched_rt_period_mask();
+do_isolcpus:
 	for_each_cpu(i, span) {
 		int enqueue = 0;
 		struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
@@ -792,7 +793,7 @@ static int do_sched_rt_period_timer(stru
 			u64 runtime;
 
 			raw_spin_lock(&rt_rq->rt_runtime_lock);
-			if (rt_rq->rt_throttled)
+			if (rt_rq->rt_throttled && span != cpu_isolated_map)
 				balance_runtime(rt_rq);
 			runtime = rt_rq->rt_runtime;
 			rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
@@ -823,6 +824,17 @@ static int do_sched_rt_period_timer(stru
 		raw_spin_unlock(&rq->lock);
 	}
 
+	/*
+	 * Hack: unthrottle isolcpus for RT_GROUP_SCHED.  No replenishment
+	 * timer is running on isolcpus, and unlike !RT_GROUP_SCHED, they're
+	 * not in the rd->span of the cpu running the timer.
+	 */
+	if (isol_cpus) {
+		span = cpu_isolated_map;
+		isol_cpus = 0;
+		goto do_isolcpus;
+	}
+
 	if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
 		return 1;
 
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -572,6 +572,11 @@ static inline void set_task_rq(struct ta
 #endif
 }
 
+/* cpus with isolated domains */
+extern  cpumask_var_t cpu_isolated_map;
+
+extern __read_mostly int nr_isolated_cpus;
+
 #else /* CONFIG_CGROUP_SCHED */
 
 static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }




* [patch] sched,rt: let the user see rt queues in /proc/sched_debug
  2012-04-03  9:08 [patch] sched,rt: non-isolated cores lift isolcpus throttle for CONFIG_RT_GROUP_SCHED Mike Galbraith
@ 2012-04-03  9:19 ` Mike Galbraith
  2012-04-07  8:58   ` [patch] sched,cgroup_sched: fix up task_groups list buglet Mike Galbraith
  0 siblings, 1 reply; 14+ messages in thread
From: Mike Galbraith @ 2012-04-03  9:19 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: LKML

ec514c48 made rt queues invisible in sched_debug, restore visibility.

Signed-off-by: Mike Galbraith <efault@gmx.de>
---
 kernel/sched/rt.c |    3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2079,11 +2079,10 @@ extern void print_rt_rq(struct seq_file
 
 void print_rt_stats(struct seq_file *m, int cpu)
 {
-	rt_rq_iter_t iter;
 	struct rt_rq *rt_rq;
 
 	rcu_read_lock();
-	for_each_rt_rq(rt_rq, iter, cpu_rq(cpu))
+	for_each_leaf_rt_rq(rt_rq, cpu_rq(cpu))
 		print_rt_rq(m, cpu, rt_rq);
 	rcu_read_unlock();
 }
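
With the leaf iterator back, /proc/sched_debug should again show a
per-cgroup rt_rq section for each cpu, roughly of this shape (field
layout as printed by print_rt_rq(); the numbers here are illustrative
only, not captured output):

rt_rq[2]:/rtcpus
  .rt_nr_running                 : 1
  .rt_throttled                  : 0
  .rt_time                       : 12.345678
  .rt_runtime                    : 300.000000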




* [patch] sched,cgroup_sched: fix up task_groups list buglet
  2012-04-03  9:19 ` [patch] sched,rt: let the user see rt queues in /proc/sched_debug Mike Galbraith
@ 2012-04-07  8:58   ` Mike Galbraith
  2012-04-07  9:54     ` RFC [patch] sched,cgroup_sched: convince RT_GROUP_SCHED throttle to work Mike Galbraith
  0 siblings, 1 reply; 14+ messages in thread
From: Mike Galbraith @ 2012-04-07  8:58 UTC (permalink / raw)
  To: LKML; +Cc: Peter Zijlstra, Ingo Molnar

On Tue, 2012-04-03 at 11:19 +0200, Mike Galbraith wrote: 
> ec514c48 made rt queues invisible in sched_debug, restore visibility.

<g> removing their cloak of invisibility might be a tad better than just
making them visible while futzing with the annoying RT_GROUP_SCHED throttle.

sched,cgroup_sched: fix up task_groups list

An oops happened during the sched code's migration to kernel/sched/: the
shared sched.h left task_groups as a static LIST_HEAD, so each compilation
unit got its own private copy of the list.  Fix it up.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Cc: stable@kernel.org # v3.3
---
 kernel/sched/core.c  |    1 +
 kernel/sched/sched.h |    2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6871,6 +6871,7 @@ int in_sched_functions(unsigned long add
 
 #ifdef CONFIG_CGROUP_SCHED
 struct task_group root_task_group;
+LIST_HEAD(task_groups);
 #endif
 
 DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -80,7 +80,7 @@ extern struct mutex sched_domains_mutex;
 struct cfs_rq;
 struct rt_rq;
 
-static LIST_HEAD(task_groups);
+extern struct list_head task_groups;
 
 struct cfs_bandwidth {
 #ifdef CONFIG_CFS_BANDWIDTH




* RFC [patch] sched,cgroup_sched: convince RT_GROUP_SCHED throttle to work
  2012-04-07  8:58   ` [patch] sched,cgroup_sched: fix up task_groups list buglet Mike Galbraith
@ 2012-04-07  9:54     ` Mike Galbraith
  2012-04-10  9:08       ` Mike Galbraith
  0 siblings, 1 reply; 14+ messages in thread
From: Mike Galbraith @ 2012-04-07  9:54 UTC (permalink / raw)
  To: LKML; +Cc: Peter Zijlstra, Ingo Molnar

Greetings,

I'm having trouble with the RT_GROUP_SCHED throttle kicking in and staying
engaged (timer troubles).  Either groups execute one after the other (frob
the timer above you), or bandwidth is wrong, or the reason I started
squabbling with this thing in the first place happens: one or the other
group gets stuck, even with only two groups, with the root task group
throttled, and the victim is marooned until I kill the cgroup setup.  If
(say) grp1 starts first, grp2 is screwed, or the other way around.

With this patch, the thing appears to work perfectly, but it doesn't
look correct, since I'm futzing with ->rt_time where I should not.

Not so pretty ascii-art:

/----------/system cpu 0-2, rt 300000-----/foo cpu 2, rt 100000
    \
     \
      \----/rtcpus cpu 3, rt 300000---\---/bar cpu 3, rt 100000
                                       \
                                        \-/baz cpu 3, rt 100000

It only needs to be two groups, grp1 containing most of the system, the
other rt only.  With the patch, the above setup works (the last setup I
prodded the box with), and bandwidth looked fine, whether I twiddled the
budgets or not.

I just happened to notice the throttle wasn't doing its thing right
after discovering that isolcpus is busted with RT_GROUP_SCHED.  Thought
I should probably beat on it a little.  The darn thing beat me back :)
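
FWIW, the setup above boils down to something like this sketch (purely
illustrative: hypothetical paths, assumes the cpuset and cpu controllers
co-mounted at /cgroups, error handling mostly elided; the /system half
is analogous):

/* rt-groups.c - hypothetical setup matching the /rtcpus half of the art */
#include <stdio.h>
#include <sys/stat.h>
#include <sys/types.h>

static void cg_set(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return;
	}
	fprintf(f, "%s\n", val);
	fclose(f);
}

int main(void)
{
	mkdir("/cgroups/rtcpus", 0755);
	mkdir("/cgroups/rtcpus/bar", 0755);
	mkdir("/cgroups/rtcpus/baz", 0755);

	/* /rtcpus owns cpu 3 with a 300ms rt budget per second... */
	cg_set("/cgroups/rtcpus/cpuset.cpus", "3");
	cg_set("/cgroups/rtcpus/cpuset.mems", "0");
	cg_set("/cgroups/rtcpus/cpu.rt_runtime_us", "300000");

	/* ...split 100ms/100ms between the two rt children */
	cg_set("/cgroups/rtcpus/bar/cpu.rt_runtime_us", "100000");
	cg_set("/cgroups/rtcpus/baz/cpu.rt_runtime_us", "100000");

	return 0;
}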

---
 kernel/sched/rt.c |   76 +++++++++++++++++++++++++++++++-----------------------
 1 file changed, 45 insertions(+), 31 deletions(-)

--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -776,55 +776,69 @@ static inline int balance_runtime(struct
 }
 #endif /* CONFIG_SMP */
 
+#ifdef CONFIG_RT_GROUP_SCHED
+#define for_each_rt_rq_up_from(rt_rq, iter, rq)			\
+	for (iter = rt_rq->tg; iter; iter = iter->parent,	\
+		rt_rq = iter ? iter->rt_rq[cpu_of(rq)] : NULL)
+#else
+#define for_each_rt_rq_up_from(rt_rq, iter, rq)			\
+	for (iter = rt_rq; iter; iter = NULL)
+#endif
+
 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
 {
 	int i, idle = 1, throttled = 0;
 	const struct cpumask *span;
+	rt_rq_iter_t iter;
 
 	span = sched_rt_period_mask();
 	for_each_cpu(i, span) {
-		int enqueue = 0;
+		int enqueue = 0, depth = 0;
 		struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
-		struct rq *rq = rq_of_rt_rq(rt_rq);
+		struct rq *rq = cpu_rq(i);
 
 		raw_spin_lock(&rq->lock);
-		if (rt_rq->rt_time) {
-			u64 runtime;
-
-			raw_spin_lock(&rt_rq->rt_runtime_lock);
-			if (rt_rq->rt_throttled)
-				balance_runtime(rt_rq);
-			runtime = rt_rq->rt_runtime;
-			rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
-			if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
-				rt_rq->rt_throttled = 0;
+		for_each_rt_rq_up_from(rt_rq, iter, rq) {
+			if (rt_rq->rt_time) {
+				u64 runtime;
+
+				raw_spin_lock(&rt_rq->rt_runtime_lock);
+				if (rt_rq->rt_throttled)
+					balance_runtime(rt_rq);
+				runtime = rt_rq->rt_runtime;
+				rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
+				if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
+					rt_rq->rt_throttled = 0;
+					enqueue = 1;
+
+					/*
+					 * Force a clock update if the CPU was idle,
+					 * lest wakeup -> unthrottle time accumulate.
+					 */
+					if (rt_rq->rt_nr_running && rq->curr == rq->idle)
+						rq->skip_clock_update = -1;
+				}
+				raw_spin_unlock(&rt_rq->rt_runtime_lock);
+			} else if (!rt_rq_throttled(rt_rq))
 				enqueue = 1;
 
-				/*
-				 * Force a clock update if the CPU was idle,
-				 * lest wakeup -> unthrottle time accumulate.
-				 */
-				if (rt_rq->rt_nr_running && rq->curr == rq->idle)
-					rq->skip_clock_update = -1;
+			if (enqueue)
+				sched_rt_rq_enqueue(rt_rq);
+
+			if (!depth++) {
+				if (rt_rq->rt_throttled) {
+					throttled = 1;
+					idle = 0;
+				} else if (rt_rq->rt_time || rt_rq->rt_nr_running)
+					idle = 0;
 			}
-			if (rt_rq->rt_time || rt_rq->rt_nr_running)
-				idle = 0;
-			raw_spin_unlock(&rt_rq->rt_runtime_lock);
-		} else if (rt_rq->rt_nr_running) {
-			idle = 0;
-			if (!rt_rq_throttled(rt_rq))
-				enqueue = 1;
-		}
-		if (rt_rq->rt_throttled)
-			throttled = 1;
 
-		if (enqueue)
-			sched_rt_rq_enqueue(rt_rq);
+		}
 		raw_spin_unlock(&rq->lock);
 	}
 
 	if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
-		return 1;
+		idle = 1;
 
 	return idle;
 }




* Re: RFC [patch] sched,cgroup_sched: convince RT_GROUP_SCHED throttle to work
  2012-04-07  9:54     ` RFC [patch] sched,cgroup_sched: convince RT_GROUP_SCHED throttle to work Mike Galbraith
@ 2012-04-10  9:08       ` Mike Galbraith
  2012-04-14 11:10         ` Peter Zijlstra
  0 siblings, 1 reply; 14+ messages in thread
From: Mike Galbraith @ 2012-04-10  9:08 UTC (permalink / raw)
  To: LKML; +Cc: Peter Zijlstra, Ingo Molnar

On Sat, 2012-04-07 at 11:54 +0200, Mike Galbraith wrote:

> With this patch, the thing appears to work perfectly, but it doesn't
> look correct, since I'm futzing with ->rt_time where I should not.

s/doesn't look/definitely isn't.  This failed the "definitely works, so it
may live on despite horrific appearance" test anyway, so how about we just
stop pretending root isn't global instead?

sched,rt: fix isolated CPUs leaving root_task_group indefinitely throttled

Root task group bandwidth replenishment must service all CPUs regardless of
where it was last started.

Signed-off-by: Mike Galbraith <efault@gmx.de>
---
 kernel/sched/rt.c |   13 +++++++++++++
 1 file changed, 13 insertions(+)

--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -782,6 +782,19 @@ static int do_sched_rt_period_timer(stru
 	const struct cpumask *span;
 
 	span = sched_rt_period_mask();
+#ifdef CONFIG_RT_GROUP_SCHED
+	/*
+	 * FIXME: isolated CPUs should really leave the root task group,
+	 * whether they are isolcpus or were isolated via cpusets, lest
+	 * the timer run on a CPU which does not service all runqueues,
+	 * potentially leaving other CPUs indefinitely throttled.  If
+	 * isolation is really required, the user will turn the throttle
+	 * off to kill the perturbations it causes anyway.  Meanwhile,
+	 * this maintains functionality for boot and/or troubleshooting.
+	 */
+	if (rt_b == &root_task_group.rt_bandwidth)
+		span = cpu_online_mask;
+#endif
 	for_each_cpu(i, span) {
 		int enqueue = 0;
 		struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);




* Re: RFC [patch] sched,cgroup_sched: convince RT_GROUP_SCHED throttle to work
  2012-04-10  9:08       ` Mike Galbraith
@ 2012-04-14 11:10         ` Peter Zijlstra
  2012-04-15  3:37           ` Mike Galbraith
  0 siblings, 1 reply; 14+ messages in thread
From: Peter Zijlstra @ 2012-04-14 11:10 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: LKML, Ingo Molnar

On Tue, 2012-04-10 at 11:08 +0200, Mike Galbraith wrote:

> sched,rt: fix isolated CPUs leaving root_task_group indefinitely throttled
> 
> Root task group bandwidth replenishment must service all CPUs regardless of
> where it was last started.
> 
> Signed-off-by: Mike Galbraith <efault@gmx.de>
> ---
>  kernel/sched/rt.c |   13 +++++++++++++
>  1 file changed, 13 insertions(+)
> 
> --- a/kernel/sched/rt.c
> +++ b/kernel/sched/rt.c
> @@ -782,6 +782,19 @@ static int do_sched_rt_period_timer(stru
>  	const struct cpumask *span;
>  
>  	span = sched_rt_period_mask();
> +#ifdef CONFIG_RT_GROUP_SCHED
> +	/*
> +	 * FIXME: isolated CPUs should really leave the root task group,

No no, that's the wrong fix, the right fix is to remove isolcpus :-)

> +	 * whether they are isolcpus or were isolated via cpusets, lest
> +	 * the timer run on a CPU which does not service all runqueues,
> +	 * potentially leaving other CPUs indefinitely throttled.  If
> +	 * isolation is really required, the user will turn the throttle
> +	 * off to kill the perturbations it causes anyway.  Meanwhile,
> +	 * this maintains functionality for boot and/or troubleshooting.
> +	 */
> +	if (rt_b == &root_task_group.rt_bandwidth)
> +		span = cpu_online_mask;
> +#endif
>  	for_each_cpu(i, span) {
>  		int enqueue = 0;
>  		struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);

I guess the alternative 'fix' is to not account the rt_runtime on
isolated cpus.. does something like the below actually work?

---
 kernel/sched/rt.c |    3 +++
 1 files changed, 3 insertions(+), 0 deletions(-)

diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 44af55e..dc2b5b6 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -922,6 +922,9 @@ static void update_curr_rt(struct rq *rq)
 	if (!rt_bandwidth_enabled())
 		return;
 
+	if (cpumask_test_cpu(cpu_of(rq), cpu_isolated_map))
+		return;
+
 	for_each_sched_rt_entity(rt_se) {
 		rt_rq = rt_rq_of_se(rt_se);
 



* Re: RFC [patch] sched,cgroup_sched: convince RT_GROUP_SCHED throttle to work
  2012-04-14 11:10         ` Peter Zijlstra
@ 2012-04-15  3:37           ` Mike Galbraith
  2012-04-15  3:44             ` Mike Galbraith
  0 siblings, 1 reply; 14+ messages in thread
From: Mike Galbraith @ 2012-04-15  3:37 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: LKML, Ingo Molnar

On Sat, 2012-04-14 at 13:10 +0200, Peter Zijlstra wrote: 
> On Tue, 2012-04-10 at 11:08 +0200, Mike Galbraith wrote:
>  
> > --- a/kernel/sched/rt.c
> > +++ b/kernel/sched/rt.c
> > @@ -782,6 +782,19 @@ static int do_sched_rt_period_timer(stru
> >  	const struct cpumask *span;
> >  
> >  	span = sched_rt_period_mask();
> > +#ifdef CONFIG_RT_GROUP_SCHED
> > +	/*
> > +	 * FIXME: isolated CPUs should really leave the root task group,
> 
> No no, that's the wrong fix, the right fix is to remove isolcpus :-)

Yeah, isolcpus needs to die, but...

> I guess the alternative 'fix' is to not account the rt_runtime on
> isolated cpus.. does something like the below actually work?

I haven't tried it, because the exact same thing happens when you
isolate via cpusets directly below root.  One timer, two (or more)
rd->span, so _somebody_ is screwed.

-Mike



* Re: RFC [patch] sched,cgroup_sched: convince RT_GROUP_SCHED throttle to work
  2012-04-15  3:37           ` Mike Galbraith
@ 2012-04-15  3:44             ` Mike Galbraith
  2012-04-15  4:51               ` Mike Galbraith
  0 siblings, 1 reply; 14+ messages in thread
From: Mike Galbraith @ 2012-04-15  3:44 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: LKML, Ingo Molnar

On Sun, 2012-04-15 at 05:37 +0200, Mike Galbraith wrote: 
> On Sat, 2012-04-14 at 13:10 +0200, Peter Zijlstra wrote: 
> > On Tue, 2012-04-10 at 11:08 +0200, Mike Galbraith wrote:
> >  
> > > --- a/kernel/sched/rt.c
> > > +++ b/kernel/sched/rt.c
> > > @@ -782,6 +782,19 @@ static int do_sched_rt_period_timer(stru
> > >  	const struct cpumask *span;
> > >  
> > >  	span = sched_rt_period_mask();
> > > +#ifdef CONFIG_RT_GROUP_SCHED
> > > +	/*
> > > +	 * FIXME: isolated CPUs should really leave the root task group,
> > 
> > No no, that's the wrong fix, the right fix is to remove isolcpus :-)
> 
> Yeah, isolcpus needs to die, but...
> 
> > I guess the alternative 'fix' is to not account the rt_runtime on
> > isolated cpus.. does something like the below actually work?
> 
> I haven't tried it, because the exact same thing happens when you
> isolate via cpusets directly below root.  One timer, two (or more)
> rd->span, so _somebody_ is screwed.

You _could_ bail on !rq->sd I suppose, but the way I hacked around it,
the user can keep the throttle for testing/troubleshooting their
isolated setup, and turn it off in production.  OTOH, auto throttle
disable for all isolated sets could work just as well.

-Mike



* Re: RFC [patch] sched,cgroup_sched: convince RT_GROUP_SCHED throttle to work
  2012-04-15  3:44             ` Mike Galbraith
@ 2012-04-15  4:51               ` Mike Galbraith
  2012-04-18  5:20                 ` Yong Zhang
  0 siblings, 1 reply; 14+ messages in thread
From: Mike Galbraith @ 2012-04-15  4:51 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: LKML, Ingo Molnar

On Sun, 2012-04-15 at 05:44 +0200, Mike Galbraith wrote: 
> On Sun, 2012-04-15 at 05:37 +0200, Mike Galbraith wrote: 
> > On Sat, 2012-04-14 at 13:10 +0200, Peter Zijlstra wrote: 
> > > On Tue, 2012-04-10 at 11:08 +0200, Mike Galbraith wrote:
> > >  
> > > > --- a/kernel/sched/rt.c
> > > > +++ b/kernel/sched/rt.c
> > > > @@ -782,6 +782,19 @@ static int do_sched_rt_period_timer(stru
> > > >  	const struct cpumask *span;
> > > >  
> > > >  	span = sched_rt_period_mask();
> > > > +#ifdef CONFIG_RT_GROUP_SCHED
> > > > +	/*
> > > > +	 * FIXME: isolated CPUs should really leave the root task group,
> > > 
> > > No no, that's the wrong fix, the right fix is to remove isolcpus :-)
> > 
> > Yeah, isolcpus needs to die, but...
> > 
> > > I guess the alternative 'fix' is to not account the rt_runtime on
> > > isolated cpus.. does something like the below actually work?
> > 
> > I haven't tried it, because the exact same thing happens when you
> > isolate via cpusets directly below root.  One timer, two (or more)
> > rd->span, so _somebody_ is screwed.
> 
> You _could_ bail on !rq->sd I suppose, but the way I hacked around it,
> the user can keep the throttle for testing/troubleshooting their
> isolated setup, and turn it off in production.  OTOH, auto throttle
> disable for all isolated sets could work just as well.

Like so seems to work.  I personally like the 2-line version better, but
whatever solves this dinky but deadly problem works for me.

---
 kernel/sched/core.c  |    7 ++++++-
 kernel/sched/rt.c    |    9 +++++++++
 kernel/sched/sched.h |    3 +++
 3 files changed, 18 insertions(+), 1 deletion(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5875,6 +5875,11 @@ cpu_attach_domain(struct sched_domain *s
 			sd->child = NULL;
 	}
 
+	if (sd)
+		cpumask_clear_cpu(cpu, cpu_isolated_map);
+	else
+		cpumask_set_cpu(cpu, cpu_isolated_map);
+
 	sched_domain_debug(sd, cpu);
 
 	rq_attach_root(rq, rd);
@@ -5886,7 +5891,7 @@ cpu_attach_domain(struct sched_domain *s
 }
 
 /* cpus with isolated domains */
-static cpumask_var_t cpu_isolated_map;
+cpumask_var_t cpu_isolated_map;
 
 /* Setup the mask of cpus configured for isolated domains */
 static int __init isolated_cpu_setup(char *str)
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -922,6 +922,9 @@ static void update_curr_rt(struct rq *rq
 	if (!rt_bandwidth_enabled())
 		return;
 
+	if (cpumask_test_cpu(cpu_of(rq), cpu_isolated_map))
+		return;
+
 	for_each_sched_rt_entity(rt_se) {
 		rt_rq = rt_rq_of_se(rt_se);
 
@@ -1014,6 +1017,9 @@ static inline void dec_rt_prio(struct rt
 static void
 inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 {
+	if (cpumask_test_cpu(rq_of_rt_rq(rt_rq)->cpu, cpu_isolated_map))
+		return;
+
 	if (rt_se_boosted(rt_se))
 		rt_rq->rt_nr_boosted++;
 
@@ -1035,6 +1041,9 @@ dec_rt_group(struct sched_rt_entity *rt_
 static void
 inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 {
+	if (cpumask_test_cpu(rq_of_rt_rq(rt_rq)->cpu, cpu_isolated_map))
+		return;
+
 	start_rt_bandwidth(&def_rt_bandwidth);
 }
 
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -73,6 +73,9 @@ struct rt_bandwidth {
 
 extern struct mutex sched_domains_mutex;
 
+/* cpus with isolated domains */
+extern cpumask_var_t cpu_isolated_map;
+
 #ifdef CONFIG_CGROUP_SCHED
 
 #include <linux/cgroup.h>




* Re: RFC [patch] sched,cgroup_sched: convince RT_GROUP_SCHED throttle to work
  2012-04-15  4:51               ` Mike Galbraith
@ 2012-04-18  5:20                 ` Yong Zhang
  2012-04-18  6:27                   ` Mike Galbraith
  0 siblings, 1 reply; 14+ messages in thread
From: Yong Zhang @ 2012-04-18  5:20 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: Peter Zijlstra, LKML, Ingo Molnar

On Sun, Apr 15, 2012 at 06:51:35AM +0200, Mike Galbraith wrote:
> Like so seems to work.  I personally like 2 lines better, but whatever
> solves dinky but deadly problem works for me.
> 
> ---
>  kernel/sched/core.c  |    7 ++++++-
>  kernel/sched/rt.c    |    9 +++++++++
>  kernel/sched/sched.h |    3 +++
>  3 files changed, 18 insertions(+), 1 deletion(-)
> 
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -5875,6 +5875,11 @@ cpu_attach_domain(struct sched_domain *s
>  			sd->child = NULL;
>  	}
>  
> +	if (sd)
> +		cpumask_clear_cpu(cpu, cpu_isolated_map);
> +	else
> +		cpumask_set_cpu(cpu, cpu_isolated_map);
> +

Do we allow this?
Seems sched_domain_debug() will bite !sd.

Thanks,
Yong

>  	sched_domain_debug(sd, cpu);
>  
>  	rq_attach_root(rq, rd);
> @@ -5886,7 +5891,7 @@ cpu_attach_domain(struct sched_domain *s
>  }
>  
>  /* cpus with isolated domains */
> -static cpumask_var_t cpu_isolated_map;
> +cpumask_var_t cpu_isolated_map;
>  
>  /* Setup the mask of cpus configured for isolated domains */
>  static int __init isolated_cpu_setup(char *str)
> --- a/kernel/sched/rt.c
> +++ b/kernel/sched/rt.c
> @@ -922,6 +922,9 @@ static void update_curr_rt(struct rq *rq
>  	if (!rt_bandwidth_enabled())
>  		return;
>  
> +	if (cpumask_test_cpu(cpu_of(rq), cpu_isolated_map))
> +		return;
> +
>  	for_each_sched_rt_entity(rt_se) {
>  		rt_rq = rt_rq_of_se(rt_se);
>  
> @@ -1014,6 +1017,9 @@ static inline void dec_rt_prio(struct rt
>  static void
>  inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
>  {
> +	if (cpumask_test_cpu(rq_of_rt_rq(rt_rq)->cpu, cpu_isolated_map))
> +		return;
> +
>  	if (rt_se_boosted(rt_se))
>  		rt_rq->rt_nr_boosted++;
>  
> @@ -1035,6 +1041,9 @@ dec_rt_group(struct sched_rt_entity *rt_
>  static void
>  inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
>  {
> +	if (cpumask_test_cpu(rq_of_rt_rq(rt_rq)->cpu, cpu_isolated_map))
> +		return;
> +
>  	start_rt_bandwidth(&def_rt_bandwidth);
>  }
>  
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -73,6 +73,9 @@ struct rt_bandwidth {
>  
>  extern struct mutex sched_domains_mutex;
>  
> +/* cpus with isolated domains */
> +extern cpumask_var_t cpu_isolated_map;
> +
>  #ifdef CONFIG_CGROUP_SCHED
>  
>  #include <linux/cgroup.h>
> 
> 

-- 
Only stand for myself


* Re: RFC [patch] sched,cgroup_sched: convince RT_GROUP_SCHED throttle to work
  2012-04-18  5:20                 ` Yong Zhang
@ 2012-04-18  6:27                   ` Mike Galbraith
  2012-04-18  7:48                     ` Yong Zhang
  0 siblings, 1 reply; 14+ messages in thread
From: Mike Galbraith @ 2012-04-18  6:27 UTC (permalink / raw)
  To: Yong Zhang; +Cc: Peter Zijlstra, LKML, Ingo Molnar

On Wed, 2012-04-18 at 13:20 +0800, Yong Zhang wrote:

> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -5875,6 +5875,11 @@ cpu_attach_domain(struct sched_domain *s
> >  			sd->child = NULL;
> >  	}
> >  
> > +	if (sd)
> > +		cpumask_clear_cpu(cpu, cpu_isolated_map);
> > +	else
> > +		cpumask_set_cpu(cpu, cpu_isolated_map);
> > +
> 
> Do we allow this?

Yeah, isolating CPUS 2-3...

[ 3011.444345] CPU0 attaching NULL sched-domain.
[ 3011.448719] CPU1 attaching NULL sched-domain.
[ 3011.453107] CPU2 attaching NULL sched-domain.
[ 3011.457466] CPU3 attaching NULL sched-domain.
[ 3011.461892] CPU0 attaching sched-domain:
[ 3011.465813]  domain 0: span 0-1 level MC
[ 3011.469761]   groups: 0 1
[ 3011.472415] CPU1 attaching sched-domain:
[ 3011.476333]  domain 0: span 0-1 level MC
[ 3011.480266]   groups: 1 0
[ 3011.482988] CPU2 attaching sched-domain:
[ 3011.486919]  domain 0: span 2-3 level MC
[ 3011.490851]   groups: 2 3
[ 3011.493502] CPU3 attaching sched-domain:
[ 3011.497422]  domain 0: span 2-3 level MC
[ 3011.501367]   groups: 3 2
[ 3011.504214] CPU2 attaching NULL sched-domain.
[ 3011.508569] CPU3 attaching NULL sched-domain.




* Re: RFC [patch] sched,cgroup_sched: convince RT_GROUP_SCHED throttle to work
  2012-04-18  6:27                   ` Mike Galbraith
@ 2012-04-18  7:48                     ` Yong Zhang
  2012-04-18  8:38                       ` Mike Galbraith
  0 siblings, 1 reply; 14+ messages in thread
From: Yong Zhang @ 2012-04-18  7:48 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: Peter Zijlstra, LKML, Ingo Molnar

On Wed, Apr 18, 2012 at 08:27:50AM +0200, Mike Galbraith wrote:
> On Wed, 2012-04-18 at 13:20 +0800, Yong Zhang wrote:
> 
> > > --- a/kernel/sched/core.c
> > > +++ b/kernel/sched/core.c
> > > @@ -5875,6 +5875,11 @@ cpu_attach_domain(struct sched_domain *s
> > >  			sd->child = NULL;
> > >  	}
> > >  
> > > +	if (sd)
> > > +		cpumask_clear_cpu(cpu, cpu_isolated_map);
> > > +	else
> > > +		cpumask_set_cpu(cpu, cpu_isolated_map);
> > > +
> > 
> > Do we allow this?
> 
> Yeah, isolating CPUS 2-3...

Hmm...magic cpuset ;-)

> 
> [ 3011.444345] CPU0 attaching NULL sched-domain.
> [ 3011.448719] CPU1 attaching NULL sched-domain.
> [ 3011.453107] CPU2 attaching NULL sched-domain.
> [ 3011.457466] CPU3 attaching NULL sched-domain.
> [ 3011.461892] CPU0 attaching sched-domain:
> [ 3011.465813]  domain 0: span 0-1 level MC
> [ 3011.469761]   groups: 0 1
> [ 3011.472415] CPU1 attaching sched-domain:
> [ 3011.476333]  domain 0: span 0-1 level MC
> [ 3011.480266]   groups: 1 0
> [ 3011.482988] CPU2 attaching sched-domain:
> [ 3011.486919]  domain 0: span 2-3 level MC
> [ 3011.490851]   groups: 2 3
> [ 3011.493502] CPU3 attaching sched-domain:
> [ 3011.497422]  domain 0: span 2-3 level MC
> [ 3011.501367]   groups: 3 2
> [ 3011.504214] CPU2 attaching NULL sched-domain.
> [ 3011.508569] CPU3 attaching NULL sched-domain.

But another scenario comes to mind:
what happens when an rt_rq is throttled while the very CPU it sits on
is attached to the NULL domain?

Thanks,
Yong

> 
> 

-- 
Only stand for myself


* Re: RFC [patch] sched,cgroup_sched: convince RT_GROUP_SCHED throttle to work
  2012-04-18  7:48                     ` Yong Zhang
@ 2012-04-18  8:38                       ` Mike Galbraith
  2012-04-19  6:34                         ` Yong Zhang
  0 siblings, 1 reply; 14+ messages in thread
From: Mike Galbraith @ 2012-04-18  8:38 UTC (permalink / raw)
  To: Yong Zhang; +Cc: Peter Zijlstra, LKML, Ingo Molnar

On Wed, 2012-04-18 at 15:48 +0800, Yong Zhang wrote: 
> On Wed, Apr 18, 2012 at 08:27:50AM +0200, Mike Galbraith wrote:
> > On Wed, 2012-04-18 at 13:20 +0800, Yong Zhang wrote:
> > 
> > > > --- a/kernel/sched/core.c
> > > > +++ b/kernel/sched/core.c
> > > > @@ -5875,6 +5875,11 @@ cpu_attach_domain(struct sched_domain *s
> > > >  			sd->child = NULL;
> > > >  	}
> > > >  
> > > > +	if (sd)
> > > > +		cpumask_clear_cpu(cpu, cpu_isolated_map);
> > > > +	else
> > > > +		cpumask_set_cpu(cpu, cpu_isolated_map);
> > > > +
> > > 
> > > Do we allow this?
> > 
> > Yeah, isolating CPUS 2-3...
> 
> Hmm...magic cpuset ;-)

The _only_ cpuset if you have a low jitter requirement. 

> > [ 3011.444345] CPU0 attaching NULL sched-domain.
> > [ 3011.448719] CPU1 attaching NULL sched-domain.
> > [ 3011.453107] CPU2 attaching NULL sched-domain.
> > [ 3011.457466] CPU3 attaching NULL sched-domain.
> > [ 3011.461892] CPU0 attaching sched-domain:
> > [ 3011.465813]  domain 0: span 0-1 level MC
> > [ 3011.469761]   groups: 0 1
> > [ 3011.472415] CPU1 attaching sched-domain:
> > [ 3011.476333]  domain 0: span 0-1 level MC
> > [ 3011.480266]   groups: 1 0
> > [ 3011.482988] CPU2 attaching sched-domain:
> > [ 3011.486919]  domain 0: span 2-3 level MC
> > [ 3011.490851]   groups: 2 3
> > [ 3011.493502] CPU3 attaching sched-domain:
> > [ 3011.497422]  domain 0: span 2-3 level MC
> > [ 3011.501367]   groups: 3 2
> > [ 3011.504214] CPU2 attaching NULL sched-domain.
> > [ 3011.508569] CPU3 attaching NULL sched-domain.
> 
> But another scenario comes into head:
> What will happen when a rt_rq is throttled the very CPU is
> attached to NULL domain?

Yup, _somebody_ will hit the throttle only once :)

That's what gets fixed by either killing the throttle entirely for
isolated CPUs, or making the throttle work until the guy who needed
isolation turns the noise maker off.  I prefer reconnecting the dots,
because that doesn't touch the fast path.

-Mike



* Re: RFC [patch] sched,cgroup_sched: convince RT_GROUP_SCHED throttle to work
  2012-04-18  8:38                       ` Mike Galbraith
@ 2012-04-19  6:34                         ` Yong Zhang
  0 siblings, 0 replies; 14+ messages in thread
From: Yong Zhang @ 2012-04-19  6:34 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: Peter Zijlstra, LKML, Ingo Molnar

On Wed, Apr 18, 2012 at 10:38:07AM +0200, Mike Galbraith wrote:
> On Wed, 2012-04-18 at 15:48 +0800, Yong Zhang wrote: 
> > On Wed, Apr 18, 2012 at 08:27:50AM +0200, Mike Galbraith wrote:
> > > On Wed, 2012-04-18 at 13:20 +0800, Yong Zhang wrote:
> > > 
> > > > > --- a/kernel/sched/core.c
> > > > > +++ b/kernel/sched/core.c
> > > > > @@ -5875,6 +5875,11 @@ cpu_attach_domain(struct sched_domain *s
> > > > >  			sd->child = NULL;
> > > > >  	}
> > > > >  
> > > > > +	if (sd)
> > > > > +		cpumask_clear_cpu(cpu, cpu_isolated_map);
> > > > > +	else
> > > > > +		cpumask_set_cpu(cpu, cpu_isolated_map);
> > > > > +
> > > > 
> > > > Do we allow this?
> > > 
> > > Yeah, isolating CPUS 2-3...
> > 
> > Hmm...magic cpuset ;-)
> 
> The _only_ cpuset if you have a low jitter requirement. 
> 
> > > [ 3011.444345] CPU0 attaching NULL sched-domain.
> > > [ 3011.448719] CPU1 attaching NULL sched-domain.
> > > [ 3011.453107] CPU2 attaching NULL sched-domain.
> > > [ 3011.457466] CPU3 attaching NULL sched-domain.
> > > [ 3011.461892] CPU0 attaching sched-domain:
> > > [ 3011.465813]  domain 0: span 0-1 level MC
> > > [ 3011.469761]   groups: 0 1
> > > [ 3011.472415] CPU1 attaching sched-domain:
> > > [ 3011.476333]  domain 0: span 0-1 level MC
> > > [ 3011.480266]   groups: 1 0
> > > [ 3011.482988] CPU2 attaching sched-domain:
> > > [ 3011.486919]  domain 0: span 2-3 level MC
> > > [ 3011.490851]   groups: 2 3
> > > [ 3011.493502] CPU3 attaching sched-domain:
> > > [ 3011.497422]  domain 0: span 2-3 level MC
> > > [ 3011.501367]   groups: 3 2
> > > [ 3011.504214] CPU2 attaching NULL sched-domain.
> > > [ 3011.508569] CPU3 attaching NULL sched-domain.
> > 
> > But another scenario comes into head:
> > What will happen when a rt_rq is throttled the very CPU is
> > attached to NULL domain?
> 
> Yup, _somebody_ will hit the throttle only once :)
> 
> That's what gets fixed by either killing the throttle entirely for
> isolated CPUs, or making the throttle work until the guy who needed
> isolation turns noise maker off.  I prefer reconnect the dots, because
> that doesn't touch the fast path.

I like it better too :)

Thanks,
Yong
