linux-kernel.vger.kernel.org archive mirror
* [PATCH v3 1/3] sched/fair: Disable runtime_enabled on dying rq
       [not found] <20140625081502.2861.12526.stgit@tkhai>
@ 2014-06-25  8:19 ` Kirill Tkhai
  2014-06-25 17:57   ` bsegall
  2014-07-05 10:45   ` [tip:sched/core] " tip-bot for Kirill Tkhai
  2014-06-25  8:19 ` [PATCH v3 2/3] sched/rt: __disable_runtime: Enqueue just unthrottled rt_rq back on the stack Kirill Tkhai
  2014-06-25  8:19 ` [PATCH v3 3/3] sched: Rework check_for_tasks() Kirill Tkhai
  2 siblings, 2 replies; 10+ messages in thread
From: Kirill Tkhai @ 2014-06-25  8:19 UTC (permalink / raw)
  To: linux-kernel
  Cc: Peter Zijlstra, Ingo Molnar, tkhai, Srikar Dronamraju,
	Mike Galbraith, khorenko, Ben Segall, Paul Turner


We kill rq->rd on the CPU_DOWN_PREPARE stage:

	cpuset_cpu_inactive -> cpuset_update_active_cpus -> partition_sched_domains ->
	-> cpu_attach_domain -> rq_attach_root -> set_rq_offline

This unthrottles all throttled cfs_rqs.

But the cpu is still able to call schedule() till

	take_cpu_down->__cpu_disable()

is called from stop_machine.

In this case the tasks from the just-unthrottled cfs_rqs are pickable
in the standard scheduler way, and they are picked by the dying cpu.
The cfs_rqs become throttled again, and migrate_tasks()
in migration_call() skips their tasks (one more unthrottle
in migrate_tasks()->CPU_DYING does not happen, because rq->rd
is already NULL).

This patch sets runtime_enabled to zero. This guarantees that the
runtime is not accounted, that the cfs_rqs won't use up the given
cfs_rq->runtime_remaining = 1, and that tasks will be pickable
in migrate_tasks(). runtime_enabled is recalculated when
the rq becomes online again.
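
For reference, the accounting path is gated on runtime_enabled roughly
like this (simplified sketch of the fair-class accounting hook, helper
bodies elided):

	static __always_inline void
	account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
	{
		/*
		 * With runtime_enabled == 0 nothing is accounted, so the
		 * cfs_rq's runtime_remaining = 1 is never used up and the
		 * cfs_rq cannot be throttled again on the dying cpu.
		 */
		if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
			return;

		__account_cfs_rq_runtime(cfs_rq, delta_exec);
	}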

Ben Segall also noticed that we always enable runtime in
tg_set_cfs_bandwidth(). Actually, we should do that for online
cpus only. To prevent races with unthrottle_offline_cfs_rqs(),
we take the get_online_cpus() lock.

v2: Fix race with tg_set_cfs_bandwidth().
    Move cfs_rq->runtime_enabled=0 above unthrottle_cfs_rq().
v3: {get,put}_online_cpus()

Signed-off-by: Kirill Tkhai <ktkhai@parallels.com>
CC: Konstantin Khorenko <khorenko@parallels.com>
CC: Ben Segall <bsegall@google.com>
CC: Paul Turner <pjt@google.com>
CC: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
CC: Mike Galbraith <umgwanakikbuti@gmail.com>
CC: Peter Zijlstra <peterz@infradead.org>
CC: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c |    8 +++++++-
 kernel/sched/fair.c |   22 ++++++++++++++++++++++
 2 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ceea8d0..d9c4a08 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7807,6 +7807,11 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
 	if (period > max_cfs_quota_period)
 		return -EINVAL;
 
+	/*
+	 * Prevent race between setting of cfs_rq->runtime_enabled and
+	 * unthrottle_offline_cfs_rqs().
+	 */
+	get_online_cpus();
 	mutex_lock(&cfs_constraints_mutex);
 	ret = __cfs_schedulable(tg, period, quota);
 	if (ret)
@@ -7832,7 +7837,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
 	}
 	raw_spin_unlock_irq(&cfs_b->lock);
 
-	for_each_possible_cpu(i) {
+	for_each_online_cpu(i) {
 		struct cfs_rq *cfs_rq = tg->cfs_rq[i];
 		struct rq *rq = cfs_rq->rq;
 
@@ -7848,6 +7853,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
 		cfs_bandwidth_usage_dec();
 out_unlock:
 	mutex_unlock(&cfs_constraints_mutex);
+	put_online_cpus();
 
 	return ret;
 }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1f9c457..5616d23 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3776,6 +3776,19 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 	hrtimer_cancel(&cfs_b->slack_timer);
 }
 
+static void __maybe_unused update_runtime_enabled(struct rq *rq)
+{
+	struct cfs_rq *cfs_rq;
+
+	for_each_leaf_cfs_rq(rq, cfs_rq) {
+		struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
+
+		raw_spin_lock(&cfs_b->lock);
+		cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
+		raw_spin_unlock(&cfs_b->lock);
+	}
+}
+
 static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
 {
 	struct cfs_rq *cfs_rq;
@@ -3789,6 +3802,12 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
 		 * there's some valid quota amount
 		 */
 		cfs_rq->runtime_remaining = 1;
+		/*
+		 * Offline rq is schedulable till cpu is completely disabled
+		 * in take_cpu_down(), so we prevent new cfs throttling here.
+		 */
+		cfs_rq->runtime_enabled = 0;
+
 		if (cfs_rq_throttled(cfs_rq))
 			unthrottle_cfs_rq(cfs_rq);
 	}
@@ -3832,6 +3851,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
 	return NULL;
 }
 static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
+static inline void update_runtime_enabled(struct rq *rq) {}
 static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
 
 #endif /* CONFIG_CFS_BANDWIDTH */
@@ -7325,6 +7345,8 @@ void trigger_load_balance(struct rq *rq)
 static void rq_online_fair(struct rq *rq)
 {
 	update_sysctl();
+
+	update_runtime_enabled(rq);
 }
 
 static void rq_offline_fair(struct rq *rq)





* [PATCH v3 2/3] sched/rt: __disable_runtime: Enqueue just unthrottled rt_rq back on the stack
       [not found] <20140625081502.2861.12526.stgit@tkhai>
  2014-06-25  8:19 ` [PATCH v3 1/3] sched/fair: Disable runtime_enabled on dying rq Kirill Tkhai
@ 2014-06-25  8:19 ` Kirill Tkhai
  2014-06-26 11:08   ` Srikar Dronamraju
                     ` (2 more replies)
  2014-06-25  8:19 ` [PATCH v3 3/3] sched: Rework check_for_tasks() Kirill Tkhai
  2 siblings, 3 replies; 10+ messages in thread
From: Kirill Tkhai @ 2014-06-25  8:19 UTC (permalink / raw)
  To: linux-kernel
  Cc: Peter Zijlstra, Ingo Molnar, tkhai, Srikar Dronamraju,
	Mike Galbraith, khorenko, Ben Segall, Paul Turner


Make the rt_rq available for pick_next_task(). Otherwise, its tasks
stay imprisoned for a long time, until the dead cpu becomes alive again.

Signed-off-by: Kirill Tkhai <ktkhai@parallels.com>
CC: Konstantin Khorenko <khorenko@parallels.com>
CC: Ben Segall <bsegall@google.com>
CC: Paul Turner <pjt@google.com>
CC: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
CC: Mike Galbraith <umgwanakikbuti@gmail.com>
CC: Peter Zijlstra <peterz@infradead.org>
CC: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/rt.c |    3 +++
 1 file changed, 3 insertions(+)

diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index a490831..671a8b5 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -740,6 +740,9 @@ static void __disable_runtime(struct rq *rq)
 		rt_rq->rt_throttled = 0;
 		raw_spin_unlock(&rt_rq->rt_runtime_lock);
 		raw_spin_unlock(&rt_b->rt_runtime_lock);
+
+		/* Make rt_rq available for pick_next_task() */
+		sched_rt_rq_enqueue(rt_rq);
 	}
 }
 





* [PATCH v3 3/3] sched: Rework check_for_tasks()
       [not found] <20140625081502.2861.12526.stgit@tkhai>
  2014-06-25  8:19 ` [PATCH v3 1/3] sched/fair: Disable runtime_enabled on dying rq Kirill Tkhai
  2014-06-25  8:19 ` [PATCH v3 2/3] sched/rt: __disable_runtime: Enqueue just unthrottled rt_rq back on the stack Kirill Tkhai
@ 2014-06-25  8:19 ` Kirill Tkhai
  2014-06-26 11:30   ` Srikar Dronamraju
  2014-07-05 10:46   ` [tip:sched/core] " tip-bot for Kirill Tkhai
  2 siblings, 2 replies; 10+ messages in thread
From: Kirill Tkhai @ 2014-06-25  8:19 UTC (permalink / raw)
  To: linux-kernel
  Cc: Peter Zijlstra, Ingo Molnar, tkhai, Srikar Dronamraju,
	Mike Galbraith, khorenko, Ben Segall, Paul Turner


1) Iterate through all threads in the system.
   Check all threads, not only group leaders.

2) Check p->on_rq instead of p->state and cputime.
   A preempted task in a !TASK_RUNNING state OR a just-created
   task may be queued, and we want those to be reported too
   (see the sketch after this list).

3) Use read_lock() instead of write_lock().
   This function does not change any structures, so
   read_lock() is enough.
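
A minimal sketch of the lockless check from (2), for illustration only
(the applied code is in the hunk below):

	if (p->on_rq) {
		/*
		 * Read p->on_rq before task_cpu(p).  Without the barrier
		 * we could observe a stale task_cpu() together with the
		 * new on_rq value and warn about a task that has already
		 * been woken on another cpu.
		 */
		rmb();
		if (task_cpu(p) == dead_cpu)
			pr_warn("Task %s (pid=%d) is on cpu %d\n",
				p->comm, task_pid_nr(p), dead_cpu);
	}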

Signed-off-by: Kirill Tkhai <ktkhai@parallels.com>
CC: Konstantin Khorenko <khorenko@parallels.com>
CC: Ben Segall <bsegall@google.com>
CC: Paul Turner <pjt@google.com>
CC: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
CC: Mike Galbraith <umgwanakikbuti@gmail.com>
CC: Peter Zijlstra <peterz@infradead.org>
CC: Ingo Molnar <mingo@kernel.org>
---
 kernel/cpu.c |   33 ++++++++++++++++++++-------------
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/kernel/cpu.c b/kernel/cpu.c
index a343bde..81e2a38 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -274,21 +274,28 @@ void clear_tasks_mm_cpumask(int cpu)
 	rcu_read_unlock();
 }
 
-static inline void check_for_tasks(int cpu)
+static inline void check_for_tasks(int dead_cpu)
 {
-	struct task_struct *p;
-	cputime_t utime, stime;
+	struct task_struct *g, *p;
 
-	write_lock_irq(&tasklist_lock);
-	for_each_process(p) {
-		task_cputime(p, &utime, &stime);
-		if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
-		    (utime || stime))
-			pr_warn("Task %s (pid = %d) is on cpu %d (state = %ld, flags = %x)\n",
-				p->comm, task_pid_nr(p), cpu,
-				p->state, p->flags);
-	}
-	write_unlock_irq(&tasklist_lock);
+	read_lock_irq(&tasklist_lock);
+	do_each_thread(g, p) {
+		if (!p->on_rq)
+			continue;
+		/*
+		 * We do the check with unlocked task_rq(p)->lock.
+		 * Order the reading to do not warn about a task,
+		 * which was running on this cpu in the past, and
+		 * it's just been woken on another cpu.
+		 */
+		rmb();
+		if (task_cpu(p) != dead_cpu)
+			continue;
+
+		pr_warn("Task %s (pid=%d) is on cpu %d (state=%ld, flags=%x)\n",
+			p->comm, task_pid_nr(p), dead_cpu, p->state, p->flags);
+	} while_each_thread(g, p);
+	read_unlock_irq(&tasklist_lock);
 }
 
 struct take_cpu_down_param {





* Re: [PATCH v3 1/3] sched/fair: Disable runtime_enabled on dying rq
  2014-06-25  8:19 ` [PATCH v3 1/3] sched/fair: Disable runtime_enabled on dying rq Kirill Tkhai
@ 2014-06-25 17:57   ` bsegall
  2014-07-05 10:45   ` [tip:sched/core] " tip-bot for Kirill Tkhai
  1 sibling, 0 replies; 10+ messages in thread
From: bsegall @ 2014-06-25 17:57 UTC (permalink / raw)
  To: Kirill Tkhai
  Cc: linux-kernel, Peter Zijlstra, Ingo Molnar, tkhai,
	Srikar Dronamraju, Mike Galbraith, khorenko, Paul Turner

Kirill Tkhai <ktkhai@parallels.com> writes:

> We kill rq->rd on the CPU_DOWN_PREPARE stage:
>
> 	cpuset_cpu_inactive -> cpuset_update_active_cpus -> partition_sched_domains ->
> 	-> cpu_attach_domain -> rq_attach_root -> set_rq_offline
>
> This unthrottles all throttled cfs_rqs.
>
> But the cpu is still able to call schedule() till
>
> 	take_cpu_down->__cpu_disable()
>
> is called from stop_machine.
>
> In this case the tasks from the just-unthrottled cfs_rqs are pickable
> in the standard scheduler way, and they are picked by the dying cpu.
> The cfs_rqs become throttled again, and migrate_tasks()
> in migration_call() skips their tasks (one more unthrottle
> in migrate_tasks()->CPU_DYING does not happen, because rq->rd
> is already NULL).
>
> This patch sets runtime_enabled to zero. This guarantees that the
> runtime is not accounted, that the cfs_rqs won't use up the given
> cfs_rq->runtime_remaining = 1, and that tasks will be pickable
> in migrate_tasks(). runtime_enabled is recalculated when
> the rq becomes online again.
>
> Ben Segall also noticed that we always enable runtime in
> tg_set_cfs_bandwidth(). Actually, we should do that for online
> cpus only. To prevent races with unthrottle_offline_cfs_rqs(),
> we take the get_online_cpus() lock.
>
> v2: Fix race with tg_set_cfs_bandwidth().
>     Move cfs_rq->runtime_enabled=0 above unthrottle_cfs_rq().
> v3: {get,put}_online_cpus()
>
> Signed-off-by: Kirill Tkhai <ktkhai@parallels.com>
> CC: Konstantin Khorenko <khorenko@parallels.com>
> CC: Ben Segall <bsegall@google.com>
> CC: Paul Turner <pjt@google.com>
> CC: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
> CC: Mike Galbraith <umgwanakikbuti@gmail.com>
> CC: Peter Zijlstra <peterz@infradead.org>
> CC: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Ben Segall <bsegall@google.com>

Although #ifdefing rq->online might work too. It's possibly faster but
a bit uglier to look at; I don't care which way it is done.
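
Something along these lines, as a rough illustration only (untested;
rq->online only exists under CONFIG_SMP, hence the #ifdef):

	for_each_possible_cpu(i) {
		struct cfs_rq *cfs_rq = tg->cfs_rq[i];
		struct rq *rq = cfs_rq->rq;

#ifdef CONFIG_SMP
		/* rq has already gone through set_rq_offline() */
		if (!rq->online)
			continue;
#endif
		/* ... take rq->lock and update runtime_enabled as before ... */
	}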


> ---
>  kernel/sched/core.c |    8 +++++++-
>  kernel/sched/fair.c |   22 ++++++++++++++++++++++
>  2 files changed, 29 insertions(+), 1 deletion(-)
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index ceea8d0..d9c4a08 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -7807,6 +7807,11 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
>  	if (period > max_cfs_quota_period)
>  		return -EINVAL;
>  
> +	/*
> +	 * Prevent race between setting of cfs_rq->runtime_enabled and
> +	 * unthrottle_offline_cfs_rqs().
> +	 */
> +	get_online_cpus();
>  	mutex_lock(&cfs_constraints_mutex);
>  	ret = __cfs_schedulable(tg, period, quota);
>  	if (ret)
> @@ -7832,7 +7837,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
>  	}
>  	raw_spin_unlock_irq(&cfs_b->lock);
>  
> -	for_each_possible_cpu(i) {
> +	for_each_online_cpu(i) {
>  		struct cfs_rq *cfs_rq = tg->cfs_rq[i];
>  		struct rq *rq = cfs_rq->rq;
>  
> @@ -7848,6 +7853,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
>  		cfs_bandwidth_usage_dec();
>  out_unlock:
>  	mutex_unlock(&cfs_constraints_mutex);
> +	put_online_cpus();
>  
>  	return ret;
>  }
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 1f9c457..5616d23 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -3776,6 +3776,19 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
>  	hrtimer_cancel(&cfs_b->slack_timer);
>  }
>  
> +static void __maybe_unused update_runtime_enabled(struct rq *rq)
> +{
> +	struct cfs_rq *cfs_rq;
> +
> +	for_each_leaf_cfs_rq(rq, cfs_rq) {
> +		struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
> +
> +		raw_spin_lock(&cfs_b->lock);
> +		cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
> +		raw_spin_unlock(&cfs_b->lock);
> +	}
> +}
> +
>  static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
>  {
>  	struct cfs_rq *cfs_rq;
> @@ -3789,6 +3802,12 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
>  		 * there's some valid quota amount
>  		 */
>  		cfs_rq->runtime_remaining = 1;
> +		/*
> +		 * Offline rq is schedulable till cpu is completely disabled
> +		 * in take_cpu_down(), so we prevent new cfs throttling here.
> +		 */
> +		cfs_rq->runtime_enabled = 0;
> +
>  		if (cfs_rq_throttled(cfs_rq))
>  			unthrottle_cfs_rq(cfs_rq);
>  	}
> @@ -3832,6 +3851,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
>  	return NULL;
>  }
>  static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
> +static inline void update_runtime_enabled(struct rq *rq) {}
>  static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
>  
>  #endif /* CONFIG_CFS_BANDWIDTH */
> @@ -7325,6 +7345,8 @@ void trigger_load_balance(struct rq *rq)
>  static void rq_online_fair(struct rq *rq)
>  {
>  	update_sysctl();
> +
> +	update_runtime_enabled(rq);
>  }
>  
>  static void rq_offline_fair(struct rq *rq)


* Re: [PATCH v3 2/3] sched/rt: __disable_runtime: Enqueue just unthrottled rt_rq back on the stack
  2014-06-25  8:19 ` [PATCH v3 2/3] sched/rt: __disable_runtime: Enqueue just unthrottled rt_rq back on the stack Kirill Tkhai
@ 2014-06-26 11:08   ` Srikar Dronamraju
  2014-06-26 11:09   ` Srikar Dronamraju
  2014-07-05 10:46   ` [tip:sched/core] sched/rt: Enqueue just unthrottled rt_rq back on the stack in __disable_runtime() tip-bot for Kirill Tkhai
  2 siblings, 0 replies; 10+ messages in thread
From: Srikar Dronamraju @ 2014-06-26 11:08 UTC (permalink / raw)
  To: Kirill Tkhai
  Cc: linux-kernel, Peter Zijlstra, Ingo Molnar, tkhai, Mike Galbraith,
	khorenko, Ben Segall, Paul Turner

* Kirill Tkhai <ktkhai@parallels.com> [2014-06-25 12:19:48]:

> 
> Make the rt_rq available for pick_next_task(). Otherwise, its tasks
> stay imprisoned for a long time, until the dead cpu becomes alive again.
> 
> Signed-off-by: Kirill Tkhai <ktkhai@parallels.com>
> CC: Konstantin Khorenko <khorenko@parallels.com>
> CC: Ben Segall <bsegall@google.com>
> CC: Paul Turner <pjt@google.com>
> CC: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
> CC: Mike Galbraith <umgwanakikbuti@gmail.com>
> CC: Peter Zijlstra <peterz@infradead.org>
> CC: Ingo Molnar <mingo@kernel.org>
> ---
Looks good to me.

Reviewed-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>

-- 
Thanks and Regards
Srikar Dronamraju



* Re: [PATCH v3 2/3] sched/rt: __disable_runtime: Enqueue just unthrottled rt_rq back on the stack
  2014-06-25  8:19 ` [PATCH v3 2/3] sched/rt: __disable_runtime: Enqueue just unthrottled rt_rq back on the stack Kirill Tkhai
  2014-06-26 11:08   ` Srikar Dronamraju
@ 2014-06-26 11:09   ` Srikar Dronamraju
  2014-07-05 10:46   ` [tip:sched/core] sched/rt: Enqueue just unthrottled rt_rq back on the stack in __disable_runtime() tip-bot for Kirill Tkhai
  2 siblings, 0 replies; 10+ messages in thread
From: Srikar Dronamraju @ 2014-06-26 11:09 UTC (permalink / raw)
  To: Kirill Tkhai
  Cc: linux-kernel, Peter Zijlstra, Ingo Molnar, tkhai, Mike Galbraith,
	khorenko, Ben Segall, Paul Turner

* Kirill Tkhai <ktkhai@parallels.com> [2014-06-25 12:19:48]:

> 
> Make the rt_rq available for pick_next_task(). Otherwise, its tasks
> stay imprisoned for a long time, until the dead cpu becomes alive again.
> 
> Signed-off-by: Kirill Tkhai <ktkhai@parallels.com>
> CC: Konstantin Khorenko <khorenko@parallels.com>
> CC: Ben Segall <bsegall@google.com>
> CC: Paul Turner <pjt@google.com>
> CC: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
> CC: Mike Galbraith <umgwanakikbuti@gmail.com>
> CC: Peter Zijlstra <peterz@infradead.org>
> CC: Ingo Molnar <mingo@kernel.org>
> ---

Reviewed-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>

-- 
Thanks and Regards
Srikar Dronamraju



* Re: [PATCH v3 3/3] sched: Rework check_for_tasks()
  2014-06-25  8:19 ` [PATCH v3 3/3] sched: Rework check_for_tasks() Kirill Tkhai
@ 2014-06-26 11:30   ` Srikar Dronamraju
  2014-07-05 10:46   ` [tip:sched/core] " tip-bot for Kirill Tkhai
  1 sibling, 0 replies; 10+ messages in thread
From: Srikar Dronamraju @ 2014-06-26 11:30 UTC (permalink / raw)
  To: Kirill Tkhai
  Cc: linux-kernel, Peter Zijlstra, Ingo Molnar, tkhai, Mike Galbraith,
	khorenko, Ben Segall, Paul Turner

* Kirill Tkhai <ktkhai@parallels.com> [2014-06-25 12:19:55]:

> 
> 1) Iterate through all threads in the system.
>    Check all threads, not only group leaders.
>
> 2) Check p->on_rq instead of p->state and cputime.
>    A preempted task in a !TASK_RUNNING state OR a just-created
>    task may be queued, and we want those to be reported too.
>
> 3) Use read_lock() instead of write_lock().
>    This function does not change any structures, so
>    read_lock() is enough.
> 
> Signed-off-by: Kirill Tkhai <ktkhai@parallels.com>
> CC: Konstantin Khorenko <khorenko@parallels.com>
> CC: Ben Segall <bsegall@google.com>
> CC: Paul Turner <pjt@google.com>
> CC: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
> CC: Mike Galbraith <umgwanakikbuti@gmail.com>
> CC: Peter Zijlstra <peterz@infradead.org>
> CC: Ingo Molnar <mingo@kernel.org>
> ---
 
Reviewed-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>

-- 
Thanks and Regards
Srikar Dronamraju



* [tip:sched/core] sched/fair: Disable runtime_enabled on dying rq
  2014-06-25  8:19 ` [PATCH v3 1/3] sched/fair: Disable runtime_enabled on dying rq Kirill Tkhai
  2014-06-25 17:57   ` bsegall
@ 2014-07-05 10:45   ` tip-bot for Kirill Tkhai
  1 sibling, 0 replies; 10+ messages in thread
From: tip-bot for Kirill Tkhai @ 2014-07-05 10:45 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: linux-kernel, bsegall, ktkhai, hpa, mingo, torvalds, peterz, pjt,
	umgwanakikbuti, khorenko, srikar, tglx

Commit-ID:  0e59bdaea75f12a7d7c03672f4ac22c0119a1bc0
Gitweb:     http://git.kernel.org/tip/0e59bdaea75f12a7d7c03672f4ac22c0119a1bc0
Author:     Kirill Tkhai <ktkhai@parallels.com>
AuthorDate: Wed, 25 Jun 2014 12:19:42 +0400
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Sat, 5 Jul 2014 11:17:42 +0200

sched/fair: Disable runtime_enabled on dying rq

We kill rq->rd on the CPU_DOWN_PREPARE stage:

	cpuset_cpu_inactive -> cpuset_update_active_cpus -> partition_sched_domains ->
	-> cpu_attach_domain -> rq_attach_root -> set_rq_offline

This unthrottles all throttled cfs_rqs.

But the cpu is still able to call schedule() till

	take_cpu_down->__cpu_disable()

is called from stop_machine.

In this case the tasks from the just-unthrottled cfs_rqs are pickable
in the standard scheduler way, and they are picked by the dying cpu.
The cfs_rqs become throttled again, and migrate_tasks()
in migration_call() skips their tasks (one more unthrottle
in migrate_tasks()->CPU_DYING does not happen, because rq->rd
is already NULL).

This patch sets runtime_enabled to zero. This guarantees that the
runtime is not accounted, that the cfs_rqs won't use up the given
cfs_rq->runtime_remaining = 1, and that tasks will be pickable
in migrate_tasks(). runtime_enabled is recalculated when
the rq becomes online again.

Ben Segall also noticed that we always enable runtime in
tg_set_cfs_bandwidth(). Actually, we should do that for online
cpus only. To prevent races with unthrottle_offline_cfs_rqs(),
we take the get_online_cpus() lock.

Reviewed-by: Ben Segall <bsegall@google.com>
Reviewed-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Signed-off-by: Kirill Tkhai <ktkhai@parallels.com>
CC: Konstantin Khorenko <khorenko@parallels.com>
CC: Paul Turner <pjt@google.com>
CC: Mike Galbraith <umgwanakikbuti@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1403684382.3462.42.camel@tkhai
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c |  8 +++++++-
 kernel/sched/fair.c | 22 ++++++++++++++++++++++
 2 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e50234b..2dbc63d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7817,6 +7817,11 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
 	if (period > max_cfs_quota_period)
 		return -EINVAL;
 
+	/*
+	 * Prevent race between setting of cfs_rq->runtime_enabled and
+	 * unthrottle_offline_cfs_rqs().
+	 */
+	get_online_cpus();
 	mutex_lock(&cfs_constraints_mutex);
 	ret = __cfs_schedulable(tg, period, quota);
 	if (ret)
@@ -7842,7 +7847,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
 	}
 	raw_spin_unlock_irq(&cfs_b->lock);
 
-	for_each_possible_cpu(i) {
+	for_each_online_cpu(i) {
 		struct cfs_rq *cfs_rq = tg->cfs_rq[i];
 		struct rq *rq = cfs_rq->rq;
 
@@ -7858,6 +7863,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
 		cfs_bandwidth_usage_dec();
 out_unlock:
 	mutex_unlock(&cfs_constraints_mutex);
+	put_online_cpus();
 
 	return ret;
 }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a140c6a..923fe32 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3798,6 +3798,19 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 	hrtimer_cancel(&cfs_b->slack_timer);
 }
 
+static void __maybe_unused update_runtime_enabled(struct rq *rq)
+{
+	struct cfs_rq *cfs_rq;
+
+	for_each_leaf_cfs_rq(rq, cfs_rq) {
+		struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
+
+		raw_spin_lock(&cfs_b->lock);
+		cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
+		raw_spin_unlock(&cfs_b->lock);
+	}
+}
+
 static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
 {
 	struct cfs_rq *cfs_rq;
@@ -3811,6 +3824,12 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
 		 * there's some valid quota amount
 		 */
 		cfs_rq->runtime_remaining = 1;
+		/*
+		 * Offline rq is schedulable till cpu is completely disabled
+		 * in take_cpu_down(), so we prevent new cfs throttling here.
+		 */
+		cfs_rq->runtime_enabled = 0;
+
 		if (cfs_rq_throttled(cfs_rq))
 			unthrottle_cfs_rq(cfs_rq);
 	}
@@ -3854,6 +3873,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
 	return NULL;
 }
 static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
+static inline void update_runtime_enabled(struct rq *rq) {}
 static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
 
 #endif /* CONFIG_CFS_BANDWIDTH */
@@ -7362,6 +7382,8 @@ void trigger_load_balance(struct rq *rq)
 static void rq_online_fair(struct rq *rq)
 {
 	update_sysctl();
+
+	update_runtime_enabled(rq);
 }
 
 static void rq_offline_fair(struct rq *rq)


* [tip:sched/core] sched/rt: Enqueue just unthrottled rt_rq back on the stack in __disable_runtime()
  2014-06-25  8:19 ` [PATCH v3 2/3] sched/rt: __disable_runtime: Enqueue just unthrottled rt_rq back on the stack Kirill Tkhai
  2014-06-26 11:08   ` Srikar Dronamraju
  2014-06-26 11:09   ` Srikar Dronamraju
@ 2014-07-05 10:46   ` tip-bot for Kirill Tkhai
  2 siblings, 0 replies; 10+ messages in thread
From: tip-bot for Kirill Tkhai @ 2014-07-05 10:46 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: linux-kernel, bsegall, ktkhai, hpa, mingo, torvalds, peterz, pjt,
	umgwanakikbuti, khorenko, srikar, tglx

Commit-ID:  99b625670f1447ecf0739161efbe7f2f43c0e0b6
Gitweb:     http://git.kernel.org/tip/99b625670f1447ecf0739161efbe7f2f43c0e0b6
Author:     Kirill Tkhai <ktkhai@parallels.com>
AuthorDate: Wed, 25 Jun 2014 12:19:48 +0400
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Sat, 5 Jul 2014 11:17:44 +0200

sched/rt: Enqueue just unthrottled rt_rq back on the stack in __disable_runtime()

Make the rt_rq available for pick_next_task(). Otherwise, its tasks
stay imprisoned for a long time, until the dead cpu becomes alive again.

Reviewed-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Signed-off-by: Kirill Tkhai <ktkhai@parallels.com>
CC: Konstantin Khorenko <khorenko@parallels.com>
CC: Ben Segall <bsegall@google.com>
CC: Paul Turner <pjt@google.com>
CC: Mike Galbraith <umgwanakikbuti@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1403684388.3462.43.camel@tkhai
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/rt.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index a490831..671a8b5 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -740,6 +740,9 @@ balanced:
 		rt_rq->rt_throttled = 0;
 		raw_spin_unlock(&rt_rq->rt_runtime_lock);
 		raw_spin_unlock(&rt_b->rt_runtime_lock);
+
+		/* Make rt_rq available for pick_next_task() */
+		sched_rt_rq_enqueue(rt_rq);
 	}
 }
 


* [tip:sched/core] sched: Rework check_for_tasks()
  2014-06-25  8:19 ` [PATCH v3 3/3] sched: Rework check_for_tasks() Kirill Tkhai
  2014-06-26 11:30   ` Srikar Dronamraju
@ 2014-07-05 10:46   ` tip-bot for Kirill Tkhai
  1 sibling, 0 replies; 10+ messages in thread
From: tip-bot for Kirill Tkhai @ 2014-07-05 10:46 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: ktkhai, mingo, torvalds, peterz, fabf, todd.e.brandt, akpm,
	wangyun, tglx, linux-kernel, hpa, bsegall, pjt, srivatsa.bhat,
	rafael.j.wysocki, toshi.kani, khorenko, umgwanakikbuti, ego,
	srikar, paul.gortmaker

Commit-ID:  b728ca06029d085a1585c1926610f26de93b9146
Gitweb:     http://git.kernel.org/tip/b728ca06029d085a1585c1926610f26de93b9146
Author:     Kirill Tkhai <ktkhai@parallels.com>
AuthorDate: Wed, 25 Jun 2014 12:19:55 +0400
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Sat, 5 Jul 2014 11:17:45 +0200

sched: Rework check_for_tasks()

1) Iterate through all threads in the system.
   Check all threads, not only group leaders.

2) Check p->on_rq instead of p->state and cputime.
   A preempted task in a !TASK_RUNNING state OR a just-created
   task may be queued, and we want those to be reported too.

3) Use read_lock() instead of write_lock().
   This function does not change any structures, so
   read_lock() is enough.

Signed-off-by: Kirill Tkhai <ktkhai@parallels.com>
Reviewed-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Ben Segall <bsegall@google.com>
Cc: Fabian Frederick <fabf@skynet.be>
Cc: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
Cc: Konstantin Khorenko <khorenko@parallels.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael wang <wangyun@linux.vnet.ibm.com>
Cc: Mike Galbraith <umgwanakikbuti@gmail.com>
Cc: Paul Gortmaker <paul.gortmaker@windriver.com>
Cc: Paul Turner <pjt@google.com>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
Cc: Todd E Brandt <todd.e.brandt@linux.intel.com>
Cc: Toshi Kani <toshi.kani@hp.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1403684395.3462.44.camel@tkhai
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/cpu.c | 33 ++++++++++++++++++++-------------
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/kernel/cpu.c b/kernel/cpu.c
index a343bde..81e2a38 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -274,21 +274,28 @@ void clear_tasks_mm_cpumask(int cpu)
 	rcu_read_unlock();
 }
 
-static inline void check_for_tasks(int cpu)
+static inline void check_for_tasks(int dead_cpu)
 {
-	struct task_struct *p;
-	cputime_t utime, stime;
+	struct task_struct *g, *p;
 
-	write_lock_irq(&tasklist_lock);
-	for_each_process(p) {
-		task_cputime(p, &utime, &stime);
-		if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
-		    (utime || stime))
-			pr_warn("Task %s (pid = %d) is on cpu %d (state = %ld, flags = %x)\n",
-				p->comm, task_pid_nr(p), cpu,
-				p->state, p->flags);
-	}
-	write_unlock_irq(&tasklist_lock);
+	read_lock_irq(&tasklist_lock);
+	do_each_thread(g, p) {
+		if (!p->on_rq)
+			continue;
+		/*
+		 * We do the check with unlocked task_rq(p)->lock.
+		 * Order the reading to do not warn about a task,
+		 * which was running on this cpu in the past, and
+		 * it's just been woken on another cpu.
+		 */
+		rmb();
+		if (task_cpu(p) != dead_cpu)
+			continue;
+
+		pr_warn("Task %s (pid=%d) is on cpu %d (state=%ld, flags=%x)\n",
+			p->comm, task_pid_nr(p), dead_cpu, p->state, p->flags);
+	} while_each_thread(g, p);
+	read_unlock_irq(&tasklist_lock);
 }
 
 struct take_cpu_down_param {

