* [PATCH v4 1/6] sched/fair: Fix reschedule which is generated on throttled cfs_rq
       [not found] <20140806075138.24858.23816.stgit@tkhai>
@ 2014-08-06  8:06 ` Kirill Tkhai
  2014-08-20  8:20   ` [tip:sched/core] " tip-bot for Kirill Tkhai
  2014-10-23 23:27   ` [PATCH v4 1/6] " Wanpeng Li
  2014-08-06  8:06 ` [PATCH v4 2/6] sched: Wrapper for checking task_struct::on_rq Kirill Tkhai
                   ` (4 subsequent siblings)
  5 siblings, 2 replies; 21+ messages in thread
From: Kirill Tkhai @ 2014-08-06  8:06 UTC (permalink / raw)
  To: linux-kernel
  Cc: peterz, pjt, oleg, rostedt, umgwanakikbuti, tkhai, tim.c.chen,
	mingo, nicolas.pitre


A (sched_entity::on_rq == 1) check does not guarantee that the task is
pickable: changes on a throttled cfs_rq must not lead to a reschedule.

Check for task_struct::on_rq instead.

Signed-off-by: Kirill Tkhai <ktkhai@parallels.com>
---
 kernel/sched/fair.c |    6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bfa3c86..6f0ce2b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7465,7 +7465,7 @@ static void task_fork_fair(struct task_struct *p)
 static void
 prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
 {
-	if (!p->se.on_rq)
+	if (!p->on_rq)
 		return;
 
 	/*
@@ -7521,15 +7521,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
  */
 static void switched_to_fair(struct rq *rq, struct task_struct *p)
 {
-	struct sched_entity *se = &p->se;
 #ifdef CONFIG_FAIR_GROUP_SCHED
+	struct sched_entity *se = &p->se;
 	/*
 	 * Since the real-depth could have been changed (only FAIR
 	 * class maintain depth value), reset depth properly.
 	 */
 	se->depth = se->parent ? se->parent->depth + 1 : 0;
 #endif
-	if (!se->on_rq)
+	if (!p->on_rq)
 		return;
 
 	/*





* [PATCH v4 2/6] sched: Wrapper for checking task_struct::on_rq
       [not found] <20140806075138.24858.23816.stgit@tkhai>
  2014-08-06  8:06 ` [PATCH v4 1/6] sched/fair: Fix reschedule which is generated on throttled cfs_rq Kirill Tkhai
@ 2014-08-06  8:06 ` Kirill Tkhai
  2014-08-20  7:52   ` Ingo Molnar
  2014-08-06  8:06 ` [PATCH v4 3/6] sched: Teach scheduler to understand ONRQ_MIGRATING state Kirill Tkhai
                   ` (3 subsequent siblings)
  5 siblings, 1 reply; 21+ messages in thread
From: Kirill Tkhai @ 2014-08-06  8:06 UTC (permalink / raw)
  To: linux-kernel
  Cc: peterz, pjt, oleg, rostedt, umgwanakikbuti, tkhai, tim.c.chen,
	mingo, nicolas.pitre


Implement task_queued() and use it everywhere instead of open-coded
checks of p->on_rq. No functional changes.

The only exception is check_for_tasks(), where we do not use the
wrapper because that would require exporting task_queued() in global
header files. The next patch in the series will bring it back, so it
does not matter.
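
For reference, the wrapper introduced below boils down to the following
sketch (condensed from the sched.h hunk of this patch; the inline
comments are mine):

	/* kernel/sched/sched.h (as added below) */
	#define ONRQ_QUEUED	1	/* task_struct::on_rq state */

	static inline int task_queued(struct task_struct *p)
	{
		return p->on_rq == ONRQ_QUEUED;
	}

Callers then test task_queued(p) rather than the open-coded p->on_rq,
which leaves room for additional ->on_rq states in the following
patches.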

Signed-off-by: Kirill Tkhai <ktkhai@parallels.com>
---
 kernel/sched/core.c      |   82 +++++++++++++++++++++++-----------------------
 kernel/sched/deadline.c  |   14 ++++----
 kernel/sched/fair.c      |   22 ++++++------
 kernel/sched/rt.c        |   16 ++++-----
 kernel/sched/sched.h     |    7 ++++
 kernel/sched/stop_task.c |    2 +
 6 files changed, 75 insertions(+), 68 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1211575..67e8d1e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1043,7 +1043,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
 	 * A queue event has occurred, and we're going to schedule.  In
 	 * this case, we can save a useless back to back clock update.
 	 */
-	if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
+	if (task_queued(rq->curr) && test_tsk_need_resched(rq->curr))
 		rq->skip_clock_update = 1;
 }
 
@@ -1088,7 +1088,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 
 static void __migrate_swap_task(struct task_struct *p, int cpu)
 {
-	if (p->on_rq) {
+	if (task_queued(p)) {
 		struct rq *src_rq, *dst_rq;
 
 		src_rq = task_rq(p);
@@ -1214,7 +1214,7 @@ static int migration_cpu_stop(void *data);
 unsigned long wait_task_inactive(struct task_struct *p, long match_state)
 {
 	unsigned long flags;
-	int running, on_rq;
+	int running, queued;
 	unsigned long ncsw;
 	struct rq *rq;
 
@@ -1252,7 +1252,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
 		rq = task_rq_lock(p, &flags);
 		trace_sched_wait_task(p);
 		running = task_running(rq, p);
-		on_rq = p->on_rq;
+		queued = task_queued(p);
 		ncsw = 0;
 		if (!match_state || p->state == match_state)
 			ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
@@ -1284,7 +1284,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
 		 * running right now), it's preempted, and we should
 		 * yield - it could be a while.
 		 */
-		if (unlikely(on_rq)) {
+		if (unlikely(queued)) {
 			ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
 
 			set_current_state(TASK_UNINTERRUPTIBLE);
@@ -1478,7 +1478,7 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
 static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
 {
 	activate_task(rq, p, en_flags);
-	p->on_rq = 1;
+	p->on_rq = ONRQ_QUEUED;
 
 	/* if a worker is waking up, notify workqueue */
 	if (p->flags & PF_WQ_WORKER)
@@ -1537,7 +1537,7 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
 	int ret = 0;
 
 	rq = __task_rq_lock(p);
-	if (p->on_rq) {
+	if (task_queued(p)) {
 		/* check_preempt_curr() may use rq clock */
 		update_rq_clock(rq);
 		ttwu_do_wakeup(rq, p, wake_flags);
@@ -1678,7 +1678,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	success = 1; /* we're going to change ->state */
 	cpu = task_cpu(p);
 
-	if (p->on_rq && ttwu_remote(p, wake_flags))
+	if (task_queued(p) && ttwu_remote(p, wake_flags))
 		goto stat;
 
 #ifdef CONFIG_SMP
@@ -1742,7 +1742,7 @@ static void try_to_wake_up_local(struct task_struct *p)
 	if (!(p->state & TASK_NORMAL))
 		goto out;
 
-	if (!p->on_rq)
+	if (!task_queued(p))
 		ttwu_activate(rq, p, ENQUEUE_WAKEUP);
 
 	ttwu_do_wakeup(rq, p, 0);
@@ -2095,7 +2095,7 @@ void wake_up_new_task(struct task_struct *p)
 	init_task_runnable_average(p);
 	rq = __task_rq_lock(p);
 	activate_task(rq, p, 0);
-	p->on_rq = 1;
+	p->on_rq = ONRQ_QUEUED;
 	trace_sched_wakeup_new(p, true);
 	check_preempt_curr(rq, p, WF_FORK);
 #ifdef CONFIG_SMP
@@ -2444,7 +2444,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
 	 * project cycles that may never be accounted to this
 	 * thread, breaking clock_gettime().
 	 */
-	if (task_current(rq, p) && p->on_rq) {
+	if (task_current(rq, p) && task_queued(p)) {
 		update_rq_clock(rq);
 		ns = rq_clock_task(rq) - p->se.exec_start;
 		if ((s64)ns < 0)
@@ -2490,7 +2490,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
 	 * If we see ->on_cpu without ->on_rq, the task is leaving, and has
 	 * been accounted, so we're correct here as well.
 	 */
-	if (!p->on_cpu || !p->on_rq)
+	if (!p->on_cpu || !task_queued(p))
 		return p->se.sum_exec_runtime;
 #endif
 
@@ -2794,7 +2794,7 @@ static void __sched __schedule(void)
 		switch_count = &prev->nvcsw;
 	}
 
-	if (prev->on_rq || rq->skip_clock_update < 0)
+	if (task_queued(prev) || rq->skip_clock_update < 0)
 		update_rq_clock(rq);
 
 	next = pick_next_task(rq, prev);
@@ -2959,7 +2959,7 @@ EXPORT_SYMBOL(default_wake_function);
  */
 void rt_mutex_setprio(struct task_struct *p, int prio)
 {
-	int oldprio, on_rq, running, enqueue_flag = 0;
+	int oldprio, queued, running, enqueue_flag = 0;
 	struct rq *rq;
 	const struct sched_class *prev_class;
 
@@ -2988,9 +2988,9 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 	trace_sched_pi_setprio(p, prio);
 	oldprio = p->prio;
 	prev_class = p->sched_class;
-	on_rq = p->on_rq;
+	queued = task_queued(p);
 	running = task_current(rq, p);
-	if (on_rq)
+	if (queued)
 		dequeue_task(rq, p, 0);
 	if (running)
 		p->sched_class->put_prev_task(rq, p);
@@ -3030,7 +3030,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 
 	if (running)
 		p->sched_class->set_curr_task(rq);
-	if (on_rq)
+	if (queued)
 		enqueue_task(rq, p, enqueue_flag);
 
 	check_class_changed(rq, p, prev_class, oldprio);
@@ -3041,7 +3041,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 
 void set_user_nice(struct task_struct *p, long nice)
 {
-	int old_prio, delta, on_rq;
+	int old_prio, delta, queued;
 	unsigned long flags;
 	struct rq *rq;
 
@@ -3062,8 +3062,8 @@ void set_user_nice(struct task_struct *p, long nice)
 		p->static_prio = NICE_TO_PRIO(nice);
 		goto out_unlock;
 	}
-	on_rq = p->on_rq;
-	if (on_rq)
+	queued = task_queued(p);
+	if (queued)
 		dequeue_task(rq, p, 0);
 
 	p->static_prio = NICE_TO_PRIO(nice);
@@ -3072,7 +3072,7 @@ void set_user_nice(struct task_struct *p, long nice)
 	p->prio = effective_prio(p);
 	delta = p->prio - old_prio;
 
-	if (on_rq) {
+	if (queued) {
 		enqueue_task(rq, p, 0);
 		/*
 		 * If the task increased its priority or is running and
@@ -3344,7 +3344,7 @@ static int __sched_setscheduler(struct task_struct *p,
 {
 	int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
 		      MAX_RT_PRIO - 1 - attr->sched_priority;
-	int retval, oldprio, oldpolicy = -1, on_rq, running;
+	int retval, oldprio, oldpolicy = -1, queued, running;
 	int policy = attr->sched_policy;
 	unsigned long flags;
 	const struct sched_class *prev_class;
@@ -3541,9 +3541,9 @@ static int __sched_setscheduler(struct task_struct *p,
 		return 0;
 	}
 
-	on_rq = p->on_rq;
+	queued = task_queued(p);
 	running = task_current(rq, p);
-	if (on_rq)
+	if (queued)
 		dequeue_task(rq, p, 0);
 	if (running)
 		p->sched_class->put_prev_task(rq, p);
@@ -3553,7 +3553,7 @@ static int __sched_setscheduler(struct task_struct *p,
 
 	if (running)
 		p->sched_class->set_curr_task(rq);
-	if (on_rq) {
+	if (queued) {
 		/*
 		 * We enqueue to tail when the priority of a task is
 		 * increased (user space view).
@@ -4568,7 +4568,7 @@ void init_idle(struct task_struct *idle, int cpu)
 	rcu_read_unlock();
 
 	rq->curr = rq->idle = idle;
-	idle->on_rq = 1;
+	idle->on_rq = ONRQ_QUEUED;
 #if defined(CONFIG_SMP)
 	idle->on_cpu = 1;
 #endif
@@ -4645,7 +4645,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
 		goto out;
 
 	dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
-	if (p->on_rq) {
+	if (task_queued(p)) {
 		struct migration_arg arg = { p, dest_cpu };
 		/* Need help from migration thread: drop lock and wait. */
 		task_rq_unlock(rq, p, &flags);
@@ -4695,7 +4695,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
 	 * If we're not on a rq, the next wake-up will ensure we're
 	 * placed properly.
 	 */
-	if (p->on_rq) {
+	if (task_queued(p)) {
 		dequeue_task(rq_src, p, 0);
 		set_task_cpu(p, dest_cpu);
 		enqueue_task(rq_dest, p, 0);
@@ -4736,13 +4736,13 @@ void sched_setnuma(struct task_struct *p, int nid)
 {
 	struct rq *rq;
 	unsigned long flags;
-	bool on_rq, running;
+	bool queued, running;
 
 	rq = task_rq_lock(p, &flags);
-	on_rq = p->on_rq;
+	queued = task_queued(p);
 	running = task_current(rq, p);
 
-	if (on_rq)
+	if (queued)
 		dequeue_task(rq, p, 0);
 	if (running)
 		p->sched_class->put_prev_task(rq, p);
@@ -4751,7 +4751,7 @@ void sched_setnuma(struct task_struct *p, int nid)
 
 	if (running)
 		p->sched_class->set_curr_task(rq);
-	if (on_rq)
+	if (queued)
 		enqueue_task(rq, p, 0);
 	task_rq_unlock(rq, p, &flags);
 }
@@ -7117,13 +7117,13 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
 		.sched_policy = SCHED_NORMAL,
 	};
 	int old_prio = p->prio;
-	int on_rq;
+	int queued;
 
-	on_rq = p->on_rq;
-	if (on_rq)
+	queued = task_queued(p);
+	if (queued)
 		dequeue_task(rq, p, 0);
 	__setscheduler(rq, p, &attr);
-	if (on_rq) {
+	if (queued) {
 		enqueue_task(rq, p, 0);
 		resched_curr(rq);
 	}
@@ -7311,16 +7311,16 @@ void sched_offline_group(struct task_group *tg)
 void sched_move_task(struct task_struct *tsk)
 {
 	struct task_group *tg;
-	int on_rq, running;
+	int queued, running;
 	unsigned long flags;
 	struct rq *rq;
 
 	rq = task_rq_lock(tsk, &flags);
 
 	running = task_current(rq, tsk);
-	on_rq = tsk->on_rq;
+	queued = task_queued(tsk);
 
-	if (on_rq)
+	if (queued)
 		dequeue_task(rq, tsk, 0);
 	if (unlikely(running))
 		tsk->sched_class->put_prev_task(rq, tsk);
@@ -7333,14 +7333,14 @@ void sched_move_task(struct task_struct *tsk)
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	if (tsk->sched_class->task_move_group)
-		tsk->sched_class->task_move_group(tsk, on_rq);
+		tsk->sched_class->task_move_group(tsk, queued);
 	else
 #endif
 		set_task_rq(tsk, task_cpu(tsk));
 
 	if (unlikely(running))
 		tsk->sched_class->set_curr_task(rq);
-	if (on_rq)
+	if (queued)
 		enqueue_task(rq, tsk, 0);
 
 	task_rq_unlock(rq, tsk, &flags);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 255ce13..4cc3b14 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -530,7 +530,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
 	update_rq_clock(rq);
 	dl_se->dl_throttled = 0;
 	dl_se->dl_yielded = 0;
-	if (p->on_rq) {
+	if (task_queued(p)) {
 		enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
 		if (task_has_dl_policy(rq->curr))
 			check_preempt_curr_dl(rq, p, 0);
@@ -1030,7 +1030,7 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
 		 * means a stop task can slip in, in which case we need to
 		 * re-start task selection.
 		 */
-		if (rq->stop && rq->stop->on_rq)
+		if (rq->stop && task_queued(rq->stop))
 			return RETRY_TASK;
 	}
 
@@ -1257,7 +1257,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
 			if (unlikely(task_rq(task) != rq ||
 				     !cpumask_test_cpu(later_rq->cpu,
 				                       &task->cpus_allowed) ||
-				     task_running(rq, task) || !task->on_rq)) {
+				     task_running(rq, task) || !task_queued(task))) {
 				double_unlock_balance(rq, later_rq);
 				later_rq = NULL;
 				break;
@@ -1296,7 +1296,7 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
 	BUG_ON(task_current(rq, p));
 	BUG_ON(p->nr_cpus_allowed <= 1);
 
-	BUG_ON(!p->on_rq);
+	BUG_ON(!task_queued(p));
 	BUG_ON(!dl_task(p));
 
 	return p;
@@ -1443,7 +1443,7 @@ static int pull_dl_task(struct rq *this_rq)
 		     dl_time_before(p->dl.deadline,
 				    this_rq->dl.earliest_dl.curr))) {
 			WARN_ON(p == src_rq->curr);
-			WARN_ON(!p->on_rq);
+			WARN_ON(!task_queued(p));
 
 			/*
 			 * Then we pull iff p has actually an earlier
@@ -1596,7 +1596,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
 	if (unlikely(p->dl.dl_throttled))
 		return;
 
-	if (p->on_rq && rq->curr != p) {
+	if (task_queued(p) && rq->curr != p) {
 #ifdef CONFIG_SMP
 		if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p))
 			/* Only reschedule if pushing failed */
@@ -1614,7 +1614,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
 static void prio_changed_dl(struct rq *rq, struct task_struct *p,
 			    int oldprio)
 {
-	if (p->on_rq || rq->curr == p) {
+	if (task_queued(p) || rq->curr == p) {
 #ifdef CONFIG_SMP
 		/*
 		 * This might be too much, but unfortunately
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6f0ce2b..d54b72c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7465,7 +7465,7 @@ static void task_fork_fair(struct task_struct *p)
 static void
 prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
 {
-	if (!p->on_rq)
+	if (!task_queued(p))
 		return;
 
 	/*
@@ -7490,11 +7490,11 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
 	 * switched back to the fair class the enqueue_entity(.flags=0) will
 	 * do the right thing.
 	 *
-	 * If it's on_rq, then the dequeue_entity(.flags=0) will already
-	 * have normalized the vruntime, if it's !on_rq, then only when
+	 * If it's queued, then the dequeue_entity(.flags=0) will already
+	 * have normalized the vruntime, if it's !queued, then only when
 	 * the task is sleeping will it still have non-normalized vruntime.
 	 */
-	if (!p->on_rq && p->state != TASK_RUNNING) {
+	if (!task_queued(p) && p->state != TASK_RUNNING) {
 		/*
 		 * Fix up our vruntime so that the current sleep doesn't
 		 * cause 'unlimited' sleep bonus.
@@ -7529,7 +7529,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
 	 */
 	se->depth = se->parent ? se->parent->depth + 1 : 0;
 #endif
-	if (!p->on_rq)
+	if (!task_queued(p))
 		return;
 
 	/*
@@ -7575,7 +7575,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-static void task_move_group_fair(struct task_struct *p, int on_rq)
+static void task_move_group_fair(struct task_struct *p, int queued)
 {
 	struct sched_entity *se = &p->se;
 	struct cfs_rq *cfs_rq;
@@ -7594,7 +7594,7 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
 	 * fair sleeper stuff for the first placement, but who cares.
 	 */
 	/*
-	 * When !on_rq, vruntime of the task has usually NOT been normalized.
+	 * When !queued, vruntime of the task has usually NOT been normalized.
 	 * But there are some cases where it has already been normalized:
 	 *
 	 * - Moving a forked child which is waiting for being woken up by
@@ -7605,14 +7605,14 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
 	 * To prevent boost or penalty in the new cfs_rq caused by delta
 	 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
 	 */
-	if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING))
-		on_rq = 1;
+	if (!queued && (!se->sum_exec_runtime || p->state == TASK_WAKING))
+		queued = 1;
 
-	if (!on_rq)
+	if (!queued)
 		se->vruntime -= cfs_rq_of(se)->min_vruntime;
 	set_task_rq(p, task_cpu(p));
 	se->depth = se->parent ? se->parent->depth + 1 : 0;
-	if (!on_rq) {
+	if (!queued) {
 		cfs_rq = cfs_rq_of(se);
 		se->vruntime += cfs_rq->min_vruntime;
 #ifdef CONFIG_SMP
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 5f6edca..9395320 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1448,7 +1448,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
 		 * means a dl or stop task can slip in, in which case we need
 		 * to re-start task selection.
 		 */
-		if (unlikely((rq->stop && rq->stop->on_rq) ||
+		if (unlikely((rq->stop && task_queued(rq->stop)) ||
 			     rq->dl.dl_nr_running))
 			return RETRY_TASK;
 	}
@@ -1624,7 +1624,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
 				     !cpumask_test_cpu(lowest_rq->cpu,
 						       tsk_cpus_allowed(task)) ||
 				     task_running(rq, task) ||
-				     !task->on_rq)) {
+				     !task_queued(task))) {
 
 				double_unlock_balance(rq, lowest_rq);
 				lowest_rq = NULL;
@@ -1658,7 +1658,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
 	BUG_ON(task_current(rq, p));
 	BUG_ON(p->nr_cpus_allowed <= 1);
 
-	BUG_ON(!p->on_rq);
+	BUG_ON(!task_queued(p));
 	BUG_ON(!rt_task(p));
 
 	return p;
@@ -1809,7 +1809,7 @@ static int pull_rt_task(struct rq *this_rq)
 		 */
 		if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
 			WARN_ON(p == src_rq->curr);
-			WARN_ON(!p->on_rq);
+			WARN_ON(!task_queued(p));
 
 			/*
 			 * There's a chance that p is higher in priority
@@ -1870,7 +1870,7 @@ static void set_cpus_allowed_rt(struct task_struct *p,
 
 	BUG_ON(!rt_task(p));
 
-	if (!p->on_rq)
+	if (!task_queued(p))
 		return;
 
 	weight = cpumask_weight(new_mask);
@@ -1936,7 +1936,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
 	 * we may need to handle the pulling of RT tasks
 	 * now.
 	 */
-	if (!p->on_rq || rq->rt.rt_nr_running)
+	if (!task_queued(p) || rq->rt.rt_nr_running)
 		return;
 
 	if (pull_rt_task(rq))
@@ -1970,7 +1970,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
 	 * If that current running task is also an RT task
 	 * then see if we can move to another run queue.
 	 */
-	if (p->on_rq && rq->curr != p) {
+	if (task_queued(p) && rq->curr != p) {
 #ifdef CONFIG_SMP
 		if (p->nr_cpus_allowed > 1 && rq->rt.overloaded &&
 		    /* Don't resched if we changed runqueues */
@@ -1989,7 +1989,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
 static void
 prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
 {
-	if (!p->on_rq)
+	if (!task_queued(p))
 		return;
 
 	if (rq->curr == p) {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 579712f..2c83b6e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -15,6 +15,9 @@
 
 struct rq;
 
+/* task_struct::on_rq states: */
+#define ONRQ_QUEUED	1
+
 extern __read_mostly int scheduler_running;
 
 extern unsigned long calc_load_update;
@@ -942,6 +945,10 @@ static inline int task_running(struct rq *rq, struct task_struct *p)
 #endif
 }
 
+static inline int task_queued(struct task_struct *p)
+{
+	return p->on_rq == ONRQ_QUEUED;
+}
 
 #ifndef prepare_arch_switch
 # define prepare_arch_switch(next)	do { } while (0)
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index bfe0eda..1a4bb0f 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -28,7 +28,7 @@ pick_next_task_stop(struct rq *rq, struct task_struct *prev)
 {
 	struct task_struct *stop = rq->stop;
 
-	if (!stop || !stop->on_rq)
+	if (!stop || !task_queued(stop))
 		return NULL;
 
 	put_prev_task(rq, prev);





* [PATCH v4 3/6] sched: Teach scheduler to understand ONRQ_MIGRATING state
       [not found] <20140806075138.24858.23816.stgit@tkhai>
  2014-08-06  8:06 ` [PATCH v4 1/6] sched/fair: Fix reschedule which is generated on throttled cfs_rq Kirill Tkhai
  2014-08-06  8:06 ` [PATCH v4 2/6] sched: Wrapper for checking task_struct::on_rq Kirill Tkhai
@ 2014-08-06  8:06 ` Kirill Tkhai
  2014-08-12  7:55   ` Peter Zijlstra
  2014-08-06  8:06 ` [PATCH v4 4/6] sched: Remove double_rq_lock() from __migrate_task() Kirill Tkhai
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 21+ messages in thread
From: Kirill Tkhai @ 2014-08-06  8:06 UTC (permalink / raw)
  To: linux-kernel
  Cc: peterz, pjt, oleg, rostedt, umgwanakikbuti, tkhai, tim.c.chen,
	mingo, nicolas.pitre


This is a new state, which will be used to indicate that a task is in
the process of migrating between two RQs. It allows us to get rid of
double_rq_lock(), which we previously used to change the rq of a
queued task.

Consider an example. To move a task between src_rq and dst_rq we do
the following:

	raw_spin_lock(&src_rq->lock);
	/* p is a task which is queued on src_rq */
	p = ...;

	dequeue_task(src_rq, p, 0);
	p->on_rq = ONRQ_MIGRATING;
	set_task_cpu(p, dst_cpu);
	raw_spin_unlock(&src_rq->lock);

	/*
	 * Both RQs are unlocked here.
	 * Task p is dequeued from src_rq,
	 * but its on_rq is not zero.
	 */

	raw_spin_lock(&dst_rq->lock);
	p->on_rq = ONRQ_QUEUED;
	enqueue_task(dst_rq, p, 0);
	raw_spin_unlock(&dst_rq->lock);

While p->on_rq is ONRQ_MIGRATING, the task is considered "migrating",
and other scheduler actions on it are not available to parallel
callers. A parallel caller spins until the migration is completed.

The unavailable actions are changing of cpu affinity, changing of
priority etc; in other words, all the functionality which used to
require task_rq(p)->lock before (and which is related to the task).

To implement ONRQ_MIGRATING support we primarily rely on the following
fact: most scheduler users (from which we are protecting a migrating
task) use task_rq_lock() and __task_rq_lock() to get the lock of
task_rq(p). These primitives already know that the task's cpu may
change, and they spin until they hold the lock of the right RQ. We add
one more condition to them, so they also spin until the migration is
finished.
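
As a usage illustration (a hypothetical caller sketch, not taken from
this patch), any code that modifies a task under task_rq_lock() is now
serialized against the migration window:

	/* hypothetical caller of task_rq_lock(): */
	unsigned long flags;
	struct rq *rq;

	/*
	 * task_rq_lock() also spins while p->on_rq == ONRQ_MIGRATING,
	 * so when it returns, p is either fully queued on rq or not
	 * queued at all; it is never observed "half moved".
	 */
	rq = task_rq_lock(p, &flags);
	/* ... dequeue/modify/enqueue p as usual ... */
	task_rq_unlock(rq, p, &flags);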

Signed-off-by: Kirill Tkhai <ktkhai@parallels.com>
---
 kernel/sched/core.c  |   14 +++++++++++---
 kernel/sched/sched.h |    6 ++++++
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 67e8d1e..1cf5109 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -331,9 +331,13 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
 	lockdep_assert_held(&p->pi_lock);
 
 	for (;;) {
+		while (unlikely(task_migrating(p)))
+			cpu_relax();
+
 		rq = task_rq(p);
 		raw_spin_lock(&rq->lock);
-		if (likely(rq == task_rq(p)))
+		if (likely(rq == task_rq(p) &&
+			   !task_migrating(p)))
 			return rq;
 		raw_spin_unlock(&rq->lock);
 	}
@@ -349,10 +353,14 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
 	struct rq *rq;
 
 	for (;;) {
+		while (unlikely(task_migrating(p)))
+			cpu_relax();
+
 		raw_spin_lock_irqsave(&p->pi_lock, *flags);
 		rq = task_rq(p);
 		raw_spin_lock(&rq->lock);
-		if (likely(rq == task_rq(p)))
+		if (likely(rq == task_rq(p) &&
+			   !task_migrating(p)))
 			return rq;
 		raw_spin_unlock(&rq->lock);
 		raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
@@ -1678,7 +1686,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	success = 1; /* we're going to change ->state */
 	cpu = task_cpu(p);
 
-	if (task_queued(p) && ttwu_remote(p, wake_flags))
+	if (p->on_rq && ttwu_remote(p, wake_flags))
 		goto stat;
 
 #ifdef CONFIG_SMP
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 2c83b6e..ac7c1c8 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -17,6 +17,7 @@ struct rq;
 
 /* task_struct::on_rq states: */
 #define ONRQ_QUEUED	1
+#define ONRQ_MIGRATING	2
 
 extern __read_mostly int scheduler_running;
 
@@ -950,6 +951,11 @@ static inline int task_queued(struct task_struct *p)
 	return p->on_rq == ONRQ_QUEUED;
 }
 
+static inline int task_migrating(struct task_struct *p)
+{
+	return p->on_rq == ONRQ_MIGRATING;
+}
+
 #ifndef prepare_arch_switch
 # define prepare_arch_switch(next)	do { } while (0)
 #endif





* [PATCH v4 4/6] sched: Remove double_rq_lock() from __migrate_task()
       [not found] <20140806075138.24858.23816.stgit@tkhai>
                   ` (2 preceding siblings ...)
  2014-08-06  8:06 ` [PATCH v4 3/6] sched: Teach scheduler to understand ONRQ_MIGRATING state Kirill Tkhai
@ 2014-08-06  8:06 ` Kirill Tkhai
  2014-08-12  8:21   ` Peter Zijlstra
  2014-08-06  8:06 ` [PATCH v4 5/6] sched/fair: Remove double_lock_balance() from active_load_balance_cpu_stop() Kirill Tkhai
  2014-08-06  8:07 ` [PATCH v4 6/6] sched/fair: Remove double_lock_balance() from load_balance() Kirill Tkhai
  5 siblings, 1 reply; 21+ messages in thread
From: Kirill Tkhai @ 2014-08-06  8:06 UTC (permalink / raw)
  To: linux-kernel
  Cc: peterz, pjt, oleg, rostedt, umgwanakikbuti, tkhai, tim.c.chen,
	mingo, nicolas.pitre


Let's use ONRQ_MIGRATING instead.

Signed-off-by: Kirill Tkhai <ktkhai@parallels.com>
---
 kernel/sched/core.c |   23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1cf5109..05687ce 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4681,20 +4681,20 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
  */
 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
 {
-	struct rq *rq_dest, *rq_src;
+	struct rq *rq;
 	int ret = 0;
 
 	if (unlikely(!cpu_active(dest_cpu)))
 		return ret;
 
-	rq_src = cpu_rq(src_cpu);
-	rq_dest = cpu_rq(dest_cpu);
+	rq = cpu_rq(src_cpu);
 
 	raw_spin_lock(&p->pi_lock);
-	double_rq_lock(rq_src, rq_dest);
+	raw_spin_lock(&rq->lock);
 	/* Already moved. */
 	if (task_cpu(p) != src_cpu)
 		goto done;
+
 	/* Affinity changed (again). */
 	if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
 		goto fail;
@@ -4704,15 +4704,22 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
 	 * placed properly.
 	 */
 	if (task_queued(p)) {
-		dequeue_task(rq_src, p, 0);
+		dequeue_task(rq, p, 0);
+		p->on_rq = ONRQ_MIGRATING;
 		set_task_cpu(p, dest_cpu);
-		enqueue_task(rq_dest, p, 0);
-		check_preempt_curr(rq_dest, p, 0);
+		raw_spin_unlock(&rq->lock);
+
+		rq = cpu_rq(dest_cpu);
+		raw_spin_lock(&rq->lock);
+		BUG_ON(task_rq(p) != rq);
+		p->on_rq = ONRQ_QUEUED;
+		enqueue_task(rq, p, 0);
+		check_preempt_curr(rq, p, 0);
 	}
 done:
 	ret = 1;
 fail:
-	double_rq_unlock(rq_src, rq_dest);
+	raw_spin_unlock(&rq->lock);
 	raw_spin_unlock(&p->pi_lock);
 	return ret;
 }





* [PATCH v4 5/6] sched/fair: Remove double_lock_balance() from active_load_balance_cpu_stop()
       [not found] <20140806075138.24858.23816.stgit@tkhai>
                   ` (3 preceding siblings ...)
  2014-08-06  8:06 ` [PATCH v4 4/6] sched: Remove double_rq_lock() from __migrate_task() Kirill Tkhai
@ 2014-08-06  8:06 ` Kirill Tkhai
  2014-08-12  9:03   ` Peter Zijlstra
  2014-08-12  9:22   ` Peter Zijlstra
  2014-08-06  8:07 ` [PATCH v4 6/6] sched/fair: Remove double_lock_balance() from load_balance() Kirill Tkhai
  5 siblings, 2 replies; 21+ messages in thread
From: Kirill Tkhai @ 2014-08-06  8:06 UTC (permalink / raw)
  To: linux-kernel
  Cc: peterz, pjt, oleg, rostedt, umgwanakikbuti, tkhai, tim.c.chen,
	mingo, nicolas.pitre


Bad situation:

double_lock_balance() drops the busiest_rq lock. The busiest_rq is the
*busiest* one: there are a lot of tasks and context switches on it, so
we drop the lock and then have to wait for it again.

Let's just detach the task and unlock busiest_rq only once, at the end!

Warning: this allows unlocked use of can_migrate_task(),
throttled_lb_pair(), and task_hot(). I've added lockdep asserts to
point this out.
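
In sketch form, the resulting locking in active_load_balance_cpu_stop()
is (condensed from the hunks below; comments are mine):

	raw_spin_lock_irq(&busiest_rq->lock);
	/* ... find the sched_domain, set up env ... */
	p = detach_one_task(&env);	/* deactivate + mark ONRQ_MIGRATING */
	busiest_rq->active_balance = 0;
	raw_spin_unlock(&busiest_rq->lock);

	if (p) {			/* only then take the second lock */
		raw_spin_lock(&target_rq->lock);
		BUG_ON(task_rq(p) != target_rq);
		p->on_rq = ONRQ_QUEUED;
		activate_task(target_rq, p, 0);
		check_preempt_curr(target_rq, p, 0);
		raw_spin_unlock(&target_rq->lock);
	}

	local_irq_enable();

So target_rq->lock is taken only when there actually is a task to move,
and never while busiest_rq->lock is held.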

Signed-off-by: Kirill Tkhai <ktkhai@parallels.com>
---
 kernel/sched/fair.c |   55 +++++++++++++++++++++++++++++++++++----------------
 1 file changed, 38 insertions(+), 17 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d54b72c..cfeafb1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3297,6 +3297,8 @@ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
  * Ensure that neither of the group entities corresponding to src_cpu or
  * dest_cpu are members of a throttled hierarchy when performing group
  * load-balance operations.
+ *
+ * Note: RQs may be unlocked.
  */
 static inline int throttled_lb_pair(struct task_group *tg,
 				    int src_cpu, int dest_cpu)
@@ -5133,6 +5135,8 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
 {
 	s64 delta;
 
+	lockdep_assert_held(&env->src_rq->lock);
+
 	if (p->sched_class != &fair_sched_class)
 		return 0;
 
@@ -5252,6 +5256,9 @@ static
 int can_migrate_task(struct task_struct *p, struct lb_env *env)
 {
 	int tsk_cache_hot = 0;
+
+	lockdep_assert_held(&env->src_rq->lock);
+
 	/*
 	 * We do not migrate tasks that are:
 	 * 1) throttled_lb_pair, or
@@ -5336,30 +5343,34 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 }
 
 /*
- * move_one_task tries to move exactly one task from busiest to this_rq, as
+ * detach_one_task tries to dequeue exactly one task from env->src_rq, as
  * part of active balancing operations within "domain".
- * Returns 1 if successful and 0 otherwise.
- *
- * Called with both runqueues locked.
+ * Returns a task if successful and NULL otherwise.
  */
-static int move_one_task(struct lb_env *env)
+static struct task_struct *detach_one_task(struct lb_env *env)
 {
 	struct task_struct *p, *n;
 
+	lockdep_assert_held(&env->src_rq->lock);
+
 	list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
 		if (!can_migrate_task(p, env))
 			continue;
 
-		move_task(p, env);
+		deactivate_task(env->src_rq, p, 0);
+		p->on_rq = ONRQ_MIGRATING;
+		set_task_cpu(p, env->dst_cpu);
+
 		/*
-		 * Right now, this is only the second place move_task()
-		 * is called, so we can safely collect move_task()
-		 * stats here rather than inside move_task().
+		 * Right now, this is only the second place where
+		 * lb_gained[env->idle] is updated (other is move_tasks)
+		 * so we can safely collect stats here rather than
+		 * inside move_tasks().
 		 */
 		schedstat_inc(env->sd, lb_gained[env->idle]);
-		return 1;
+		return p;
 	}
-	return 0;
+	return NULL;
 }
 
 static const unsigned int sched_nr_migrate_break = 32;
@@ -6914,6 +6925,7 @@ static int active_load_balance_cpu_stop(void *data)
 	int target_cpu = busiest_rq->push_cpu;
 	struct rq *target_rq = cpu_rq(target_cpu);
 	struct sched_domain *sd;
+	struct task_struct *p = NULL;
 
 	raw_spin_lock_irq(&busiest_rq->lock);
 
@@ -6933,9 +6945,6 @@ static int active_load_balance_cpu_stop(void *data)
 	 */
 	BUG_ON(busiest_rq == target_rq);
 
-	/* move a task from busiest_rq to target_rq */
-	double_lock_balance(busiest_rq, target_rq);
-
 	/* Search for an sd spanning us and the target CPU. */
 	rcu_read_lock();
 	for_each_domain(target_cpu, sd) {
@@ -6956,16 +6965,28 @@ static int active_load_balance_cpu_stop(void *data)
 
 		schedstat_inc(sd, alb_count);
 
-		if (move_one_task(&env))
+		p = detach_one_task(&env);
+		if (p)
 			schedstat_inc(sd, alb_pushed);
 		else
 			schedstat_inc(sd, alb_failed);
 	}
 	rcu_read_unlock();
-	double_unlock_balance(busiest_rq, target_rq);
 out_unlock:
 	busiest_rq->active_balance = 0;
-	raw_spin_unlock_irq(&busiest_rq->lock);
+	raw_spin_unlock(&busiest_rq->lock);
+
+	if (p) {
+		raw_spin_lock(&target_rq->lock);
+		BUG_ON(task_rq(p) != target_rq);
+		p->on_rq = ONRQ_QUEUED;
+		activate_task(target_rq, p, 0);
+		check_preempt_curr(target_rq, p, 0);
+		raw_spin_unlock(&target_rq->lock);
+	}
+
+	local_irq_enable();
+
 	return 0;
 }
 





* [PATCH v4 6/6] sched/fair: Remove double_lock_balance() from load_balance()
       [not found] <20140806075138.24858.23816.stgit@tkhai>
                   ` (4 preceding siblings ...)
  2014-08-06  8:06 ` [PATCH v4 5/6] sched/fair: Remove double_lock_balance() from active_load_balance_cpu_stop() Kirill Tkhai
@ 2014-08-06  8:07 ` Kirill Tkhai
  2014-08-12  9:36   ` Peter Zijlstra
  5 siblings, 1 reply; 21+ messages in thread
From: Kirill Tkhai @ 2014-08-06  8:07 UTC (permalink / raw)
  To: linux-kernel
  Cc: peterz, pjt, oleg, rostedt, umgwanakikbuti, tkhai, tim.c.chen,
	mingo, nicolas.pitre


Instead, keep on_rq = ONRQ_MIGRATING while the task is migrating.

Signed-off-by: Kirill Tkhai <ktkhai@parallels.com>
---
 kernel/sched/fair.c |   99 ++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 70 insertions(+), 29 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index cfeafb1..ed276e6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4706,7 +4706,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 		return;
 
 	/*
-	 * This is possible from callers such as move_task(), in which we
+	 * This is possible from callers such as attach_tasks(), in which we
 	 * unconditionally check_prempt_curr() after an enqueue (which may have
 	 * lead to a throttle).  This both saves work and prevents false
 	 * next-buddy nomination below.
@@ -5114,18 +5114,20 @@ struct lb_env {
 	unsigned int		loop_max;
 
 	enum fbq_type		fbq_type;
+	struct list_head	tasks;
 };
 
 /*
- * move_task - move a task from one runqueue to another runqueue.
- * Both runqueues must be locked.
+ * detach_task - detach the task for the migration specified in env
  */
-static void move_task(struct task_struct *p, struct lb_env *env)
+static void detach_task(struct task_struct *p, struct lb_env *env)
 {
+	lockdep_assert_held(&env->src_rq->lock);
+
 	deactivate_task(env->src_rq, p, 0);
+	list_add(&p->se.group_node, &env->tasks);
+	p->on_rq = ONRQ_MIGRATING;
 	set_task_cpu(p, env->dst_cpu);
-	activate_task(env->dst_rq, p, 0);
-	check_preempt_curr(env->dst_rq, p, 0);
 }
 
 /*
@@ -5363,9 +5365,9 @@ static struct task_struct *detach_one_task(struct lb_env *env)
 
 		/*
 		 * Right now, this is only the second place where
-		 * lb_gained[env->idle] is updated (other is move_tasks)
+		 * lb_gained[env->idle] is updated (other is detach_tasks)
 		 * so we can safely collect stats here rather than
-		 * inside move_tasks().
+		 * inside detach_tasks().
 		 */
 		schedstat_inc(env->sd, lb_gained[env->idle]);
 		return p;
@@ -5376,18 +5378,18 @@ static struct task_struct *detach_one_task(struct lb_env *env)
 static const unsigned int sched_nr_migrate_break = 32;
 
 /*
- * move_tasks tries to move up to imbalance weighted load from busiest to
- * this_rq, as part of a balancing operation within domain "sd".
- * Returns 1 if successful and 0 otherwise.
- *
- * Called with both runqueues locked.
+ * detach_tasks tries to detach up to imbalance weighted load from busiest_rq,
+ * as part of a balancing operation within domain "sd".
+ * Returns number of detached tasks if successful and 0 otherwise.
  */
-static int move_tasks(struct lb_env *env)
+static int detach_tasks(struct lb_env *env)
 {
 	struct list_head *tasks = &env->src_rq->cfs_tasks;
 	struct task_struct *p;
 	unsigned long load;
-	int pulled = 0;
+	int detached = 0;
+
+	lockdep_assert_held(&env->src_rq->lock);
 
 	if (env->imbalance <= 0)
 		return 0;
@@ -5418,14 +5420,15 @@ static int move_tasks(struct lb_env *env)
 		if ((load / 2) > env->imbalance)
 			goto next;
 
-		move_task(p, env);
-		pulled++;
+		detach_task(p, env);
+
+		detached++;
 		env->imbalance -= load;
 
 #ifdef CONFIG_PREEMPT
 		/*
 		 * NEWIDLE balancing is a source of latency, so preemptible
-		 * kernels will stop after the first task is pulled to minimize
+		 * kernels will stop after the first task is detached to minimize
 		 * the critical section.
 		 */
 		if (env->idle == CPU_NEWLY_IDLE)
@@ -5445,13 +5448,31 @@ static int move_tasks(struct lb_env *env)
 	}
 
 	/*
-	 * Right now, this is one of only two places move_task() is called,
-	 * so we can safely collect move_task() stats here rather than
-	 * inside move_task().
+	 * Right now, this is one of only two places we collect this stat
+	 * so we can safely collect detach_one_task() stats here rather
+	 * than inside detach_one_task().
 	 */
-	schedstat_add(env->sd, lb_gained[env->idle], pulled);
+	schedstat_add(env->sd, lb_gained[env->idle], detached);
 
-	return pulled;
+	return detached;
+}
+
+/* Attach tasks previously detached in detach_tasks() */
+static void attach_tasks(struct lb_env *env)
+{
+	struct list_head *tasks = &env->tasks;
+	struct task_struct *p;
+
+	lockdep_assert_held(&env->dst_rq->lock);
+
+	while (!list_empty(tasks)) {
+		p = list_first_entry(tasks, struct task_struct, se.group_node);
+		BUG_ON(task_rq(p) != env->dst_rq);
+		list_del_init(&p->se.group_node);
+		p->on_rq = ONRQ_QUEUED;
+		activate_task(env->dst_rq, p, 0);
+		check_preempt_curr(env->dst_rq, p, 0);
+	}
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -6561,6 +6582,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 		.loop_break	= sched_nr_migrate_break,
 		.cpus		= cpus,
 		.fbq_type	= all,
+		.tasks		= LIST_HEAD_INIT(env.tasks),
 	};
 
 	/*
@@ -6610,16 +6632,35 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 		env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);
 
 more_balance:
-		local_irq_save(flags);
-		double_rq_lock(env.dst_rq, busiest);
+		raw_spin_lock_irqsave(&busiest->lock, flags);
 
 		/*
 		 * cur_ld_moved - load moved in current iteration
 		 * ld_moved     - cumulative load moved across iterations
 		 */
-		cur_ld_moved = move_tasks(&env);
-		ld_moved += cur_ld_moved;
-		double_rq_unlock(env.dst_rq, busiest);
+		cur_ld_moved = detach_tasks(&env);
+
+		/*
+		 * We've detached some tasks from busiest_rq. Every
+		 * task is marked "ONRQ_MIGRATING", so we can safely
+		 * unlock busiest->lock, and we are able to be sure
+		 * that nobody can manipulate the tasks in parallel.
+		 * See task_rq_lock() family for the details.
+		 */
+
+		raw_spin_unlock(&busiest->lock);
+
+		if (cur_ld_moved) {
+			raw_spin_lock(&env.dst_rq->lock);
+			/*
+			 * Attach the tasks to env->dst_rq
+			 * and mark them "ONRQ_QUEUED".
+			 */
+			attach_tasks(&env);
+			raw_spin_unlock(&env.dst_rq->lock);
+			ld_moved += cur_ld_moved;
+		}
+
 		local_irq_restore(flags);
 
 		/*
@@ -6755,7 +6796,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 		 * If we've begun active balancing, start to back off. This
 		 * case may not be covered by the all_pinned logic if there
 		 * is only 1 task on the busy runqueue (because we don't call
-		 * move_tasks).
+		 * detach_tasks).
 		 */
 		if (sd->balance_interval < sd->max_interval)
 			sd->balance_interval *= 2;





* Re: [PATCH v4 3/6] sched: Teach scheduler to understand ONRQ_MIGRATING state
  2014-08-06  8:06 ` [PATCH v4 3/6] sched: Teach scheduler to understand ONRQ_MIGRATING state Kirill Tkhai
@ 2014-08-12  7:55   ` Peter Zijlstra
  2014-08-12  8:34     ` Kirill Tkhai
  0 siblings, 1 reply; 21+ messages in thread
From: Peter Zijlstra @ 2014-08-12  7:55 UTC (permalink / raw)
  To: Kirill Tkhai
  Cc: linux-kernel, pjt, oleg, rostedt, umgwanakikbuti, tkhai,
	tim.c.chen, mingo, nicolas.pitre


On Wed, Aug 06, 2014 at 12:06:19PM +0400, Kirill Tkhai wrote:
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -331,9 +331,13 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
>  	lockdep_assert_held(&p->pi_lock);
>  
>  	for (;;) {
> +		while (unlikely(task_migrating(p)))
> +			cpu_relax();
> +
>  		rq = task_rq(p);
>  		raw_spin_lock(&rq->lock);
> -		if (likely(rq == task_rq(p)))
> +		if (likely(rq == task_rq(p) &&
> +			   !task_migrating(p)))
>  			return rq;
>  		raw_spin_unlock(&rq->lock);
>  	}
> @@ -349,10 +353,14 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
>  	struct rq *rq;
>  
>  	for (;;) {
> +		while (unlikely(task_migrating(p)))
> +			cpu_relax();
> +
>  		raw_spin_lock_irqsave(&p->pi_lock, *flags);
>  		rq = task_rq(p);
>  		raw_spin_lock(&rq->lock);
> -		if (likely(rq == task_rq(p)))
> +		if (likely(rq == task_rq(p) &&
> +			   !task_migrating(p)))
>  			return rq;
>  		raw_spin_unlock(&rq->lock);
>  		raw_spin_unlock_irqrestore(&p->pi_lock, *flags);

I know I suggested that, but I changed it as below. The advantage is
not having two task_migrating() tests on the likely path.

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -331,15 +331,15 @@ static inline struct rq *__task_rq_lock(
 	lockdep_assert_held(&p->pi_lock);
 
 	for (;;) {
-		while (unlikely(task_migrating(p)))
-			cpu_relax();
-
 		rq = task_rq(p);
 		raw_spin_lock(&rq->lock);
 		if (likely(rq == task_rq(p) &&
 			   !task_migrating(p)))
 			return rq;
 		raw_spin_unlock(&rq->lock);
+
+		while (unlikely(task_migrating(p)))
+			cpu_relax();
 	}
 }
 
@@ -353,9 +353,6 @@ static struct rq *task_rq_lock(struct ta
 	struct rq *rq;
 
 	for (;;) {
-		while (unlikely(task_migrating(p)))
-			cpu_relax();
-
 		raw_spin_lock_irqsave(&p->pi_lock, *flags);
 		rq = task_rq(p);
 		raw_spin_lock(&rq->lock);
@@ -364,6 +361,9 @@ static struct rq *task_rq_lock(struct ta
 			return rq;
 		raw_spin_unlock(&rq->lock);
 		raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
+
+		while (unlikely(task_migrating(p)))
+			cpu_relax();
 	}
 }
 



* Re: [PATCH v4 4/6] sched: Remove double_rq_lock() from __migrate_task()
  2014-08-06  8:06 ` [PATCH v4 4/6] sched: Remove double_rq_lock() from __migrate_task() Kirill Tkhai
@ 2014-08-12  8:21   ` Peter Zijlstra
  0 siblings, 0 replies; 21+ messages in thread
From: Peter Zijlstra @ 2014-08-12  8:21 UTC (permalink / raw)
  To: Kirill Tkhai
  Cc: linux-kernel, pjt, oleg, rostedt, umgwanakikbuti, tkhai,
	tim.c.chen, mingo, nicolas.pitre


On Wed, Aug 06, 2014 at 12:06:27PM +0400, Kirill Tkhai wrote:
> 
> Let's use ONRQ_MIGRATING instead.

I feel there should be a little more in the changelog; how about
something like:

Avoid double_rq_lock() and use ONRQ_MIGRATING for __migrate_task(). The
advantage is (obviously) not holding two 'rq->lock's at the same time
and thereby increasing parallelism.

The important point to note is that because we acquire dst->lock
immediately after releasing src->lock the potential wait time of
task_rq_lock() callers on ONRQ_MIGRATING is not longer than it would
have been in the double rq lock scenario.
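
For reference, the window this changelog describes is the one in the
__migrate_task() hunk of [PATCH v4 4/6] above (sketch; comments are
mine):

	raw_spin_lock(&rq->lock);		/* rq = src rq */
	dequeue_task(rq, p, 0);
	p->on_rq = ONRQ_MIGRATING;
	set_task_cpu(p, dest_cpu);
	raw_spin_unlock(&rq->lock);		/* src released ... */

	rq = cpu_rq(dest_cpu);
	raw_spin_lock(&rq->lock);		/* ... dst taken right away */
	p->on_rq = ONRQ_QUEUED;
	enqueue_task(rq, p, 0);
	check_preempt_curr(rq, p, 0);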

> Signed-off-by: Kirill Tkhai <ktkhai@parallels.com>



* Re: [PATCH v4 3/6] sched: Teach scheduler to understand ONRQ_MIGRATING state
  2014-08-12  7:55   ` Peter Zijlstra
@ 2014-08-12  8:34     ` Kirill Tkhai
  2014-08-12  9:43       ` Peter Zijlstra
  0 siblings, 1 reply; 21+ messages in thread
From: Kirill Tkhai @ 2014-08-12  8:34 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, pjt, oleg, rostedt, umgwanakikbuti, tkhai,
	tim.c.chen, mingo, nicolas.pitre

On Tue, 2014-08-12 at 09:55 +0200, Peter Zijlstra wrote:
> On Wed, Aug 06, 2014 at 12:06:19PM +0400, Kirill Tkhai wrote:
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -331,9 +331,13 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
> >  	lockdep_assert_held(&p->pi_lock);
> >  
> >  	for (;;) {
> > +		while (unlikely(task_migrating(p)))
> > +			cpu_relax();
> > +
> >  		rq = task_rq(p);
> >  		raw_spin_lock(&rq->lock);
> > -		if (likely(rq == task_rq(p)))
> > +		if (likely(rq == task_rq(p) &&
> > +			   !task_migrating(p)))
> >  			return rq;
> >  		raw_spin_unlock(&rq->lock);
> >  	}
> > @@ -349,10 +353,14 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
> >  	struct rq *rq;
> >  
> >  	for (;;) {
> > +		while (unlikely(task_migrating(p)))
> > +			cpu_relax();
> > +
> >  		raw_spin_lock_irqsave(&p->pi_lock, *flags);
> >  		rq = task_rq(p);
> >  		raw_spin_lock(&rq->lock);
> > -		if (likely(rq == task_rq(p)))
> > +		if (likely(rq == task_rq(p) &&
> > +			   !task_migrating(p)))
> >  			return rq;
> >  		raw_spin_unlock(&rq->lock);
> >  		raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
> 
> I know I suggested that; but I changed it like the below. The advantage
> is of not having two task_migrating() tests on the likely path.

I don't have objections. Should I resend the series (also with the new
[4/6] changelog)?

> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -331,15 +331,15 @@ static inline struct rq *__task_rq_lock(
>  	lockdep_assert_held(&p->pi_lock);
>  
>  	for (;;) {
> -		while (unlikely(task_migrating(p)))
> -			cpu_relax();
> -
>  		rq = task_rq(p);
>  		raw_spin_lock(&rq->lock);
>  		if (likely(rq == task_rq(p) &&
>  			   !task_migrating(p)))
>  			return rq;
>  		raw_spin_unlock(&rq->lock);
> +
> +		while (unlikely(task_migrating(p)))
> +			cpu_relax();
>  	}
>  }
>  
> @@ -353,9 +353,6 @@ static struct rq *task_rq_lock(struct ta
>  	struct rq *rq;
>  
>  	for (;;) {
> -		while (unlikely(task_migrating(p)))
> -			cpu_relax();
> -
>  		raw_spin_lock_irqsave(&p->pi_lock, *flags);
>  		rq = task_rq(p);
>  		raw_spin_lock(&rq->lock);
> @@ -364,6 +361,9 @@ static struct rq *task_rq_lock(struct ta
>  			return rq;
>  		raw_spin_unlock(&rq->lock);
>  		raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
> +
> +		while (unlikely(task_migrating(p)))
> +			cpu_relax();
>  	}
>  }
>  




* Re: [PATCH v4 5/6] sched/fair: Remove double_lock_balance() from active_load_balance_cpu_stop()
  2014-08-06  8:06 ` [PATCH v4 5/6] sched/fair: Remove double_lock_balance() from active_load_balance_cpu_stop() Kirill Tkhai
@ 2014-08-12  9:03   ` Peter Zijlstra
  2014-08-12  9:22   ` Peter Zijlstra
  1 sibling, 0 replies; 21+ messages in thread
From: Peter Zijlstra @ 2014-08-12  9:03 UTC (permalink / raw)
  To: Kirill Tkhai
  Cc: linux-kernel, pjt, oleg, rostedt, umgwanakikbuti, tkhai,
	tim.c.chen, mingo, nicolas.pitre


On Wed, Aug 06, 2014 at 12:06:56PM +0400, Kirill Tkhai wrote:
> 
> Bad situation:
> 
> double_lock_balance() drops busiest_rq lock. The busiest_rq is *busiest*,
> and a lot of tasks and context switches there. We are dropping the lock
> and waiting for it again.
> 
> Let's just detach the task and once finally unlock it!

that wants rewording, much like the previous one I did.

> 
> Warning: this admits unlocked using of can_migrate_task(), throttled_lb_pair(),
> and task_hot(). I've added lockdep asserts to point on this.

That doesn't make sense; see below.

> Signed-off-by: Kirill Tkhai <ktkhai@parallels.com>
> ---
>  kernel/sched/fair.c |   55 +++++++++++++++++++++++++++++++++++----------------
>  1 file changed, 38 insertions(+), 17 deletions(-)
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index d54b72c..cfeafb1 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -3297,6 +3297,8 @@ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
>   * Ensure that neither of the group entities corresponding to src_cpu or
>   * dest_cpu are members of a throttled hierarchy when performing group
>   * load-balance operations.
> + *
> + * Note: RQs may be unlocked.
>   */
>  static inline int throttled_lb_pair(struct task_group *tg,
>  				    int src_cpu, int dest_cpu)

I'm not immediately seeing this; this function is only ever called from
can_migrate_task(); and there you assert that we must be holding src_rq.

And at this point src_rq is the only relevant rq, since that is the one
the task is still on.

so let me remove this comment.



* Re: [PATCH v4 5/6] sched/fair: Remove double_lock_balance() from active_load_balance_cpu_stop()
  2014-08-06  8:06 ` [PATCH v4 5/6] sched/fair: Remove double_lock_balance() from active_load_balance_cpu_stop() Kirill Tkhai
  2014-08-12  9:03   ` Peter Zijlstra
@ 2014-08-12  9:22   ` Peter Zijlstra
  2014-08-12  9:39     ` Kirill Tkhai
  1 sibling, 1 reply; 21+ messages in thread
From: Peter Zijlstra @ 2014-08-12  9:22 UTC (permalink / raw)
  To: Kirill Tkhai
  Cc: linux-kernel, pjt, oleg, rostedt, umgwanakikbuti, tkhai,
	tim.c.chen, mingo, nicolas.pitre



Something like so?

---
Subject: sched/fair: Remove double_lock_balance() from active_load_balance_cpu_stop()
From: Kirill Tkhai <ktkhai@parallels.com>
Date: Wed, 6 Aug 2014 12:06:56 +0400

Avoid double_rq_lock() and use ONRQ_MIGRATING for
active_load_balance_cpu_stop(). The advantage is (obviously) not
holding two 'rq->lock's at the same time and thereby increasing
parallelism.

Further note that if there was no task to migrate we will not have
acquired the second rq->lock at all.

The important point to note is that because we acquire dst->lock
immediately after releasing src->lock the potential wait time of
task_rq_lock() callers on ONRQ_MIGRATING is not longer than it would
have been in the double rq lock scenario.

Signed-off-by: Kirill Tkhai <ktkhai@parallels.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1407312416.8424.47.camel@tkhai
---
 kernel/sched/fair.c |   60 ++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 44 insertions(+), 16 deletions(-)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5135,6 +5135,8 @@ static int task_hot(struct task_struct *
 {
 	s64 delta;
 
+	lockdep_assert_held(&env->src_rq->lock);
+
 	if (p->sched_class != &fair_sched_class)
 		return 0;
 
@@ -5254,6 +5256,9 @@ static
 int can_migrate_task(struct task_struct *p, struct lb_env *env)
 {
 	int tsk_cache_hot = 0;
+
+	lockdep_assert_held(&env->src_rq->lock);
+
 	/*
 	 * We do not migrate tasks that are:
 	 * 1) throttled_lb_pair, or
@@ -5338,30 +5343,49 @@ int can_migrate_task(struct task_struct
 }
 
 /*
- * move_one_task tries to move exactly one task from busiest to this_rq, as
+ * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
  * part of active balancing operations within "domain".
- * Returns 1 if successful and 0 otherwise.
  *
- * Called with both runqueues locked.
+ * Returns a task if successful and NULL otherwise.
  */
-static int move_one_task(struct lb_env *env)
+static struct task_struct *detach_one_task(struct lb_env *env)
 {
 	struct task_struct *p, *n;
 
+	lockdep_assert_held(&env->src_rq->lock);
+
 	list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
 		if (!can_migrate_task(p, env))
 			continue;
 
-		move_task(p, env);
+		deactivate_task(env->src_rq, p, 0);
+		p->on_rq = ONRQ_MIGRATING;
+		set_task_cpu(p, env->dst_cpu);
+
 		/*
-		 * Right now, this is only the second place move_task()
-		 * is called, so we can safely collect move_task()
-		 * stats here rather than inside move_task().
+		 * Right now, this is only the second place where
+		 * lb_gained[env->idle] is updated (other is move_tasks)
+		 * so we can safely collect stats here rather than
+		 * inside move_tasks().
 		 */
 		schedstat_inc(env->sd, lb_gained[env->idle]);
-		return 1;
+		return p;
 	}
-	return 0;
+	return NULL;
+}
+
+/*
+ * attach_one_task() -- attaches the task returned from detach_one_task() to
+ * its new rq.
+ */
+static void attach_one_task(struct rq *rq, struct task_struct *p)
+{
+	raw_spin_lock(&rq->lock);
+	BUG_ON(task_rq(p) != rq);
+	p->on_rq = ONRQ_QUEUED;
+	activate_task(rq, p, 0);
+	check_preempt_curr(rq, p, 0);
+	raw_spin_unlock(&rq->lock);
 }
 
 static const unsigned int sched_nr_migrate_break = 32;
@@ -6940,6 +6964,7 @@ static int active_load_balance_cpu_stop(
 	int target_cpu = busiest_rq->push_cpu;
 	struct rq *target_rq = cpu_rq(target_cpu);
 	struct sched_domain *sd;
+	struct task_struct *p = NULL;
 
 	raw_spin_lock_irq(&busiest_rq->lock);
 
@@ -6959,9 +6984,6 @@ static int active_load_balance_cpu_stop(
 	 */
 	BUG_ON(busiest_rq == target_rq);
 
-	/* move a task from busiest_rq to target_rq */
-	double_lock_balance(busiest_rq, target_rq);
-
 	/* Search for an sd spanning us and the target CPU. */
 	rcu_read_lock();
 	for_each_domain(target_cpu, sd) {
@@ -6982,16 +7004,22 @@ static int active_load_balance_cpu_stop(
 
 		schedstat_inc(sd, alb_count);
 
-		if (move_one_task(&env))
+		p = detach_one_task(&env);
+		if (p)
 			schedstat_inc(sd, alb_pushed);
 		else
 			schedstat_inc(sd, alb_failed);
 	}
 	rcu_read_unlock();
-	double_unlock_balance(busiest_rq, target_rq);
 out_unlock:
 	busiest_rq->active_balance = 0;
-	raw_spin_unlock_irq(&busiest_rq->lock);
+	raw_spin_unlock(&busiest_rq->lock);
+
+	if (p)
+		attach_one_task(target_rq, p);
+
+	local_irq_enable();
+
 	return 0;
 }
 



* Re: [PATCH v4 6/6] sched/fair: Remove double_lock_balance() from load_balance()
  2014-08-06  8:07 ` [PATCH v4 6/6] sched/fair: Remove double_lock_balance() from load_balance() Kirill Tkhai
@ 2014-08-12  9:36   ` Peter Zijlstra
  2014-08-12 10:27     ` Kirill Tkhai
  0 siblings, 1 reply; 21+ messages in thread
From: Peter Zijlstra @ 2014-08-12  9:36 UTC (permalink / raw)
  To: Kirill Tkhai
  Cc: linux-kernel, pjt, oleg, rostedt, umgwanakikbuti, tkhai,
	tim.c.chen, mingo, nicolas.pitre




I changed quite a bit around; it _should_ be more or less the same end
result, I suppose. Only compile-tested so far.

---
Subject: sched/fair: Remove double_lock_balance() from load_balance()
From: Kirill Tkhai <ktkhai@parallels.com>
Date: Wed, 6 Aug 2014 12:07:04 +0400

Avoid double_rq_lock() and use ONRQ_MIGRATING for load_balance(). The
advantage is (obviously) not holding two 'rq->lock's at the same time
and thereby increasing parallelism.

Further note that if there was no task to migrate we will not have
acquired the second rq->lock at all.

The important point to note is that because we acquire dst->lock
immediately after releasing src->lock the potential wait time of
task_rq_lock() callers on ONRQ_MIGRATING is not longer than it would
have been in the double rq lock scenario.
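
In sketch form, the inner loop of load_balance() becomes (condensed
from the hunks in this series; comments are mine):

	raw_spin_lock_irqsave(&busiest->lock, flags);
	cur_ld_moved = detach_tasks(&env);	/* detached tasks sit on env.tasks,
						   marked ONRQ_MIGRATING */
	raw_spin_unlock(&busiest->lock);

	if (cur_ld_moved) {
		raw_spin_lock(&env.dst_rq->lock);
		attach_tasks(&env);		/* ONRQ_QUEUED + activate_task() */
		raw_spin_unlock(&env.dst_rq->lock);
		ld_moved += cur_ld_moved;
	}

	local_irq_restore(flags);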

Signed-off-by: Kirill Tkhai <ktkhai@parallels.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1407312424.8424.48.camel@tkhai
---
 kernel/sched/fair.c |  151 ++++++++++++++++++++++++++++++++++------------------
 1 file changed, 99 insertions(+), 52 deletions(-)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4706,7 +4706,7 @@ static void check_preempt_wakeup(struct
 		return;
 
 	/*
-	 * This is possible from callers such as move_task(), in which we
+	 * This is possible from callers such as attach_tasks(), in which we
 	 * unconditionally check_prempt_curr() after an enqueue (which may have
 	 * lead to a throttle).  This both saves work and prevents false
 	 * next-buddy nomination below.
@@ -5114,21 +5114,10 @@ struct lb_env {
 	unsigned int		loop_max;
 
 	enum fbq_type		fbq_type;
+	struct list_head	tasks;
 };
 
 /*
- * move_task - move a task from one runqueue to another runqueue.
- * Both runqueues must be locked.
- */
-static void move_task(struct task_struct *p, struct lb_env *env)
-{
-	deactivate_task(env->src_rq, p, 0);
-	set_task_cpu(p, env->dst_cpu);
-	activate_task(env->dst_rq, p, 0);
-	check_preempt_curr(env->dst_rq, p, 0);
-}
-
-/*
  * Is this task likely cache-hot:
  */
 static int task_hot(struct task_struct *p, struct lb_env *env)
@@ -5343,6 +5332,18 @@ int can_migrate_task(struct task_struct
 }
 
 /*
+ * detach_task() -- detach the task for the migration specified in env
+ */
+static void detach_task(struct task_struct *p, struct lb_env *env)
+{
+	lockdep_assert_held(&env->src_rq->lock);
+
+	deactivate_task(env->src_rq, p, 0);
+	p->on_rq = ONRQ_MIGRATING;
+	set_task_cpu(p, env->dst_cpu);
+}
+
+/*
  * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
  * part of active balancing operations within "domain".
  *
@@ -5358,15 +5359,13 @@ static struct task_struct *detach_one_ta
 		if (!can_migrate_task(p, env))
 			continue;
 
-		deactivate_task(env->src_rq, p, 0);
-		p->on_rq = ONRQ_MIGRATING;
-		set_task_cpu(p, env->dst_cpu);
+		detach_task(p, env);
 
 		/*
 		 * Right now, this is only the second place where
-		 * lb_gained[env->idle] is updated (other is move_tasks)
+		 * lb_gained[env->idle] is updated (other is detach_tasks)
 		 * so we can safely collect stats here rather than
-		 * inside move_tasks().
+		 * inside detach_tasks().
 		 */
 		schedstat_inc(env->sd, lb_gained[env->idle]);
 		return p;
@@ -5374,35 +5373,22 @@ static struct task_struct *detach_one_ta
 	return NULL;
 }
 
-/*
- * attach_one_task() -- attaches the task returned from detach_one_task() to
- * its new rq.
- */
-static void attach_one_task(struct rq *rq, struct task_struct *p)
-{
-	raw_spin_lock(&rq->lock);
-	BUG_ON(task_rq(p) != rq);
-	p->on_rq = ONRQ_QUEUED;
-	activate_task(rq, p, 0);
-	check_preempt_curr(rq, p, 0);
-	raw_spin_unlock(&rq->lock);
-}
-
 static const unsigned int sched_nr_migrate_break = 32;
 
 /*
- * move_tasks tries to move up to imbalance weighted load from busiest to
- * this_rq, as part of a balancing operation within domain "sd".
- * Returns 1 if successful and 0 otherwise.
+ * detach_tasks() -- tries to detach up to imbalance weighted load from
+ * busiest_rq, as part of a balancing operation within domain "sd".
  *
- * Called with both runqueues locked.
+ * Returns number of detached tasks if successful and 0 otherwise.
  */
-static int move_tasks(struct lb_env *env)
+static int detach_tasks(struct lb_env *env)
 {
 	struct list_head *tasks = &env->src_rq->cfs_tasks;
 	struct task_struct *p;
 	unsigned long load;
-	int pulled = 0;
+	int detached = 0;
+
+	lockdep_assert_held(&env->src_rq->lock);
 
 	if (env->imbalance <= 0)
 		return 0;
@@ -5433,14 +5419,16 @@ static int move_tasks(struct lb_env *env
 		if ((load / 2) > env->imbalance)
 			goto next;
 
-		move_task(p, env);
-		pulled++;
+		detach_task(p, env);
+		list_add(&p->se.group_node, &env->tasks);
+
+		detached++;
 		env->imbalance -= load;
 
 #ifdef CONFIG_PREEMPT
 		/*
 		 * NEWIDLE balancing is a source of latency, so preemptible
-		 * kernels will stop after the first task is pulled to minimize
+		 * kernels will stop after the first task is detached to minimize
 		 * the critical section.
 		 */
 		if (env->idle == CPU_NEWLY_IDLE)
@@ -5460,13 +5448,58 @@ static int move_tasks(struct lb_env *env
 	}
 
 	/*
-	 * Right now, this is one of only two places move_task() is called,
-	 * so we can safely collect move_task() stats here rather than
-	 * inside move_task().
+	 * Right now, this is one of only two places we collect this stat
+	 * so we can safely collect detach_one_task() stats here rather
+	 * than inside detach_one_task().
 	 */
-	schedstat_add(env->sd, lb_gained[env->idle], pulled);
+	schedstat_add(env->sd, lb_gained[env->idle], detached);
 
-	return pulled;
+	return detached;
+}
+
+/*
+ * attach_task() -- attach the task detached by detach_task() to its new rq.
+ */
+static void attach_task(struct rq *rq, struct task_struct *p)
+{
+	lockdep_assert_held(&rq->lock);
+
+	BUG_ON(task_rq(p) != rq);
+	p->on_rq = ONRQ_QUEUED;
+	activate_task(rq, p, 0);
+	check_preempt_curr(rq, p, 0);
+}
+
+/*
+ * attach_one_task() -- attaches the task returned from detach_one_task() to
+ * its new rq.
+ */
+static void attach_one_task(struct rq *rq, struct task_struct *p)
+{
+	raw_spin_lock(&rq->lock);
+	attach_task(rq, p);
+	raw_spin_unlock(&rq->lock);
+}
+
+/*
+ * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
+ * new rq.
+ */
+static void attach_tasks(struct lb_env *env)
+{
+	struct list_head *tasks = &env->tasks;
+	struct task_struct *p;
+
+	raw_spin_lock(&env->dst_rq->lock);
+
+	while (!list_empty(tasks)) {
+		p = list_first_entry(tasks, struct task_struct, se.group_node);
+		list_del_init(&p->se.group_node);
+
+		attach_task(env->dst_rq, p);
+	}
+
+	raw_spin_unlock(&env->dst_rq->lock);
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -6600,6 +6633,7 @@ static int load_balance(int this_cpu, st
 		.loop_break	= sched_nr_migrate_break,
 		.cpus		= cpus,
 		.fbq_type	= all,
+		.tasks		= LIST_HEAD_INIT(env.tasks),
 	};
 
 	/*
@@ -6649,16 +6683,29 @@ static int load_balance(int this_cpu, st
 		env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);
 
 more_balance:
-		local_irq_save(flags);
-		double_rq_lock(env.dst_rq, busiest);
+		raw_spin_lock_irqsave(&busiest->lock, flags);
 
 		/*
 		 * cur_ld_moved - load moved in current iteration
 		 * ld_moved     - cumulative load moved across iterations
 		 */
-		cur_ld_moved = move_tasks(&env);
-		ld_moved += cur_ld_moved;
-		double_rq_unlock(env.dst_rq, busiest);
+		cur_ld_moved = detach_tasks(&env);
+
+		/*
+		 * We've detached some tasks from busiest_rq. Every
+		 * task is masked "ONRQ_MIGRATED", so we can safely
+		 * unlock busiest->lock, and we are able to be sure
+		 * that nobody can manipulate the tasks in parallel.
+		 * See task_rq_lock() family for the details.
+		 */
+
+		raw_spin_unlock(&busiest->lock);
+
+		if (cur_ld_moved) {
+			attach_tasks(&env);
+			ld_moved += cur_ld_moved;
+		}
+
 		local_irq_restore(flags);
 
 		/*
@@ -6794,7 +6841,7 @@ static int load_balance(int this_cpu, st
 		 * If we've begun active balancing, start to back off. This
 		 * case may not be covered by the all_pinned logic if there
 		 * is only 1 task on the busy runqueue (because we don't call
-		 * move_tasks).
+		 * detach_tasks).
 		 */
 		if (sd->balance_interval < sd->max_interval)
 			sd->balance_interval *= 2;
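
For readability, the core of the new pattern condensed into a sketch
(hypothetical helper name; the retry logic, schedstats and all_pinned
handling are omitted -- see the real hunks above):

static int load_balance_sketch(struct lb_env *env, struct rq *busiest)
{
	unsigned long flags;
	int cur_ld_moved, ld_moved = 0;

	raw_spin_lock_irqsave(&busiest->lock, flags);
	/* Detach under the src lock; tasks go onto env->tasks marked ONRQ_MIGRATING. */
	cur_ld_moved = detach_tasks(env);
	raw_spin_unlock(&busiest->lock);

	if (cur_ld_moved) {
		/* attach_tasks() takes env->dst_rq->lock itself. */
		attach_tasks(env);
		ld_moved += cur_ld_moved;
	}

	local_irq_restore(flags);
	return ld_moved;
}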


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v4 5/6] sched/fair: Remove double_lock_balance() from active_load_balance_cpu_stop()
  2014-08-12  9:22   ` Peter Zijlstra
@ 2014-08-12  9:39     ` Kirill Tkhai
  0 siblings, 0 replies; 21+ messages in thread
From: Kirill Tkhai @ 2014-08-12  9:39 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, pjt, oleg, rostedt, umgwanakikbuti, tkhai,
	tim.c.chen, mingo, nicolas.pitre

On Tue, 12/08/2014 at 11:22 +0200, Peter Zijlstra wrote:
> Something like so?

The paired detach_one_task()/attach_one_task() brackets look good.
No objections.
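
Condensed, the bracket pattern looks roughly like this (a sketch with a
hypothetical name; the sd lookup, schedstats and the active_balance
bookkeeping are omitted):

static int active_balance_sketch(struct rq *busiest_rq, struct rq *target_rq,
				 struct lb_env *env)
{
	struct task_struct *p;

	raw_spin_lock_irq(&busiest_rq->lock);
	/* detach_one_task() leaves p->on_rq == ONRQ_MIGRATING. */
	p = detach_one_task(env);
	raw_spin_unlock(&busiest_rq->lock);

	if (p)
		attach_one_task(target_rq, p);	/* re-queues p under target_rq->lock */

	local_irq_enable();
	return 0;
}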

> ---
> Subject: sched/fair: Remove double_lock_balance() from active_load_balance_cpu_stop()
> From: Kirill Tkhai <ktkhai@parallels.com>
> Date: Wed, 6 Aug 2014 12:06:56 +0400
> 
> Avoid double_rq_lock() and use ONRQ_MIGRATING for
> active_load_balance_cpu_stop(). The advantage is (obviously) not
> holding two 'rq->lock's at the same time and thereby increasing
> parallelism.
> 
> Further note that if there was no task to migrate we will not have
> acquired the second rq->lock at all.
> 
> The important point to note is that because we acquire dst->lock
> immediately after releasing src->lock the potential wait time of
> task_rq_lock() callers on ONRQ_MIGRATING is not longer than it would
> have been in the double rq lock scenario.
> 
> Signed-off-by: Kirill Tkhai <ktkhai@parallels.com>
> Signed-off-by: Peter Zijlstra <peterz@infradead.org>
> Link: http://lkml.kernel.org/r/1407312416.8424.47.camel@tkhai
> ---
>  kernel/sched/fair.c |   60 ++++++++++++++++++++++++++++++++++++++--------------
>  1 file changed, 44 insertions(+), 16 deletions(-)
> 
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5135,6 +5135,8 @@ static int task_hot(struct task_struct *
>  {
>  	s64 delta;
>  
> +	lockdep_assert_held(&env->src_rq->lock);
> +
>  	if (p->sched_class != &fair_sched_class)
>  		return 0;
>  
> @@ -5254,6 +5256,9 @@ static
>  int can_migrate_task(struct task_struct *p, struct lb_env *env)
>  {
>  	int tsk_cache_hot = 0;
> +
> +	lockdep_assert_held(&env->src_rq->lock);
> +
>  	/*
>  	 * We do not migrate tasks that are:
>  	 * 1) throttled_lb_pair, or
> @@ -5338,30 +5343,49 @@ int can_migrate_task(struct task_struct
>  }
>  
>  /*
> - * move_one_task tries to move exactly one task from busiest to this_rq, as
> + * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
>   * part of active balancing operations within "domain".
> - * Returns 1 if successful and 0 otherwise.
>   *
> - * Called with both runqueues locked.
> + * Returns a task if successful and NULL otherwise.
>   */
> -static int move_one_task(struct lb_env *env)
> +static struct task_struct *detach_one_task(struct lb_env *env)
>  {
>  	struct task_struct *p, *n;
>  
> +	lockdep_assert_held(&env->src_rq->lock);
> +
>  	list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
>  		if (!can_migrate_task(p, env))
>  			continue;
>  
> -		move_task(p, env);
> +		deactivate_task(env->src_rq, p, 0);
> +		p->on_rq = ONRQ_MIGRATING;
> +		set_task_cpu(p, env->dst_cpu);
> +
>  		/*
> -		 * Right now, this is only the second place move_task()
> -		 * is called, so we can safely collect move_task()
> -		 * stats here rather than inside move_task().
> +		 * Right now, this is only the second place where
> +		 * lb_gained[env->idle] is updated (other is move_tasks)
> +		 * so we can safely collect stats here rather than
> +		 * inside move_tasks().
>  		 */
>  		schedstat_inc(env->sd, lb_gained[env->idle]);
> -		return 1;
> +		return p;
>  	}
> -	return 0;
> +	return NULL;
> +}
> +
> +/*
> + * attach_one_task() -- attaches the task returned from detach_one_task() to
> + * its new rq.
> + */
> +static void attach_one_task(struct rq *rq, struct task_struct *p)
> +{
> +	raw_spin_lock(&rq->lock);
> +	BUG_ON(task_rq(p) != rq);
> +	p->on_rq = ONRQ_QUEUED;
> +	activate_task(rq, p, 0);
> +	check_preempt_curr(rq, p, 0);
> +	raw_spin_unlock(&rq->lock);
>  }
>  
>  static const unsigned int sched_nr_migrate_break = 32;
> @@ -6940,6 +6964,7 @@ static int active_load_balance_cpu_stop(
>  	int target_cpu = busiest_rq->push_cpu;
>  	struct rq *target_rq = cpu_rq(target_cpu);
>  	struct sched_domain *sd;
> +	struct task_struct *p = NULL;
>  
>  	raw_spin_lock_irq(&busiest_rq->lock);
>  
> @@ -6959,9 +6984,6 @@ static int active_load_balance_cpu_stop(
>  	 */
>  	BUG_ON(busiest_rq == target_rq);
>  
> -	/* move a task from busiest_rq to target_rq */
> -	double_lock_balance(busiest_rq, target_rq);
> -
>  	/* Search for an sd spanning us and the target CPU. */
>  	rcu_read_lock();
>  	for_each_domain(target_cpu, sd) {
> @@ -6982,16 +7004,22 @@ static int active_load_balance_cpu_stop(
>  
>  		schedstat_inc(sd, alb_count);
>  
> -		if (move_one_task(&env))
> +		p = detach_one_task(&env);
> +		if (p)
>  			schedstat_inc(sd, alb_pushed);
>  		else
>  			schedstat_inc(sd, alb_failed);
>  	}
>  	rcu_read_unlock();
> -	double_unlock_balance(busiest_rq, target_rq);
>  out_unlock:
>  	busiest_rq->active_balance = 0;
> -	raw_spin_unlock_irq(&busiest_rq->lock);
> +	raw_spin_unlock(&busiest_rq->lock);
> +
> +	if (p)
> +		attach_one_task(target_rq, p);
> +
> +	local_irq_enable();
> +
>  	return 0;
>  }
>  



^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v4 3/6] sched: Teach scheduler to understand ONRQ_MIGRATING state
  2014-08-12  8:34     ` Kirill Tkhai
@ 2014-08-12  9:43       ` Peter Zijlstra
  0 siblings, 0 replies; 21+ messages in thread
From: Peter Zijlstra @ 2014-08-12  9:43 UTC (permalink / raw)
  To: Kirill Tkhai
  Cc: linux-kernel, pjt, oleg, rostedt, umgwanakikbuti, tkhai,
	tim.c.chen, mingo, nicolas.pitre


On Tue, Aug 12, 2014 at 12:34:23PM +0400, Kirill Tkhai wrote:
> I don't have objections. Should I resend the series (also with new [4/6] log
> commentary)?

Nah, I'll keep the patches as I have them now; I'll push them out to the
queue.git tree somewhat later today so you can have a peek.


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v4 6/6] sched/fair: Remove double_lock_balance() from load_balance()
  2014-08-12  9:36   ` Peter Zijlstra
@ 2014-08-12 10:27     ` Kirill Tkhai
  0 siblings, 0 replies; 21+ messages in thread
From: Kirill Tkhai @ 2014-08-12 10:27 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, pjt, oleg, rostedt, umgwanakikbuti, tkhai,
	tim.c.chen, mingo, nicolas.pitre

On Tue, 12/08/2014 at 11:36 +0200, Peter Zijlstra wrote:
> 
> Changed quite a bit around, _should_ be more or less the same end result
> I suppose. Only compile tested so far.

No objections. I've tested it in a virtual machine; a simple "make -jxx"
test passed.

> 
> ---
> Subject: sched/fair: Remove double_lock_balance() from load_balance()
> From: Kirill Tkhai <ktkhai@parallels.com>
> Date: Wed, 6 Aug 2014 12:07:04 +0400
> 
> Avoid double_rq_lock() and use ONRQ_MIGRATING for load_balance(). The
> advantage is (obviously) not holding two 'rq->lock's at the same time
> and thereby increasing parallelism.
> 
> Further note that if there was no task to migrate we will not have
> acquired the second rq->lock at all.
> 
> The important point to note is that because we acquire dst->lock
> immediately after releasing src->lock the potential wait time of
> task_rq_lock() callers on ONRQ_MIGRATING is not longer than it would
> have been in the double rq lock scenario.
> 
> Signed-off-by: Kirill Tkhai <ktkhai@parallels.com>
> Signed-off-by: Peter Zijlstra <peterz@infradead.org>
> Link: http://lkml.kernel.org/r/1407312424.8424.48.camel@tkhai
> ---
>  kernel/sched/fair.c |  151 ++++++++++++++++++++++++++++++++++------------------
>  1 file changed, 99 insertions(+), 52 deletions(-)
> 
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4706,7 +4706,7 @@ static void check_preempt_wakeup(struct
>  		return;
>  
>  	/*
> -	 * This is possible from callers such as move_task(), in which we
> +	 * This is possible from callers such as attach_tasks(), in which we
>  	 * unconditionally check_prempt_curr() after an enqueue (which may have
>  	 * lead to a throttle).  This both saves work and prevents false
>  	 * next-buddy nomination below.
> @@ -5114,21 +5114,10 @@ struct lb_env {
>  	unsigned int		loop_max;
>  
>  	enum fbq_type		fbq_type;
> +	struct list_head	tasks;
>  };
>  
>  /*
> - * move_task - move a task from one runqueue to another runqueue.
> - * Both runqueues must be locked.
> - */
> -static void move_task(struct task_struct *p, struct lb_env *env)
> -{
> -	deactivate_task(env->src_rq, p, 0);
> -	set_task_cpu(p, env->dst_cpu);
> -	activate_task(env->dst_rq, p, 0);
> -	check_preempt_curr(env->dst_rq, p, 0);
> -}
> -
> -/*
>   * Is this task likely cache-hot:
>   */
>  static int task_hot(struct task_struct *p, struct lb_env *env)
> @@ -5343,6 +5332,18 @@ int can_migrate_task(struct task_struct
>  }
>  
>  /*
> + * detach_task() -- detach the task for the migration specified in env
> + */
> +static void detach_task(struct task_struct *p, struct lb_env *env)
> +{
> +	lockdep_assert_held(&env->src_rq->lock);
> +
> +	deactivate_task(env->src_rq, p, 0);
> +	p->on_rq = ONRQ_MIGRATING;
> +	set_task_cpu(p, env->dst_cpu);
> +}
> +
> +/*
>   * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
>   * part of active balancing operations within "domain".
>   *
> @@ -5358,15 +5359,13 @@ static struct task_struct *detach_one_ta
>  		if (!can_migrate_task(p, env))
>  			continue;
>  
> -		deactivate_task(env->src_rq, p, 0);
> -		p->on_rq = ONRQ_MIGRATING;
> -		set_task_cpu(p, env->dst_cpu);
> +		detach_task(p, env);
>  
>  		/*
>  		 * Right now, this is only the second place where
> -		 * lb_gained[env->idle] is updated (other is move_tasks)
> +		 * lb_gained[env->idle] is updated (other is detach_tasks)
>  		 * so we can safely collect stats here rather than
> -		 * inside move_tasks().
> +		 * inside detach_tasks().
>  		 */
>  		schedstat_inc(env->sd, lb_gained[env->idle]);
>  		return p;
> @@ -5374,35 +5373,22 @@ static struct task_struct *detach_one_ta
>  	return NULL;
>  }
>  
> -/*
> - * attach_one_task() -- attaches the task returned from detach_one_task() to
> - * its new rq.
> - */
> -static void attach_one_task(struct rq *rq, struct task_struct *p)
> -{
> -	raw_spin_lock(&rq->lock);
> -	BUG_ON(task_rq(p) != rq);
> -	p->on_rq = ONRQ_QUEUED;
> -	activate_task(rq, p, 0);
> -	check_preempt_curr(rq, p, 0);
> -	raw_spin_unlock(&rq->lock);
> -}
> -
>  static const unsigned int sched_nr_migrate_break = 32;
>  
>  /*
> - * move_tasks tries to move up to imbalance weighted load from busiest to
> - * this_rq, as part of a balancing operation within domain "sd".
> - * Returns 1 if successful and 0 otherwise.
> + * detach_tasks() -- tries to detach up to imbalance weighted load from
> + * busiest_rq, as part of a balancing operation within domain "sd".
>   *
> - * Called with both runqueues locked.
> + * Returns number of detached tasks if successful and 0 otherwise.
>   */
> -static int move_tasks(struct lb_env *env)
> +static int detach_tasks(struct lb_env *env)
>  {
>  	struct list_head *tasks = &env->src_rq->cfs_tasks;
>  	struct task_struct *p;
>  	unsigned long load;
> -	int pulled = 0;
> +	int detached = 0;
> +
> +	lockdep_assert_held(&env->src_rq->lock);
>  
>  	if (env->imbalance <= 0)
>  		return 0;
> @@ -5433,14 +5419,16 @@ static int move_tasks(struct lb_env *env
>  		if ((load / 2) > env->imbalance)
>  			goto next;
>  
> -		move_task(p, env);
> -		pulled++;
> +		detach_task(p, env);
> +		list_add(&p->se.group_node, &env->tasks);
> +
> +		detached++;
>  		env->imbalance -= load;
>  
>  #ifdef CONFIG_PREEMPT
>  		/*
>  		 * NEWIDLE balancing is a source of latency, so preemptible
> -		 * kernels will stop after the first task is pulled to minimize
> +		 * kernels will stop after the first task is detached to minimize
>  		 * the critical section.
>  		 */
>  		if (env->idle == CPU_NEWLY_IDLE)
> @@ -5460,13 +5448,58 @@ static int move_tasks(struct lb_env *env
>  	}
>  
>  	/*
> -	 * Right now, this is one of only two places move_task() is called,
> -	 * so we can safely collect move_task() stats here rather than
> -	 * inside move_task().
> +	 * Right now, this is one of only two places we collect this stat
> +	 * so we can safely collect detach_one_task() stats here rather
> +	 * than inside detach_one_task().
>  	 */
> -	schedstat_add(env->sd, lb_gained[env->idle], pulled);
> +	schedstat_add(env->sd, lb_gained[env->idle], detached);
>  
> -	return pulled;
> +	return detached;
> +}
> +
> +/*
> + * attach_task() -- attach the task detached by detach_task() to its new rq.
> + */
> +static void attach_task(struct rq *rq, struct task_struct *p)
> +{
> +	lockdep_assert_held(&rq->lock);
> +
> +	BUG_ON(task_rq(p) != rq);
> +	p->on_rq = ONRQ_QUEUED;
> +	activate_task(rq, p, 0);
> +	check_preempt_curr(rq, p, 0);
> +}
> +
> +/*
> + * attach_one_task() -- attaches the task returned from detach_one_task() to
> + * its new rq.
> + */
> +static void attach_one_task(struct rq *rq, struct task_struct *p)
> +{
> +	raw_spin_lock(&rq->lock);
> +	attach_task(rq, p);
> +	raw_spin_unlock(&rq->lock);
> +}
> +
> +/*
> + * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
> + * new rq.
> + */
> +static void attach_tasks(struct lb_env *env)
> +{
> +	struct list_head *tasks = &env->tasks;
> +	struct task_struct *p;
> +
> +	raw_spin_lock(&env->dst_rq->lock);
> +
> +	while (!list_empty(tasks)) {
> +		p = list_first_entry(tasks, struct task_struct, se.group_node);
> +		list_del_init(&p->se.group_node);
> +
> +		attach_task(env->dst_rq, p);
> +	}
> +
> +	raw_spin_unlock(&env->dst_rq->lock);
>  }
>  
>  #ifdef CONFIG_FAIR_GROUP_SCHED
> @@ -6600,6 +6633,7 @@ static int load_balance(int this_cpu, st
>  		.loop_break	= sched_nr_migrate_break,
>  		.cpus		= cpus,
>  		.fbq_type	= all,
> +		.tasks		= LIST_HEAD_INIT(env.tasks),
>  	};
>  
>  	/*
> @@ -6649,16 +6683,29 @@ static int load_balance(int this_cpu, st
>  		env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);
>  
>  more_balance:
> -		local_irq_save(flags);
> -		double_rq_lock(env.dst_rq, busiest);
> +		raw_spin_lock_irqsave(&busiest->lock, flags);
>  
>  		/*
>  		 * cur_ld_moved - load moved in current iteration
>  		 * ld_moved     - cumulative load moved across iterations
>  		 */
> -		cur_ld_moved = move_tasks(&env);
> -		ld_moved += cur_ld_moved;
> -		double_rq_unlock(env.dst_rq, busiest);
> +		cur_ld_moved = detach_tasks(&env);
> +
> +		/*
> +		 * We've detached some tasks from busiest_rq. Every
> +		 * task is masked "ONRQ_MIGRATED", so we can safely
> +		 * unlock busiest->lock, and we are able to be sure
> +		 * that nobody can manipulate the tasks in parallel.
> +		 * See task_rq_lock() family for the details.
> +		 */
> +
> +		raw_spin_unlock(&busiest->lock);
> +
> +		if (cur_ld_moved) {
> +			attach_tasks(&env);
> +			ld_moved += cur_ld_moved;
> +		}
> +
>  		local_irq_restore(flags);
>  
>  		/*
> @@ -6794,7 +6841,7 @@ static int load_balance(int this_cpu, st
>  		 * If we've begun active balancing, start to back off. This
>  		 * case may not be covered by the all_pinned logic if there
>  		 * is only 1 task on the busy runqueue (because we don't call
> -		 * move_tasks).
> +		 * detach_tasks).
>  		 */
>  		if (sd->balance_interval < sd->max_interval)
>  			sd->balance_interval *= 2;



^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v4 2/6] sched: Wrapper for checking task_struct::on_rq
  2014-08-06  8:06 ` [PATCH v4 2/6] sched: Wrapper for checking task_struct::on_rq Kirill Tkhai
@ 2014-08-20  7:52   ` Ingo Molnar
  0 siblings, 0 replies; 21+ messages in thread
From: Ingo Molnar @ 2014-08-20  7:52 UTC (permalink / raw)
  To: Kirill Tkhai
  Cc: linux-kernel, peterz, pjt, oleg, rostedt, umgwanakikbuti, tkhai,
	tim.c.chen, nicolas.pitre


* Kirill Tkhai <ktkhai@parallels.com> wrote:

> 
> Implement task_queued() and use it everywhere instead of on_rq check.
> No functional changes.
> 
> The only exception is we do not use the wrapper in check_for_tasks(),
> because it requires to export task_queued() in global header files.
> Next patch in series would return it back, so it doesn't matter.
> 
> Signed-off-by: Kirill Tkhai <ktkhai@parallels.com>
> ---
>  kernel/sched/core.c      |   82 +++++++++++++++++++++++-----------------------
>  kernel/sched/deadline.c  |   14 ++++----
>  kernel/sched/fair.c      |   22 ++++++------
>  kernel/sched/rt.c        |   16 ++++-----
>  kernel/sched/sched.h     |    7 ++++
>  kernel/sched/stop_task.c |    2 +
>  6 files changed, 75 insertions(+), 68 deletions(-)
> 
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 1211575..67e8d1e 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -1043,7 +1043,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
>  	 * A queue event has occurred, and we're going to schedule.  In
>  	 * this case, we can save a useless back to back clock update.
>  	 */
> -	if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
> +	if (task_queued(rq->curr) && test_tsk_need_resched(rq->curr))
>  		rq->skip_clock_update = 1;

> -	p->on_rq = 1;
> +	p->on_rq = ONRQ_QUEUED;

> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -15,6 +15,9 @@
>  
>  struct rq;
>  
> +/* task_struct::on_rq states: */
> +#define ONRQ_QUEUED	1
> +
>  extern __read_mostly int scheduler_running;
>  
>  extern unsigned long calc_load_update;
> @@ -942,6 +945,10 @@ static inline int task_running(struct rq *rq, struct task_struct *p)
>  #endif
>  }
>  
> +static inline int task_queued(struct task_struct *p)
> +{
> +	return p->on_rq == ONRQ_QUEUED;
> +}

So I agree with splitting p->on_rq into more states, but the
new naming looks pretty random; we can and should do better.

For example 'task_queued()' gives very little clue that it's 
all about the p->on_rq state. The 'ONRQ_QUEUED' name does not 
signal that this is a task's scheduler internal state, etc.

So I'd suggest a more structured naming scheme, something along 
the lines of:

	TASK_ON_RQ_QUEUED
	TASK_ON_RQ_MIGRATING

	task_on_rq_queued()
	task_on_rq_migrating()

etc.

It's a bit longer, but also more logical and thus easier to 
read and maintain.
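
As a sketch of what that could look like (illustration only, assuming the
helpers stay next to the current task_queued() in kernel/sched/sched.h):

/* task_struct::on_rq states: */
#define TASK_ON_RQ_QUEUED	1
#define TASK_ON_RQ_MIGRATING	2

static inline int task_on_rq_queued(struct task_struct *p)
{
	return p->on_rq == TASK_ON_RQ_QUEUED;
}

static inline int task_on_rq_migrating(struct task_struct *p)
{
	return p->on_rq == TASK_ON_RQ_MIGRATING;
}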

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [tip:sched/core] sched/fair: Fix reschedule which is generated on throttled cfs_rq
  2014-08-06  8:06 ` [PATCH v4 1/6] sched/fair: Fix reschedule which is generated on throttled cfs_rq Kirill Tkhai
@ 2014-08-20  8:20   ` tip-bot for Kirill Tkhai
  2014-10-23 23:27   ` [PATCH v4 1/6] " Wanpeng Li
  1 sibling, 0 replies; 21+ messages in thread
From: tip-bot for Kirill Tkhai @ 2014-08-20  8:20 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: linux-kernel, ktkhai, hpa, mingo, torvalds, peterz, tglx

Commit-ID:  f36c019c79edb3a89920afae1b2b45987af1a112
Gitweb:     http://git.kernel.org/tip/f36c019c79edb3a89920afae1b2b45987af1a112
Author:     Kirill Tkhai <ktkhai@parallels.com>
AuthorDate: Wed, 6 Aug 2014 12:06:01 +0400
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Wed, 20 Aug 2014 09:47:20 +0200

sched/fair: Fix reschedule which is generated on throttled cfs_rq

(sched_entity::on_rq == 1) does not guarantee the task is pickable;
changes on throttled cfs_rq must not lead to reschedule.

Check for task_struct::on_rq instead.

Signed-off-by: Kirill Tkhai <ktkhai@parallels.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/1407312361.8424.35.camel@tkhai
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1413c44..bc37bb9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7494,7 +7494,7 @@ static void task_fork_fair(struct task_struct *p)
 static void
 prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
 {
-	if (!p->se.on_rq)
+	if (!p->on_rq)
 		return;
 
 	/*
@@ -7550,15 +7550,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
  */
 static void switched_to_fair(struct rq *rq, struct task_struct *p)
 {
-	struct sched_entity *se = &p->se;
 #ifdef CONFIG_FAIR_GROUP_SCHED
+	struct sched_entity *se = &p->se;
 	/*
 	 * Since the real-depth could have been changed (only FAIR
 	 * class maintain depth value), reset depth properly.
 	 */
 	se->depth = se->parent ? se->parent->depth + 1 : 0;
 #endif
-	if (!se->on_rq)
+	if (!p->on_rq)
 		return;
 
 	/*

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH v4 1/6] sched/fair: Fix reschedule which is generated on throttled cfs_rq
  2014-08-06  8:06 ` [PATCH v4 1/6] sched/fair: Fix reschedule which is generated on throttled cfs_rq Kirill Tkhai
  2014-08-20  8:20   ` [tip:sched/core] " tip-bot for Kirill Tkhai
@ 2014-10-23 23:27   ` Wanpeng Li
  2014-10-24  6:01     ` Kirill Tkhai
  1 sibling, 1 reply; 21+ messages in thread
From: Wanpeng Li @ 2014-10-23 23:27 UTC (permalink / raw)
  To: Kirill Tkhai, linux-kernel
  Cc: peterz, pjt, oleg, rostedt, umgwanakikbuti, tkhai, tim.c.chen,
	mingo, nicolas.pitre

Hi Kirill,
8/6/14, 4:06 PM, Kirill Tkhai:
> (sched_entity::on_rq == 1) does not guarantee the task is pickable;
> changes on throttled cfs_rq must not lead to reschedule.

Why doesn't (sched_entity::on_rq == 1) guarantee the task is pickable,
given that the entity is dequeued when the cfs_rq is throttled?

>
> Check for task_struct::on_rq instead.

Do you mean task_struct::on_rq is cleared when the cfs_rq is throttled?
I can't find any code that does this.

Regards,
Wanpeng Li

>
> Signed-off-by: Kirill Tkhai <ktkhai@parallels.com>
> ---
>   kernel/sched/fair.c |    6 +++---
>   1 file changed, 3 insertions(+), 3 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index bfa3c86..6f0ce2b 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -7465,7 +7465,7 @@ static void task_fork_fair(struct task_struct *p)
>   static void
>   prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
>   {
> -	if (!p->se.on_rq)
> +	if (!p->on_rq)
>   		return;
>   
>   	/*
> @@ -7521,15 +7521,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
>    */
>   static void switched_to_fair(struct rq *rq, struct task_struct *p)
>   {
> -	struct sched_entity *se = &p->se;
>   #ifdef CONFIG_FAIR_GROUP_SCHED
> +	struct sched_entity *se = &p->se;
>   	/*
>   	 * Since the real-depth could have been changed (only FAIR
>   	 * class maintain depth value), reset depth properly.
>   	 */
>   	se->depth = se->parent ? se->parent->depth + 1 : 0;
>   #endif
> -	if (!se->on_rq)
> +	if (!p->on_rq)
>   		return;
>   
>   	/*
>
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v4 1/6] sched/fair: Fix reschedule which is generated on throttled cfs_rq
  2014-10-23 23:27   ` [PATCH v4 1/6] " Wanpeng Li
@ 2014-10-24  6:01     ` Kirill Tkhai
  2014-10-24  9:32       ` Wanpeng Li
  0 siblings, 1 reply; 21+ messages in thread
From: Kirill Tkhai @ 2014-10-24  6:01 UTC (permalink / raw)
  To: Wanpeng Li
  Cc: Kirill Tkhai, linux-kernel, peterz, pjt, oleg, rostedt,
	umgwanakikbuti, tim.c.chen, mingo, nicolas.pitre

Hi, Wanpeng,

the commit message is confusing, I agree. Really it's just a cleanup.

On Fri, 2014-10-24 at 07:27 +0800, Wanpeng Li wrote:
> Hi Kirill,
> 8/6/14, 4:06 PM, Kirill Tkhai:
> > (sched_entity::on_rq == 1) does not guarantee the task is pickable;
> > changes on throttled cfs_rq must not lead to reschedule.
> 
> Why doesn't (sched_entity::on_rq == 1) guarantee the task is pickable,
> given that the entity is dequeued when the cfs_rq is throttled?

Because one of the task's (grand)parents in the hierarchy may be throttled
and dequeued.

But the task_struct::on_rq check doesn't guarantee this either. So just
ignore the commit message; it is wrong.
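
Illustration only (a hypothetical helper, not part of the series): with
CONFIG_FAIR_GROUP_SCHED, knowing whether a fair task is really pickable
would mean walking the whole hierarchy:

/*
 * A task's own se.on_rq can be 1 while a (grand)parent group entity
 * sits dequeued on a throttled cfs_rq.
 */
static bool task_is_really_pickable(struct task_struct *p)
{
	struct sched_entity *se = &p->se;

	for_each_sched_entity(se) {
		if (!se->on_rq)
			return false;
	}
	return true;
}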

> > Check for task_struct::on_rq instead.
> 
> Do you mean task_struct::on_rq is cleared when the cfs_rq is throttled?
> I can't find any code that does this.

No, it is not cleared. The commit message should be:
"sched: Cleanup. Check task_struct::on_rq instead of sched_entity::on_rq,
because it is the same for a task"


> Regards,
> Wanpeng Li
> 
> >
> > Signed-off-by: Kirill Tkhai <ktkhai@parallels.com>
> > ---
> >   kernel/sched/fair.c |    6 +++---
> >   1 file changed, 3 insertions(+), 3 deletions(-)
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index bfa3c86..6f0ce2b 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -7465,7 +7465,7 @@ static void task_fork_fair(struct task_struct *p)
> >   static void
> >   prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
> >   {
> > -	if (!p->se.on_rq)
> > +	if (!p->on_rq)
> >   		return;
> >   
> >   	/*
> > @@ -7521,15 +7521,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
> >    */
> >   static void switched_to_fair(struct rq *rq, struct task_struct *p)
> >   {
> > -	struct sched_entity *se = &p->se;
> >   #ifdef CONFIG_FAIR_GROUP_SCHED
> > +	struct sched_entity *se = &p->se;
> >   	/*
> >   	 * Since the real-depth could have been changed (only FAIR
> >   	 * class maintain depth value), reset depth properly.
> >   	 */
> >   	se->depth = se->parent ? se->parent->depth + 1 : 0;
> >   #endif
> > -	if (!se->on_rq)
> > +	if (!p->on_rq)
> >   		return;
> >   
> >   	/*
> >
> >
> >
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > Please read the FAQ at  http://www.tux.org/lkml/
> 



^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v4 1/6] sched/fair: Fix reschedule which is generated on throttled cfs_rq
  2014-10-24  6:01     ` Kirill Tkhai
@ 2014-10-24  9:32       ` Wanpeng Li
  2014-10-24 15:12         ` Kirill Tkhai
  0 siblings, 1 reply; 21+ messages in thread
From: Wanpeng Li @ 2014-10-24  9:32 UTC (permalink / raw)
  To: tkhai
  Cc: Kirill Tkhai, linux-kernel, peterz, pjt, oleg, rostedt,
	umgwanakikbuti, tim.c.chen, mingo, nicolas.pitre

Hi Kirill,
10/24/14, 2:01 PM, Kirill Tkhai:
> Hi, Wanpeng,
>
> the commit message is confusing, I agree. Really it's just a cleanup.
>
> On Fri, 2014-10-24 at 07:27 +0800, Wanpeng Li wrote:
>> Hi Kirill,
>> 8/6/14, 4:06 PM, Kirill Tkhai:
>>> (sched_entity::on_rq == 1) does not guarantee the task is pickable;
>>> changes on throttled cfs_rq must not lead to reschedule.
>> Why doesn't (sched_entity::on_rq == 1) guarantee the task is pickable,
>> given that the entity is dequeued when the cfs_rq is throttled?
> Because one of the task's (grand)parents in the hierarchy may be throttled
> and dequeued.
>
> But the task_struct::on_rq check doesn't guarantee this either. So just
> ignore the commit message; it is wrong.
>
>>> Check for task_struct::on_rq instead.
>> Do you mean task_struct::on_rq is cleared when the cfs_rq is throttled?
>> I can't find any code that does this.
> No, it is not cleared. The commit message should be:
> "sched: Cleanup. Check task_struct::on_rq instead of sched_entity::on_rq,
> because it is the same for a task"

If I understand right, for the fair class, sched_entity::on_rq is set/cleared
during enqueue/dequeue, while task_struct::on_rq is changed during task
migration, so I'm not sure why they are the same.

Regards,
Wanpeng Li

>
>
>> Regards,
>> Wanpeng Li
>>
>>> Signed-off-by: Kirill Tkhai <ktkhai@parallels.com>
>>> ---
>>>    kernel/sched/fair.c |    6 +++---
>>>    1 file changed, 3 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>> index bfa3c86..6f0ce2b 100644
>>> --- a/kernel/sched/fair.c
>>> +++ b/kernel/sched/fair.c
>>> @@ -7465,7 +7465,7 @@ static void task_fork_fair(struct task_struct *p)
>>>    static void
>>>    prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
>>>    {
>>> -	if (!p->se.on_rq)
>>> +	if (!p->on_rq)
>>>    		return;
>>>    
>>>    	/*
>>> @@ -7521,15 +7521,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
>>>     */
>>>    static void switched_to_fair(struct rq *rq, struct task_struct *p)
>>>    {
>>> -	struct sched_entity *se = &p->se;
>>>    #ifdef CONFIG_FAIR_GROUP_SCHED
>>> +	struct sched_entity *se = &p->se;
>>>    	/*
>>>    	 * Since the real-depth could have been changed (only FAIR
>>>    	 * class maintain depth value), reset depth properly.
>>>    	 */
>>>    	se->depth = se->parent ? se->parent->depth + 1 : 0;
>>>    #endif
>>> -	if (!se->on_rq)
>>> +	if (!p->on_rq)
>>>    		return;
>>>    
>>>    	/*
>>>
>>>
>>>
>>> --
>>> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>>> the body of a message to majordomo@vger.kernel.org
>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>> Please read the FAQ at  http://www.tux.org/lkml/
>


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v4 1/6] sched/fair: Fix reschedule which is generated on throttled cfs_rq
  2014-10-24  9:32       ` Wanpeng Li
@ 2014-10-24 15:12         ` Kirill Tkhai
  0 siblings, 0 replies; 21+ messages in thread
From: Kirill Tkhai @ 2014-10-24 15:12 UTC (permalink / raw)
  To: Wanpeng Li
  Cc: tkhai, linux-kernel, peterz, pjt, oleg, rostedt, umgwanakikbuti,
	tim.c.chen, mingo, nicolas.pitre

On Fri, 24/10/2014 at 17:32 +0800, Wanpeng Li wrote:
> Hi Kirill,
> 10/24/14, 2:01 PM, Kirill Tkhai:
> > Hi, Wanpeng,
> >
> > the commit message is confusing, I agree. Really it's just a cleanup.
> >
> > On Fri, 2014-10-24 at 07:27 +0800, Wanpeng Li wrote:
> >> Hi Kirill,
> >> 8/6/14, 4:06 PM, Kirill Tkhai:
> >>> (sched_entity::on_rq == 1) does not guarantee the task is pickable;
> >>> changes on throttled cfs_rq must not lead to reschedule.
> >> Why doesn't (sched_entity::on_rq == 1) guarantee the task is pickable,
> >> given that the entity is dequeued when the cfs_rq is throttled?
> > Because one of the task's (grand)parents in the hierarchy may be throttled
> > and dequeued.
> >
> > But the task_struct::on_rq check doesn't guarantee this either. So just
> > ignore the commit message; it is wrong.
> >
> >>> Check for task_struct::on_rq instead.
> >> Do you mean task_struct::on_rq is cleared when the cfs_rq is throttled?
> >> I can't find any code that does this.
> > No, it is not cleared. The commit message should be:
> > "sched: Cleanup. Check task_struct::on_rq instead of sched_entity::on_rq,
> > because it is the same for a task"
> 
> If I understand right, for the fair class, sched_entity::on_rq is set/cleared
> during enqueue/dequeue, while task_struct::on_rq is changed during task
> migration, so I'm not sure why they are the same.

prio_changed_fair() and switched_to_fair() can't be called during migration.
They are called under pi_lock, and migration needs this lock too.
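
Roughly, the call path looks like this (a simplified sketch, not the exact
kernel code), so it serializes with anything else that goes through
task_rq_lock():

static void change_class_sketch(struct task_struct *p, int oldprio)
{
	unsigned long flags;
	struct rq *rq;

	/* task_rq_lock() takes p->pi_lock and then task_rq(p)->lock. */
	rq = task_rq_lock(p, &flags);

	p->sched_class->prio_changed(rq, p, oldprio);

	task_rq_unlock(rq, p, &flags);
}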

> >>> Signed-off-by: Kirill Tkhai <ktkhai@parallels.com>
> >>> ---
> >>>    kernel/sched/fair.c |    6 +++---
> >>>    1 file changed, 3 insertions(+), 3 deletions(-)
> >>>
> >>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> >>> index bfa3c86..6f0ce2b 100644
> >>> --- a/kernel/sched/fair.c
> >>> +++ b/kernel/sched/fair.c
> >>> @@ -7465,7 +7465,7 @@ static void task_fork_fair(struct task_struct *p)
> >>>    static void
> >>>    prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
> >>>    {
> >>> -	if (!p->se.on_rq)
> >>> +	if (!p->on_rq)
> >>>    		return;
> >>>    
> >>>    	/*
> >>> @@ -7521,15 +7521,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
> >>>     */
> >>>    static void switched_to_fair(struct rq *rq, struct task_struct *p)
> >>>    {
> >>> -	struct sched_entity *se = &p->se;
> >>>    #ifdef CONFIG_FAIR_GROUP_SCHED
> >>> +	struct sched_entity *se = &p->se;
> >>>    	/*
> >>>    	 * Since the real-depth could have been changed (only FAIR
> >>>    	 * class maintain depth value), reset depth properly.
> >>>    	 */
> >>>    	se->depth = se->parent ? se->parent->depth + 1 : 0;
> >>>    #endif
> >>> -	if (!se->on_rq)
> >>> +	if (!p->on_rq)
> >>>    		return;
> >>>    
> >>>    	/*
> >>>
> >>>
> >>>
> >>> --
> >>> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> >>> the body of a message to majordomo@vger.kernel.org
> >>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> >>> Please read the FAQ at  http://www.tux.org/lkml/
> >
> 



^ permalink raw reply	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2014-10-24 15:12 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <20140806075138.24858.23816.stgit@tkhai>
2014-08-06  8:06 ` [PATCH v4 1/6] sched/fair: Fix reschedule which is generated on throttled cfs_rq Kirill Tkhai
2014-08-20  8:20   ` [tip:sched/core] " tip-bot for Kirill Tkhai
2014-10-23 23:27   ` [PATCH v4 1/6] " Wanpeng Li
2014-10-24  6:01     ` Kirill Tkhai
2014-10-24  9:32       ` Wanpeng Li
2014-10-24 15:12         ` Kirill Tkhai
2014-08-06  8:06 ` [PATCH v4 2/6] sched: Wrapper for checking task_struct::on_rq Kirill Tkhai
2014-08-20  7:52   ` Ingo Molnar
2014-08-06  8:06 ` [PATCH v4 3/6] sched: Teach scheduler to understand ONRQ_MIGRATING state Kirill Tkhai
2014-08-12  7:55   ` Peter Zijlstra
2014-08-12  8:34     ` Kirill Tkhai
2014-08-12  9:43       ` Peter Zijlstra
2014-08-06  8:06 ` [PATCH v4 4/6] sched: Remove double_rq_lock() from __migrate_task() Kirill Tkhai
2014-08-12  8:21   ` Peter Zijlstra
2014-08-06  8:06 ` [PATCH v4 5/6] sched/fair: Remove double_lock_balance() from active_load_balance_cpu_stop() Kirill Tkhai
2014-08-12  9:03   ` Peter Zijlstra
2014-08-12  9:22   ` Peter Zijlstra
2014-08-12  9:39     ` Kirill Tkhai
2014-08-06  8:07 ` [PATCH v4 6/6] sched/fair: Remove double_lock_balance() from load_balance() Kirill Tkhai
2014-08-12  9:36   ` Peter Zijlstra
2014-08-12 10:27     ` Kirill Tkhai
