* [PATCH 1/5] sched: Wrapper for checking task_struct's .on_rq
       [not found] <20140722102425.29682.24086.stgit@tkhai>
@ 2014-07-22 11:30 ` Kirill Tkhai
  2014-07-22 11:30 ` [PATCH 2/5] sched: Teach scheduler to understand ONRQ_MIGRATING state Kirill Tkhai
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 13+ messages in thread
From: Kirill Tkhai @ 2014-07-22 11:30 UTC (permalink / raw)
  To: linux-kernel
  Cc: Peter Zijlstra, Mike Galbraith, Steven Rostedt, Tim Chen,
	Nicolas Pitre, Ingo Molnar, Paul Turner, tkhai


Use task_queued() everywhere instead of the raw .on_rq check.
No functional changes.

The only exception is that we do not use the wrapper in check_for_tasks()
in kernel/cpu.c, because that would require exporting task_queued() in
global header files. The next patch in the series will return to it, so it
doesn't matter.
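
For reference, the new helper boils down to this (a sketch; the real hunk is
in the kernel/sched/sched.h part of the diff below):

	/* .on_rq states of struct task_struct: */
	#define ONRQ_QUEUED	1

	static inline int task_queued(struct task_struct *p)
	{
		return p->on_rq == ONRQ_QUEUED;
	}

so a raw test like "if (p->on_rq)" becomes "if (task_queued(p))", which keeps
its meaning once further .on_rq states are added later in the series.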

Signed-off-by: Kirill Tkhai <ktkhai@parallels.com>
---
 kernel/sched/core.c      |   82 +++++++++++++++++++++++-----------------------
 kernel/sched/deadline.c  |   14 ++++----
 kernel/sched/fair.c      |   22 ++++++------
 kernel/sched/rt.c        |   16 ++++-----
 kernel/sched/sched.h     |    7 ++++
 kernel/sched/stop_task.c |    2 +
 6 files changed, 75 insertions(+), 68 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7bc599d..205f99a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1043,7 +1043,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
 	 * A queue event has occurred, and we're going to schedule.  In
 	 * this case, we can save a useless back to back clock update.
 	 */
-	if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
+	if (task_queued(rq->curr) && test_tsk_need_resched(rq->curr))
 		rq->skip_clock_update = 1;
 }
 
@@ -1088,7 +1088,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 
 static void __migrate_swap_task(struct task_struct *p, int cpu)
 {
-	if (p->on_rq) {
+	if (task_queued(p)) {
 		struct rq *src_rq, *dst_rq;
 
 		src_rq = task_rq(p);
@@ -1214,7 +1214,7 @@ static int migration_cpu_stop(void *data);
 unsigned long wait_task_inactive(struct task_struct *p, long match_state)
 {
 	unsigned long flags;
-	int running, on_rq;
+	int running, queued;
 	unsigned long ncsw;
 	struct rq *rq;
 
@@ -1252,7 +1252,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
 		rq = task_rq_lock(p, &flags);
 		trace_sched_wait_task(p);
 		running = task_running(rq, p);
-		on_rq = p->on_rq;
+		queued = task_queued(p);
 		ncsw = 0;
 		if (!match_state || p->state == match_state)
 			ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
@@ -1284,7 +1284,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
 		 * running right now), it's preempted, and we should
 		 * yield - it could be a while.
 		 */
-		if (unlikely(on_rq)) {
+		if (unlikely(queued)) {
 			ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
 
 			set_current_state(TASK_UNINTERRUPTIBLE);
@@ -1478,7 +1478,7 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
 static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
 {
 	activate_task(rq, p, en_flags);
-	p->on_rq = 1;
+	p->on_rq = ONRQ_QUEUED;
 
 	/* if a worker is waking up, notify workqueue */
 	if (p->flags & PF_WQ_WORKER)
@@ -1537,7 +1537,7 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
 	int ret = 0;
 
 	rq = __task_rq_lock(p);
-	if (p->on_rq) {
+	if (task_queued(p)) {
 		/* check_preempt_curr() may use rq clock */
 		update_rq_clock(rq);
 		ttwu_do_wakeup(rq, p, wake_flags);
@@ -1678,7 +1678,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	success = 1; /* we're going to change ->state */
 	cpu = task_cpu(p);
 
-	if (p->on_rq && ttwu_remote(p, wake_flags))
+	if (task_queued(p) && ttwu_remote(p, wake_flags))
 		goto stat;
 
 #ifdef CONFIG_SMP
@@ -1742,7 +1742,7 @@ static void try_to_wake_up_local(struct task_struct *p)
 	if (!(p->state & TASK_NORMAL))
 		goto out;
 
-	if (!p->on_rq)
+	if (!task_queued(p))
 		ttwu_activate(rq, p, ENQUEUE_WAKEUP);
 
 	ttwu_do_wakeup(rq, p, 0);
@@ -2095,7 +2095,7 @@ void wake_up_new_task(struct task_struct *p)
 	init_task_runnable_average(p);
 	rq = __task_rq_lock(p);
 	activate_task(rq, p, 0);
-	p->on_rq = 1;
+	p->on_rq = ONRQ_QUEUED;
 	trace_sched_wakeup_new(p, true);
 	check_preempt_curr(rq, p, WF_FORK);
 #ifdef CONFIG_SMP
@@ -2444,7 +2444,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
 	 * project cycles that may never be accounted to this
 	 * thread, breaking clock_gettime().
 	 */
-	if (task_current(rq, p) && p->on_rq) {
+	if (task_current(rq, p) && task_queued(p)) {
 		update_rq_clock(rq);
 		ns = rq_clock_task(rq) - p->se.exec_start;
 		if ((s64)ns < 0)
@@ -2490,7 +2490,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
 	 * If we see ->on_cpu without ->on_rq, the task is leaving, and has
 	 * been accounted, so we're correct here as well.
 	 */
-	if (!p->on_cpu || !p->on_rq)
+	if (!p->on_cpu || !task_queued(p))
 		return p->se.sum_exec_runtime;
 #endif
 
@@ -2794,7 +2794,7 @@ static void __sched __schedule(void)
 		switch_count = &prev->nvcsw;
 	}
 
-	if (prev->on_rq || rq->skip_clock_update < 0)
+	if (task_queued(prev) || rq->skip_clock_update < 0)
 		update_rq_clock(rq);
 
 	next = pick_next_task(rq, prev);
@@ -2959,7 +2959,7 @@ EXPORT_SYMBOL(default_wake_function);
  */
 void rt_mutex_setprio(struct task_struct *p, int prio)
 {
-	int oldprio, on_rq, running, enqueue_flag = 0;
+	int oldprio, queued, running, enqueue_flag = 0;
 	struct rq *rq;
 	const struct sched_class *prev_class;
 
@@ -2988,9 +2988,9 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 	trace_sched_pi_setprio(p, prio);
 	oldprio = p->prio;
 	prev_class = p->sched_class;
-	on_rq = p->on_rq;
+	queued = task_queued(p);
 	running = task_current(rq, p);
-	if (on_rq)
+	if (queued)
 		dequeue_task(rq, p, 0);
 	if (running)
 		p->sched_class->put_prev_task(rq, p);
@@ -3030,7 +3030,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 
 	if (running)
 		p->sched_class->set_curr_task(rq);
-	if (on_rq)
+	if (queued)
 		enqueue_task(rq, p, enqueue_flag);
 
 	check_class_changed(rq, p, prev_class, oldprio);
@@ -3041,7 +3041,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 
 void set_user_nice(struct task_struct *p, long nice)
 {
-	int old_prio, delta, on_rq;
+	int old_prio, delta, queued;
 	unsigned long flags;
 	struct rq *rq;
 
@@ -3062,8 +3062,8 @@ void set_user_nice(struct task_struct *p, long nice)
 		p->static_prio = NICE_TO_PRIO(nice);
 		goto out_unlock;
 	}
-	on_rq = p->on_rq;
-	if (on_rq)
+	queued = task_queued(p);
+	if (queued)
 		dequeue_task(rq, p, 0);
 
 	p->static_prio = NICE_TO_PRIO(nice);
@@ -3072,7 +3072,7 @@ void set_user_nice(struct task_struct *p, long nice)
 	p->prio = effective_prio(p);
 	delta = p->prio - old_prio;
 
-	if (on_rq) {
+	if (queued) {
 		enqueue_task(rq, p, 0);
 		/*
 		 * If the task increased its priority or is running and
@@ -3338,7 +3338,7 @@ static int __sched_setscheduler(struct task_struct *p,
 {
 	int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
 		      MAX_RT_PRIO - 1 - attr->sched_priority;
-	int retval, oldprio, oldpolicy = -1, on_rq, running;
+	int retval, oldprio, oldpolicy = -1, queued, running;
 	int policy = attr->sched_policy;
 	unsigned long flags;
 	const struct sched_class *prev_class;
@@ -3535,9 +3535,9 @@ static int __sched_setscheduler(struct task_struct *p,
 		return 0;
 	}
 
-	on_rq = p->on_rq;
+	queued = task_queued(p);
 	running = task_current(rq, p);
-	if (on_rq)
+	if (queued)
 		dequeue_task(rq, p, 0);
 	if (running)
 		p->sched_class->put_prev_task(rq, p);
@@ -3547,7 +3547,7 @@ static int __sched_setscheduler(struct task_struct *p,
 
 	if (running)
 		p->sched_class->set_curr_task(rq);
-	if (on_rq) {
+	if (queued) {
 		/*
 		 * We enqueue to tail when the priority of a task is
 		 * increased (user space view).
@@ -4564,7 +4564,7 @@ void init_idle(struct task_struct *idle, int cpu)
 	rcu_read_unlock();
 
 	rq->curr = rq->idle = idle;
-	idle->on_rq = 1;
+	idle->on_rq = ONRQ_QUEUED;
 #if defined(CONFIG_SMP)
 	idle->on_cpu = 1;
 #endif
@@ -4641,7 +4641,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
 		goto out;
 
 	dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
-	if (p->on_rq) {
+	if (task_queued(p)) {
 		struct migration_arg arg = { p, dest_cpu };
 		/* Need help from migration thread: drop lock and wait. */
 		task_rq_unlock(rq, p, &flags);
@@ -4691,7 +4691,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
 	 * If we're not on a rq, the next wake-up will ensure we're
 	 * placed properly.
 	 */
-	if (p->on_rq) {
+	if (task_queued(p)) {
 		dequeue_task(rq_src, p, 0);
 		set_task_cpu(p, dest_cpu);
 		enqueue_task(rq_dest, p, 0);
@@ -4732,13 +4732,13 @@ void sched_setnuma(struct task_struct *p, int nid)
 {
 	struct rq *rq;
 	unsigned long flags;
-	bool on_rq, running;
+	bool queued, running;
 
 	rq = task_rq_lock(p, &flags);
-	on_rq = p->on_rq;
+	queued = task_queued(p);
 	running = task_current(rq, p);
 
-	if (on_rq)
+	if (queued)
 		dequeue_task(rq, p, 0);
 	if (running)
 		p->sched_class->put_prev_task(rq, p);
@@ -4747,7 +4747,7 @@ void sched_setnuma(struct task_struct *p, int nid)
 
 	if (running)
 		p->sched_class->set_curr_task(rq);
-	if (on_rq)
+	if (queued)
 		enqueue_task(rq, p, 0);
 	task_rq_unlock(rq, p, &flags);
 }
@@ -7099,13 +7099,13 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
 		.sched_policy = SCHED_NORMAL,
 	};
 	int old_prio = p->prio;
-	int on_rq;
+	int queued;
 
-	on_rq = p->on_rq;
-	if (on_rq)
+	queued = task_queued(p);
+	if (queued)
 		dequeue_task(rq, p, 0);
 	__setscheduler(rq, p, &attr);
-	if (on_rq) {
+	if (queued) {
 		enqueue_task(rq, p, 0);
 		resched_curr(rq);
 	}
@@ -7293,16 +7293,16 @@ void sched_offline_group(struct task_group *tg)
 void sched_move_task(struct task_struct *tsk)
 {
 	struct task_group *tg;
-	int on_rq, running;
+	int queued, running;
 	unsigned long flags;
 	struct rq *rq;
 
 	rq = task_rq_lock(tsk, &flags);
 
 	running = task_current(rq, tsk);
-	on_rq = tsk->on_rq;
+	queued = task_queued(tsk);
 
-	if (on_rq)
+	if (queued)
 		dequeue_task(rq, tsk, 0);
 	if (unlikely(running))
 		tsk->sched_class->put_prev_task(rq, tsk);
@@ -7315,14 +7315,14 @@ void sched_move_task(struct task_struct *tsk)
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	if (tsk->sched_class->task_move_group)
-		tsk->sched_class->task_move_group(tsk, on_rq);
+		tsk->sched_class->task_move_group(tsk, queued);
 	else
 #endif
 		set_task_rq(tsk, task_cpu(tsk));
 
 	if (unlikely(running))
 		tsk->sched_class->set_curr_task(rq);
-	if (on_rq)
+	if (queued)
 		enqueue_task(rq, tsk, 0);
 
 	task_rq_unlock(rq, tsk, &flags);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 255ce13..4cc3b14 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -530,7 +530,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
 	update_rq_clock(rq);
 	dl_se->dl_throttled = 0;
 	dl_se->dl_yielded = 0;
-	if (p->on_rq) {
+	if (task_queued(p)) {
 		enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
 		if (task_has_dl_policy(rq->curr))
 			check_preempt_curr_dl(rq, p, 0);
@@ -1030,7 +1030,7 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
 		 * means a stop task can slip in, in which case we need to
 		 * re-start task selection.
 		 */
-		if (rq->stop && rq->stop->on_rq)
+		if (rq->stop && task_queued(rq->stop))
 			return RETRY_TASK;
 	}
 
@@ -1257,7 +1257,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
 			if (unlikely(task_rq(task) != rq ||
 				     !cpumask_test_cpu(later_rq->cpu,
 				                       &task->cpus_allowed) ||
-				     task_running(rq, task) || !task->on_rq)) {
+				     task_running(rq, task) || !task_queued(task))) {
 				double_unlock_balance(rq, later_rq);
 				later_rq = NULL;
 				break;
@@ -1296,7 +1296,7 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
 	BUG_ON(task_current(rq, p));
 	BUG_ON(p->nr_cpus_allowed <= 1);
 
-	BUG_ON(!p->on_rq);
+	BUG_ON(!task_queued(p));
 	BUG_ON(!dl_task(p));
 
 	return p;
@@ -1443,7 +1443,7 @@ static int pull_dl_task(struct rq *this_rq)
 		     dl_time_before(p->dl.deadline,
 				    this_rq->dl.earliest_dl.curr))) {
 			WARN_ON(p == src_rq->curr);
-			WARN_ON(!p->on_rq);
+			WARN_ON(!task_queued(p));
 
 			/*
 			 * Then we pull iff p has actually an earlier
@@ -1596,7 +1596,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
 	if (unlikely(p->dl.dl_throttled))
 		return;
 
-	if (p->on_rq && rq->curr != p) {
+	if (task_queued(p) && rq->curr != p) {
 #ifdef CONFIG_SMP
 		if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p))
 			/* Only reschedule if pushing failed */
@@ -1614,7 +1614,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
 static void prio_changed_dl(struct rq *rq, struct task_struct *p,
 			    int oldprio)
 {
-	if (p->on_rq || rq->curr == p) {
+	if (task_queued(p) || rq->curr == p) {
 #ifdef CONFIG_SMP
 		/*
 		 * This might be too much, but unfortunately
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 45943b2..dd90fff 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7464,7 +7464,7 @@ static void task_fork_fair(struct task_struct *p)
 static void
 prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
 {
-	if (!p->se.on_rq)
+	if (!task_queued(p))
 		return;
 
 	/*
@@ -7489,11 +7489,11 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
 	 * switched back to the fair class the enqueue_entity(.flags=0) will
 	 * do the right thing.
 	 *
-	 * If it's on_rq, then the dequeue_entity(.flags=0) will already
-	 * have normalized the vruntime, if it's !on_rq, then only when
+	 * If it's queued, then the dequeue_entity(.flags=0) will already
+	 * have normalized the vruntime, if it's !queued, then only when
 	 * the task is sleeping will it still have non-normalized vruntime.
 	 */
-	if (!p->on_rq && p->state != TASK_RUNNING) {
+	if (!task_queued(p) && p->state != TASK_RUNNING) {
 		/*
 		 * Fix up our vruntime so that the current sleep doesn't
 		 * cause 'unlimited' sleep bonus.
@@ -7528,7 +7528,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
 	 */
 	se->depth = se->parent ? se->parent->depth + 1 : 0;
 #endif
-	if (!se->on_rq)
+	if (!task_queued(p))
 		return;
 
 	/*
@@ -7574,7 +7574,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-static void task_move_group_fair(struct task_struct *p, int on_rq)
+static void task_move_group_fair(struct task_struct *p, int queued)
 {
 	struct sched_entity *se = &p->se;
 	struct cfs_rq *cfs_rq;
@@ -7593,7 +7593,7 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
 	 * fair sleeper stuff for the first placement, but who cares.
 	 */
 	/*
-	 * When !on_rq, vruntime of the task has usually NOT been normalized.
+	 * When !queued, vruntime of the task has usually NOT been normalized.
 	 * But there are some cases where it has already been normalized:
 	 *
 	 * - Moving a forked child which is waiting for being woken up by
@@ -7604,14 +7604,14 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
 	 * To prevent boost or penalty in the new cfs_rq caused by delta
 	 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
 	 */
-	if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING))
-		on_rq = 1;
+	if (!queued && (!se->sum_exec_runtime || p->state == TASK_WAKING))
+		queued = 1;
 
-	if (!on_rq)
+	if (!queued)
 		se->vruntime -= cfs_rq_of(se)->min_vruntime;
 	set_task_rq(p, task_cpu(p));
 	se->depth = se->parent ? se->parent->depth + 1 : 0;
-	if (!on_rq) {
+	if (!queued) {
 		cfs_rq = cfs_rq_of(se);
 		se->vruntime += cfs_rq->min_vruntime;
 #ifdef CONFIG_SMP
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 5f6edca..9395320 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1448,7 +1448,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
 		 * means a dl or stop task can slip in, in which case we need
 		 * to re-start task selection.
 		 */
-		if (unlikely((rq->stop && rq->stop->on_rq) ||
+		if (unlikely((rq->stop && task_queued(rq->stop)) ||
 			     rq->dl.dl_nr_running))
 			return RETRY_TASK;
 	}
@@ -1624,7 +1624,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
 				     !cpumask_test_cpu(lowest_rq->cpu,
 						       tsk_cpus_allowed(task)) ||
 				     task_running(rq, task) ||
-				     !task->on_rq)) {
+				     !task_queued(task))) {
 
 				double_unlock_balance(rq, lowest_rq);
 				lowest_rq = NULL;
@@ -1658,7 +1658,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
 	BUG_ON(task_current(rq, p));
 	BUG_ON(p->nr_cpus_allowed <= 1);
 
-	BUG_ON(!p->on_rq);
+	BUG_ON(!task_queued(p));
 	BUG_ON(!rt_task(p));
 
 	return p;
@@ -1809,7 +1809,7 @@ static int pull_rt_task(struct rq *this_rq)
 		 */
 		if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
 			WARN_ON(p == src_rq->curr);
-			WARN_ON(!p->on_rq);
+			WARN_ON(!task_queued(p));
 
 			/*
 			 * There's a chance that p is higher in priority
@@ -1870,7 +1870,7 @@ static void set_cpus_allowed_rt(struct task_struct *p,
 
 	BUG_ON(!rt_task(p));
 
-	if (!p->on_rq)
+	if (!task_queued(p))
 		return;
 
 	weight = cpumask_weight(new_mask);
@@ -1936,7 +1936,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
 	 * we may need to handle the pulling of RT tasks
 	 * now.
 	 */
-	if (!p->on_rq || rq->rt.rt_nr_running)
+	if (!task_queued(p) || rq->rt.rt_nr_running)
 		return;
 
 	if (pull_rt_task(rq))
@@ -1970,7 +1970,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
 	 * If that current running task is also an RT task
 	 * then see if we can move to another run queue.
 	 */
-	if (p->on_rq && rq->curr != p) {
+	if (task_queued(p) && rq->curr != p) {
 #ifdef CONFIG_SMP
 		if (p->nr_cpus_allowed > 1 && rq->rt.overloaded &&
 		    /* Don't resched if we changed runqueues */
@@ -1989,7 +1989,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
 static void
 prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
 {
-	if (!p->on_rq)
+	if (!task_queued(p))
 		return;
 
 	if (rq->curr == p) {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 579712f..e5a9b6d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -15,6 +15,9 @@
 
 struct rq;
 
+/* .on_rq states of struct task_struct: */
+#define ONRQ_QUEUED	1
+
 extern __read_mostly int scheduler_running;
 
 extern unsigned long calc_load_update;
@@ -942,6 +945,10 @@ static inline int task_running(struct rq *rq, struct task_struct *p)
 #endif
 }
 
+static inline int task_queued(struct task_struct *p)
+{
+	return p->on_rq == ONRQ_QUEUED;
+}
 
 #ifndef prepare_arch_switch
 # define prepare_arch_switch(next)	do { } while (0)
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index bfe0eda..1a4bb0f 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -28,7 +28,7 @@ pick_next_task_stop(struct rq *rq, struct task_struct *prev)
 {
 	struct task_struct *stop = rq->stop;
 
-	if (!stop || !stop->on_rq)
+	if (!stop || !task_queued(stop))
 		return NULL;
 
 	put_prev_task(rq, prev);





* [PATCH 2/5] sched: Teach scheduler to understand ONRQ_MIGRATING state
       [not found] <20140722102425.29682.24086.stgit@tkhai>
  2014-07-22 11:30 ` [PATCH 1/5] sched: Wrapper for checking task_struct's .on_rq Kirill Tkhai
@ 2014-07-22 11:30 ` Kirill Tkhai
  2014-07-22 11:45   ` Peter Zijlstra
  2014-07-22 11:30 ` [PATCH 3/5] sched: Remove double_rq_lock() from __migrate_task() Kirill Tkhai
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 13+ messages in thread
From: Kirill Tkhai @ 2014-07-22 11:30 UTC (permalink / raw)
  To: linux-kernel
  Cc: Peter Zijlstra, Mike Galbraith, Steven Rostedt, Tim Chen,
	Nicolas Pitre, Ingo Molnar, Paul Turner, tkhai


This is a new on_rq state for the cases when a task is migrating
from one src_rq to another dst_rq, and the locks of both RQs
are unlocked.

We will use the state this way:

	raw_spin_lock(&src_rq->lock);
	dequeue_task(src_rq, p, 0);
	p->on_rq = ONRQ_MIGRATING;
	set_task_cpu(p, dst_cpu);
	raw_spin_unlock(&src_rq->lock);

	raw_spin_lock(&dst_rq->lock);
	p->on_rq = ONRQ_QUEUED;
	enqueue_task(dst_rq, p, 0);
	raw_spin_unlock(&dst_rq->lock);

The benefit is that double_rq_lock() is not needed now,
and this may reduce latencies in some situations.

The logic of try_to_wake_up() remained the same as it
was. Its behaviour changes in a small subset of cases
(when a preempted task in a !TASK_RUNNING state is queued
 on a rq and we are migrating it to another).
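
In other words, code that currently does a boolean ->on_rq check under the
rq->lock will now see one of three cases (an informal sketch, using the
names from this series):

	if (task_queued(p)) {
		/* Queued on this rq: dequeue/enqueue is allowed as before. */
	} else if (p->on_rq) {
		/*
		 * == ONRQ_MIGRATING: the task is owned by the migration code,
		 * which will enqueue it on the destination rq itself.
		 */
	} else {
		/* Not on any rq (e.g. sleeping, or not yet woken after fork). */
	}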

Signed-off-by: Kirill Tkhai <ktkhai@parallels.com>
---
 kernel/sched/core.c  |   25 ++++++++++++++++++-------
 kernel/sched/sched.h |    1 +
 2 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 205f99a..78388b0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1214,7 +1214,7 @@ static int migration_cpu_stop(void *data);
 unsigned long wait_task_inactive(struct task_struct *p, long match_state)
 {
 	unsigned long flags;
-	int running, queued;
+	int running, on_rq;
 	unsigned long ncsw;
 	struct rq *rq;
 
@@ -1252,7 +1252,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
 		rq = task_rq_lock(p, &flags);
 		trace_sched_wait_task(p);
 		running = task_running(rq, p);
-		queued = task_queued(p);
+		on_rq = p->on_rq;
 		ncsw = 0;
 		if (!match_state || p->state == match_state)
 			ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
@@ -1284,7 +1284,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
 		 * running right now), it's preempted, and we should
 		 * yield - it could be a while.
 		 */
-		if (unlikely(queued)) {
+		if (unlikely(on_rq)) {
 			ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
 
 			set_current_state(TASK_UNINTERRUPTIBLE);
@@ -1491,10 +1491,14 @@ static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
 static void
 ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
 {
-	check_preempt_curr(rq, p, wake_flags);
 	trace_sched_wakeup(p, true);
 
 	p->state = TASK_RUNNING;
+
+	if (!task_queued(p))
+		return;
+
+	check_preempt_curr(rq, p, wake_flags);
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_woken)
 		p->sched_class->task_woken(rq, p);
@@ -1537,7 +1541,7 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
 	int ret = 0;
 
 	rq = __task_rq_lock(p);
-	if (task_queued(p)) {
+	if (p->on_rq) {
 		/* check_preempt_curr() may use rq clock */
 		update_rq_clock(rq);
 		ttwu_do_wakeup(rq, p, wake_flags);
@@ -1678,7 +1682,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	success = 1; /* we're going to change ->state */
 	cpu = task_cpu(p);
 
-	if (task_queued(p) && ttwu_remote(p, wake_flags))
+	if (p->on_rq && ttwu_remote(p, wake_flags))
 		goto stat;
 
 #ifdef CONFIG_SMP
@@ -1693,6 +1697,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	 */
 	smp_rmb();
 
+	BUG_ON(p->on_rq);
+
 	p->sched_contributes_to_load = !!task_contributes_to_load(p);
 	p->state = TASK_WAKING;
 
@@ -4623,9 +4629,14 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
 	struct rq *rq;
 	unsigned int dest_cpu;
 	int ret = 0;
-
+again:
 	rq = task_rq_lock(p, &flags);
 
+	if (unlikely(p->on_rq) == ONRQ_MIGRATING) {
+		task_rq_unlock(rq, p, &flags);
+		goto again;
+	}
+
 	if (cpumask_equal(&p->cpus_allowed, new_mask))
 		goto out;
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e5a9b6d..9b00e9b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -17,6 +17,7 @@ struct rq;
 
 /* .on_rq states of struct task_struct: */
 #define ONRQ_QUEUED	1
+#define ONRQ_MIGRATING	2
 
 extern __read_mostly int scheduler_running;
 





* [PATCH 3/5] sched: Remove double_rq_lock() from __migrate_task()
       [not found] <20140722102425.29682.24086.stgit@tkhai>
  2014-07-22 11:30 ` [PATCH 1/5] sched: Wrapper for checking task_struct's .on_rq Kirill Tkhai
  2014-07-22 11:30 ` [PATCH 2/5] sched: Teach scheduler to understand ONRQ_MIGRATING state Kirill Tkhai
@ 2014-07-22 11:30 ` Kirill Tkhai
  2014-07-22 11:30 ` [PATCH 4/5] sched/fair: Remove double_lock_balance() from active_load_balance_cpu_stop() Kirill Tkhai
  2014-07-22 11:31 ` [PATCH 5/5] sched/fair: Remove double_lock_balance() from load_balance() Kirill Tkhai
  4 siblings, 0 replies; 13+ messages in thread
From: Kirill Tkhai @ 2014-07-22 11:30 UTC (permalink / raw)
  To: linux-kernel
  Cc: Peter Zijlstra, Mike Galbraith, Steven Rostedt, Tim Chen,
	Nicolas Pitre, Ingo Molnar, Paul Turner, tkhai


Let's use ONRQ_MIGRATING instead.
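
The resulting flow in __migrate_task() becomes (a simplified sketch of the
diff below; the real code reuses a single rq variable and has done/fail
labels):

	raw_spin_lock(&p->pi_lock);
	raw_spin_lock(&src_rq->lock);
	/* bail out if the task has already moved or the affinity changed */
	if (task_queued(p)) {
		dequeue_task(src_rq, p, 0);
		p->on_rq = ONRQ_MIGRATING;
		set_task_cpu(p, dest_cpu);
		raw_spin_unlock(&src_rq->lock);

		raw_spin_lock(&dst_rq->lock);
		p->on_rq = ONRQ_QUEUED;
		enqueue_task(dst_rq, p, 0);
		check_preempt_curr(dst_rq, p, 0);
		raw_spin_unlock(&dst_rq->lock);
	} else {
		raw_spin_unlock(&src_rq->lock);
	}
	raw_spin_unlock(&p->pi_lock);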

Signed-off-by: Kirill Tkhai <ktkhai@parallels.com>
---
 kernel/sched/core.c |   22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 78388b0..4d62deb 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4680,20 +4680,20 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
  */
 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
 {
-	struct rq *rq_dest, *rq_src;
+	struct rq *rq;
 	int ret = 0;
 
 	if (unlikely(!cpu_active(dest_cpu)))
 		return ret;
 
-	rq_src = cpu_rq(src_cpu);
-	rq_dest = cpu_rq(dest_cpu);
+	rq = cpu_rq(src_cpu);
 
 	raw_spin_lock(&p->pi_lock);
-	double_rq_lock(rq_src, rq_dest);
+	raw_spin_lock(&rq->lock);
 	/* Already moved. */
 	if (task_cpu(p) != src_cpu)
 		goto done;
+
 	/* Affinity changed (again). */
 	if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
 		goto fail;
@@ -4703,15 +4703,21 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
 	 * placed properly.
 	 */
 	if (task_queued(p)) {
-		dequeue_task(rq_src, p, 0);
+		dequeue_task(rq, p, 0);
+		p->on_rq = ONRQ_MIGRATING;
 		set_task_cpu(p, dest_cpu);
-		enqueue_task(rq_dest, p, 0);
-		check_preempt_curr(rq_dest, p, 0);
+		raw_spin_unlock(&rq->lock);
+
+		rq = cpu_rq(dest_cpu);
+		raw_spin_lock(&rq->lock);
+		p->on_rq = ONRQ_QUEUED;
+		enqueue_task(rq, p, 0);
+		check_preempt_curr(rq, p, 0);
 	}
 done:
 	ret = 1;
 fail:
-	double_rq_unlock(rq_src, rq_dest);
+	raw_spin_unlock(&rq->lock);
 	raw_spin_unlock(&p->pi_lock);
 	return ret;
 }





* [PATCH 4/5] sched/fair: Remove double_lock_balance() from active_load_balance_cpu_stop()
       [not found] <20140722102425.29682.24086.stgit@tkhai>
                   ` (2 preceding siblings ...)
  2014-07-22 11:30 ` [PATCH 3/5] sched: Remove double_rq_lock() from __migrate_task() Kirill Tkhai
@ 2014-07-22 11:30 ` Kirill Tkhai
  2014-07-25  0:04   ` Tim Chen
  2014-07-22 11:31 ` [PATCH 5/5] sched/fair: Remove double_lock_balance() from load_balance() Kirill Tkhai
  4 siblings, 1 reply; 13+ messages in thread
From: Kirill Tkhai @ 2014-07-22 11:30 UTC (permalink / raw)
  To: linux-kernel
  Cc: Peter Zijlstra, Mike Galbraith, Steven Rostedt, Tim Chen,
	Nicolas Pitre, Ingo Molnar, Paul Turner, tkhai


Bad situation:

double_lock_balance() drops busiest_rq lock. The busiest_rq is *busiest*,
and a lot of tasks and context switches there. We are dropping the lock
and waiting for it again.

Let's just detach the task and once finally unlock it!

Warning: this allows unlocked use of can_migrate_task(), throttled_lb_pair(),
and task_hot(). I added comments about that.
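
The resulting scheme in active_load_balance_cpu_stop() is roughly (a sketch;
see the diff below for the real thing):

	raw_spin_lock_irq(&busiest_rq->lock);
	...
	p = detach_one_task(&env);	/* deactivate_task() +
					 * p->on_rq = ONRQ_MIGRATING +
					 * set_task_cpu(p, env->dst_cpu) */
	...
	raw_spin_unlock(&busiest_rq->lock);

	if (p) {
		raw_spin_lock(&target_rq->lock);
		p->on_rq = ONRQ_QUEUED;
		activate_task(target_rq, p, 0);
		check_preempt_curr(target_rq, p, 0);
		raw_spin_unlock(&target_rq->lock);
	}

	local_irq_enable();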

Signed-off-by: Kirill Tkhai <ktkhai@parallels.com>
---
 kernel/sched/fair.c |   54 +++++++++++++++++++++++++++++++++++----------------
 1 file changed, 37 insertions(+), 17 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index dd90fff..cf2d2eb 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3297,6 +3297,8 @@ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
  * Ensure that neither of the group entities corresponding to src_cpu or
  * dest_cpu are members of a throttled hierarchy when performing group
  * load-balance operations.
+ *
+ * Note: RQs are not locked.
  */
 static inline int throttled_lb_pair(struct task_group *tg,
 				    int src_cpu, int dest_cpu)
@@ -5127,7 +5129,9 @@ static void move_task(struct task_struct *p, struct lb_env *env)
 }
 
 /*
- * Is this task likely cache-hot:
+ * Is this task likely cache-hot?
+ *
+ * Note: env->dst_rq is unlocked, but rcu_read_lock() is held.
  */
 static int task_hot(struct task_struct *p, struct lb_env *env)
 {
@@ -5247,6 +5251,8 @@ static inline bool migrate_degrades_locality(struct task_struct *p,
 
 /*
  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
+ *
+ * Note: env->dest_rq is not locked.
  */
 static
 int can_migrate_task(struct task_struct *p, struct lb_env *env)
@@ -5336,13 +5342,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 }
 
 /*
- * move_one_task tries to move exactly one task from busiest to this_rq, as
+ * detach_one_task tries to dequeue exactly one task from env->src_rq, as
  * part of active balancing operations within "domain".
- * Returns 1 if successful and 0 otherwise.
+ * Returns a task if successful and NULL otherwise.
  *
- * Called with both runqueues locked.
+ * Called with env->src_rq locked.
  */
-static int move_one_task(struct lb_env *env)
+static struct task_struct *detach_one_task(struct lb_env *env)
 {
 	struct task_struct *p, *n;
 
@@ -5350,16 +5356,20 @@ static int move_one_task(struct lb_env *env)
 		if (!can_migrate_task(p, env))
 			continue;
 
-		move_task(p, env);
+		deactivate_task(env->src_rq, p, 0);
+		p->on_rq = ONRQ_MIGRATING;
+		set_task_cpu(p, env->dst_cpu);
+
 		/*
-		 * Right now, this is only the second place move_task()
-		 * is called, so we can safely collect move_task()
-		 * stats here rather than inside move_task().
+		 * Right now, this is only the second place where
+		 * lb_gained[env->idle] is updated (other is move_tasks)
+		 * so we can safely collect stats here rather than
+		 * inside move_tasks().
 		 */
 		schedstat_inc(env->sd, lb_gained[env->idle]);
-		return 1;
+		return p;
 	}
-	return 0;
+	return NULL;
 }
 
 static const unsigned int sched_nr_migrate_break = 32;
@@ -6913,6 +6923,7 @@ static int active_load_balance_cpu_stop(void *data)
 	int target_cpu = busiest_rq->push_cpu;
 	struct rq *target_rq = cpu_rq(target_cpu);
 	struct sched_domain *sd;
+	struct task_struct *p = NULL;
 
 	raw_spin_lock_irq(&busiest_rq->lock);
 
@@ -6932,9 +6943,6 @@ static int active_load_balance_cpu_stop(void *data)
 	 */
 	BUG_ON(busiest_rq == target_rq);
 
-	/* move a task from busiest_rq to target_rq */
-	double_lock_balance(busiest_rq, target_rq);
-
 	/* Search for an sd spanning us and the target CPU. */
 	rcu_read_lock();
 	for_each_domain(target_cpu, sd) {
@@ -6955,16 +6963,28 @@ static int active_load_balance_cpu_stop(void *data)
 
 		schedstat_inc(sd, alb_count);
 
-		if (move_one_task(&env))
+		p = detach_one_task(&env);
+		if (p)
 			schedstat_inc(sd, alb_pushed);
 		else
 			schedstat_inc(sd, alb_failed);
 	}
 	rcu_read_unlock();
-	double_unlock_balance(busiest_rq, target_rq);
 out_unlock:
 	busiest_rq->active_balance = 0;
-	raw_spin_unlock_irq(&busiest_rq->lock);
+	raw_spin_unlock(&busiest_rq->lock);
+
+	if (p) {
+		raw_spin_lock(&target_rq->lock);
+		BUG_ON(task_rq(p) != target_rq);
+		p->on_rq = ONRQ_QUEUED;
+		activate_task(target_rq, p, 0);
+		check_preempt_curr(target_rq, p, 0);
+		raw_spin_unlock(&target_rq->lock);
+	}
+
+	local_irq_enable();
+
 	return 0;
 }
 





* [PATCH 5/5] sched/fair: Remove double_lock_balance() from load_balance()
       [not found] <20140722102425.29682.24086.stgit@tkhai>
                   ` (3 preceding siblings ...)
  2014-07-22 11:30 ` [PATCH 4/5] sched/fair: Remove double_lock_balance() from active_load_balance_cpu_stop() Kirill Tkhai
@ 2014-07-22 11:31 ` Kirill Tkhai
  4 siblings, 0 replies; 13+ messages in thread
From: Kirill Tkhai @ 2014-07-22 11:31 UTC (permalink / raw)
  To: linux-kernel
  Cc: Peter Zijlstra, Mike Galbraith, Steven Rostedt, Tim Chen,
	Nicolas Pitre, Ingo Molnar, Paul Turner, tkhai


Keep on_rq set to ONRQ_MIGRATING while the task is migrating, instead.
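
The load_balance() path then splits into a detach phase under the busiest rq
lock and an attach phase under the destination rq lock (a sketch; see the
diff below):

	raw_spin_lock_irqsave(&busiest->lock, flags);
	/* each detached task goes on env.tasks with on_rq == ONRQ_MIGRATING */
	cur_ld_moved = detach_tasks(&env);
	raw_spin_unlock(&busiest->lock);

	if (cur_ld_moved) {
		raw_spin_lock(&env.dst_rq->lock);
		attach_tasks(&env);	/* on_rq = ONRQ_QUEUED + activate_task() */
		raw_spin_unlock(&env.dst_rq->lock);
		ld_moved += cur_ld_moved;
	}

	local_irq_restore(flags);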

Signed-off-by: Kirill Tkhai <ktkhai@parallels.com>
---
 kernel/sched/fair.c |   84 +++++++++++++++++++++++++++++++++------------------
 1 file changed, 54 insertions(+), 30 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index cf2d2eb..ebab2e7 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4706,9 +4706,9 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 		return;
 
 	/*
-	 * This is possible from callers such as move_task(), in which we
-	 * unconditionally check_prempt_curr() after an enqueue (which may have
-	 * lead to a throttle).  This both saves work and prevents false
+	 * This is possible from callers, in which we unconditionally
+	 * check_prempt_curr() after an enqueue (which may have lead
+	 * to a throttle).  This both saves work and prevents false
 	 * next-buddy nomination below.
 	 */
 	if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
@@ -5114,20 +5114,22 @@ struct lb_env {
 	unsigned int		loop_max;
 
 	enum fbq_type		fbq_type;
+	struct list_head	tasks;
 };
 
 /*
- * move_task - move a task from one runqueue to another runqueue.
- * Both runqueues must be locked.
+ * detach_task - detach a task from its runqueue for migration.
+ * The runqueue must be locked.
  */
-static void move_task(struct task_struct *p, struct lb_env *env)
+static void detach_task(struct task_struct *p, struct lb_env *env)
 {
 	deactivate_task(env->src_rq, p, 0);
+	list_add(&p->se.group_node, &env->tasks);
+	p->on_rq = ONRQ_MIGRATING;
 	set_task_cpu(p, env->dst_cpu);
-	activate_task(env->dst_rq, p, 0);
-	check_preempt_curr(env->dst_rq, p, 0);
 }
 
+
 /*
  * Is this task likely cache-hot?
  *
@@ -5362,9 +5364,9 @@ static struct task_struct *detach_one_task(struct lb_env *env)
 
 		/*
 		 * Right now, this is only the second place where
-		 * lb_gained[env->idle] is updated (other is move_tasks)
+		 * lb_gained[env->idle] is updated (other is detach_tasks)
 		 * so we can safely collect stats here rather than
-		 * inside move_tasks().
+		 * inside detach_tasks().
 		 */
 		schedstat_inc(env->sd, lb_gained[env->idle]);
 		return p;
@@ -5375,18 +5377,18 @@ static struct task_struct *detach_one_task(struct lb_env *env)
 static const unsigned int sched_nr_migrate_break = 32;
 
 /*
- * move_tasks tries to move up to imbalance weighted load from busiest to
- * this_rq, as part of a balancing operation within domain "sd".
- * Returns 1 if successful and 0 otherwise.
+ * detach_tasks tries to detach up to imbalance weighted load from busiest_rq,
+ * as part of a balancing operation within domain "sd".
+ * Returns number of detached tasks if successful and 0 otherwise.
  *
- * Called with both runqueues locked.
+ * Called with env->src_rq locked.
  */
-static int move_tasks(struct lb_env *env)
+static int detach_tasks(struct lb_env *env)
 {
 	struct list_head *tasks = &env->src_rq->cfs_tasks;
 	struct task_struct *p;
 	unsigned long load;
-	int pulled = 0;
+	int detached = 0;
 
 	if (env->imbalance <= 0)
 		return 0;
@@ -5417,14 +5419,15 @@ static int move_tasks(struct lb_env *env)
 		if ((load / 2) > env->imbalance)
 			goto next;
 
-		move_task(p, env);
-		pulled++;
+		detach_task(p, env);
+
+		detached++;
 		env->imbalance -= load;
 
 #ifdef CONFIG_PREEMPT
 		/*
 		 * NEWIDLE balancing is a source of latency, so preemptible
-		 * kernels will stop after the first task is pulled to minimize
+		 * kernels will stop after the first task is detached to minimize
 		 * the critical section.
 		 */
 		if (env->idle == CPU_NEWLY_IDLE)
@@ -5444,13 +5447,27 @@ static int move_tasks(struct lb_env *env)
 	}
 
 	/*
-	 * Right now, this is one of only two places move_task() is called,
-	 * so we can safely collect move_task() stats here rather than
-	 * inside move_task().
+	 * Right now, this is one of only two places we collect this stat
+	 * so we can safely collect detach_one_task() stats here rather
+	 * than inside detach_one_task().
 	 */
-	schedstat_add(env->sd, lb_gained[env->idle], pulled);
+	schedstat_add(env->sd, lb_gained[env->idle], detached);
+
+	return detached;
+}
+
+static void attach_tasks(struct lb_env *env)
+{
+	struct list_head *tasks = &env->tasks;
+	struct task_struct *p;
+
+	while (!list_empty(tasks)) {
+		p = list_first_entry(tasks, struct task_struct, se.group_node);
+		list_del_init(&p->se.group_node);
+		p->on_rq = ONRQ_QUEUED;
+		activate_task(env->dst_rq, p, 0);
+	}
 
-	return pulled;
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -6559,6 +6576,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 		.loop_break	= sched_nr_migrate_break,
 		.cpus		= cpus,
 		.fbq_type	= all,
+		.tasks		= LIST_HEAD_INIT(env.tasks),
 	};
 
 	/*
@@ -6608,16 +6626,22 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 		env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);
 
 more_balance:
-		local_irq_save(flags);
-		double_rq_lock(env.dst_rq, busiest);
+		raw_spin_lock_irqsave(&busiest->lock, flags);
 
 		/*
 		 * cur_ld_moved - load moved in current iteration
 		 * ld_moved     - cumulative load moved across iterations
 		 */
-		cur_ld_moved = move_tasks(&env);
-		ld_moved += cur_ld_moved;
-		double_rq_unlock(env.dst_rq, busiest);
+		cur_ld_moved = detach_tasks(&env);
+		raw_spin_unlock(&busiest->lock);
+
+		if (cur_ld_moved) {
+			raw_spin_lock(&env.dst_rq->lock);
+			attach_tasks(&env);
+			raw_spin_unlock(&env.dst_rq->lock);
+			ld_moved += cur_ld_moved;
+		}
+
 		local_irq_restore(flags);
 
 		/*
@@ -6753,7 +6777,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 		 * If we've begun active balancing, start to back off. This
 		 * case may not be covered by the all_pinned logic if there
 		 * is only 1 task on the busy runqueue (because we don't call
-		 * move_tasks).
+		 * detach_tasks).
 		 */
 		if (sd->balance_interval < sd->max_interval)
 			sd->balance_interval *= 2;





* Re: [PATCH 2/5] sched: Teach scheduler to understand ONRQ_MIGRATING state
  2014-07-22 11:30 ` [PATCH 2/5] sched: Teach scheduler to understand ONRQ_MIGRATING state Kirill Tkhai
@ 2014-07-22 11:45   ` Peter Zijlstra
  2014-07-22 12:24     ` Kirill Tkhai
                       ` (2 more replies)
  0 siblings, 3 replies; 13+ messages in thread
From: Peter Zijlstra @ 2014-07-22 11:45 UTC (permalink / raw)
  To: Kirill Tkhai
  Cc: linux-kernel, Mike Galbraith, Steven Rostedt, Tim Chen,
	Nicolas Pitre, Ingo Molnar, Paul Turner, tkhai, Oleg Nesterov

On Tue, Jul 22, 2014 at 03:30:16PM +0400, Kirill Tkhai wrote:
> 
> This is a new on_rq state for the cases when a task is migrating
> from one src_rq to another dst_rq, and the locks of both RQs
> are unlocked.
> 
> We will use the state this way:
> 
> 	raw_spin_lock(&src_rq->lock);
> 	dequeue_task(src_rq, p, 0);
> 	p->on_rq = ONRQ_MIGRATING;
> 	set_task_cpu(p, dst_cpu);
> 	raw_spin_unlock(&src_rq->lock);
> 
> 	raw_spin_lock(&dst_rq->lock);
> 	p->on_rq = ONRQ_QUEUED;
> 	enqueue_task(dst_rq, p, 0);
> 	raw_spin_unlock(&dst_rq->lock);
> 
> The benefit is that double_rq_lock() is not needed now,
> and this may reduce latencies in some situations.
> 
> The logic of try_to_wake_up() remained the same as it
> was. Its behaviour changes in a small subset of cases
> (when a preempted task in a !TASK_RUNNING state is queued
>  on a rq and we are migrating it to another).

more details is better ;-) Also, I think Oleg enjoys these kind of
things, so I've added him to the CC.

A few questions, haven't really thought about things yet.

> @@ -1491,10 +1491,14 @@ static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
>  static void
>  ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
>  {
> -	check_preempt_curr(rq, p, wake_flags);
>  	trace_sched_wakeup(p, true);
>  
>  	p->state = TASK_RUNNING;
> +
> +	if (!task_queued(p))
> +		return;

How can this happen? we're in the middle of a wakeup, we've just added
the task to the rq and are still holding the appropriate rq->lock.

> @@ -4623,9 +4629,14 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
>  	struct rq *rq;
>  	unsigned int dest_cpu;
>  	int ret = 0;
> -
> +again:
>  	rq = task_rq_lock(p, &flags);
>  
> +	if (unlikely(p->on_rq) == ONRQ_MIGRATING) {
> +		task_rq_unlock(rq, p, &flags);
> +		goto again;
> +	}
> +
>  	if (cpumask_equal(&p->cpus_allowed, new_mask))
>  		goto out;
>  

That looks like a non-deterministic spin loop, 'waiting' for the
migration to finish. Not particularly nice and something I think we
should avoid for it has bad (TM) worst case behaviour.

Also, why only this site and not all task_rq_lock() sites?


* Re: [PATCH 2/5] sched: Teach scheduler to understand ONRQ_MIGRATING state
  2014-07-22 11:45   ` Peter Zijlstra
@ 2014-07-22 12:24     ` Kirill Tkhai
  2014-07-22 12:25     ` Steven Rostedt
  2014-07-24 19:03     ` Oleg Nesterov
  2 siblings, 0 replies; 13+ messages in thread
From: Kirill Tkhai @ 2014-07-22 12:24 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, Mike Galbraith, Steven Rostedt, Tim Chen,
	Nicolas Pitre, Ingo Molnar, Paul Turner, tkhai, Oleg Nesterov

On Tue, 22/07/2014 at 13:45 +0200, Peter Zijlstra wrote:
> On Tue, Jul 22, 2014 at 03:30:16PM +0400, Kirill Tkhai wrote:
> > 
> > This is a new on_rq state for the cases when a task is migrating
> > from one src_rq to another dst_rq, and the locks of both RQs
> > are unlocked.
> > 
> > We will use the state this way:
> > 
> > 	raw_spin_lock(&src_rq->lock);
> > 	dequeue_task(src_rq, p, 0);
> > 	p->on_rq = ONRQ_MIGRATING;
> > 	set_task_cpu(p, dst_cpu);
> > 	raw_spin_unlock(&src_rq->lock);
> > 
> > 	raw_spin_lock(&dst_rq->lock);
> > 	p->on_rq = ONRQ_QUEUED;
> > 	enqueue_task(dst_rq, p, 0);
> > 	raw_spin_unlock(&dst_rq->lock);
> > 
> > The benefit is that double_rq_lock() is not needed now,
> > and this may reduce latencies in some situations.
> > 
> > The logic of try_to_wake_up() remained the same as it
> > was. Its behaviour changes in a small subset of cases
> > (when a preempted task in a !TASK_RUNNING state is queued
> >  on a rq and we are migrating it to another).
> 
> more details is better ;-) Also, I think Oleg enjoys these kind of
> things, so I've added him to the CC.

try_to_wake_up() wakes tasks in particular states; nobody calls it with a
TASK_RUNNING argument. So our logic only has to deal with tasks in a
!TASK_RUNNING state.

If such a task is on a rq and in a !TASK_RUNNING state, it was preempted
via preempt_schedule{,_irq}. Being preempted in a !TASK_RUNNING state means
it was in one of the cases like:

	set_current_state(TASK_INTERRUPTIBLE);

	(actions)

	schedule();

And someone is migrating this task.

Really small subset of cases :)

> A few questions, haven't really thought about things yet.
> 
> > @@ -1491,10 +1491,14 @@ static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
> >  static void
> >  ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
> >  {
> > -	check_preempt_curr(rq, p, wake_flags);
> >  	trace_sched_wakeup(p, true);
> >  
> >  	p->state = TASK_RUNNING;
> > +
> > +	if (!task_queued(p))
> > +		return;
> 
> How can this happen? we're in the middle of a wakeup, we're just added
> the task to the rq and are still holding the appropriate rq->lock.

try_to_wake_up()->ttwu_remote()->ttwu_do_wakeup():

The task is migrating at the moment, and the only thing we do is change
its state to TASK_RUNNING. It will be queued with the new state
(TASK_RUNNING) by the code that is migrating it.

This situation is generally not very common. It only happens when we're
migrating a task which was preempted in a !TASK_RUNNING state.
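
Roughly, the interleaving in question is (a sketch):

	CPU0 (migration path)		CPU1 (try_to_wake_up(p))

	lock(src_rq->lock)
	dequeue_task(src_rq, p, 0)
	p->on_rq = ONRQ_MIGRATING
	set_task_cpu(p, dst_cpu)
	unlock(src_rq->lock)
					lock p's rq
					ttwu_remote(): p->on_rq != 0
					  ttwu_do_wakeup():
					    p->state = TASK_RUNNING
					    !task_queued(p) -> return
					unlock p's rq
	lock(dst_rq->lock)
	p->on_rq = ONRQ_QUEUED
	enqueue_task(dst_rq, p, 0)	/* p is already TASK_RUNNING */
	unlock(dst_rq->lock)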


> > @@ -4623,9 +4629,14 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
> >  	struct rq *rq;
> >  	unsigned int dest_cpu;
> >  	int ret = 0;
> > -
> > +again:
> >  	rq = task_rq_lock(p, &flags);
> >  
> > +	if (unlikely(p->on_rq) == ONRQ_MIGRATING) {
> > +		task_rq_unlock(rq, p, &flags);
> > +		goto again;
> > +	}
> > +
> >  	if (cpumask_equal(&p->cpus_allowed, new_mask))
> >  		goto out;
> >  
> 
> That looks like a non-deterministic spin loop, 'waiting' for the
> migration to finish. Not particularly nice and something I think we
> should avoid for it has bad (TM) worst case behaviour.
> 
> Also, why only this site and not all task_rq_lock() sites?

All other places test for task_queued(p) under the rq's lock. I went through
all of them and did not find a place that needs it. For example,
rt_mutex_setprio() enqueues only if the task was really queued before.
Patch [1/5] made all the preparation for that. Hopefully, nothing was
skipped by me.
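
Concretely, after patch [1/5], rt_mutex_setprio() does (trimmed):

	queued = task_queued(p);	/* false while on_rq == ONRQ_MIGRATING */
	running = task_current(rq, p);
	if (queued)
		dequeue_task(rq, p, 0);
	...
	p->prio = prio;
	...
	if (queued)
		enqueue_task(rq, p, enqueue_flag);

so a migrating task is neither dequeued nor enqueued here; it just gets its
new prio and is enqueued later by the code that is migrating it.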

About set_cpus_allowed_ptr(): we could return -EAGAIN if a task is
migrating, but set_cpus_allowed_ptr() must not fail on kernel threads.
We would miss something important that way, softirq affinity for example.

Going to the 'again' label looks like a manual spinlock to me. We used to
spin there before: the time the lock was held was similar, and we also
competed with the other users of the rq's lock. The lock was simply held
the whole time, so we didn't have the overhead of repeated spin_{lock,unlock}
here.

I don't see what to do instead.

Thanks,
Kirill



* Re: [PATCH 2/5] sched: Teach scheduler to understand ONRQ_MIGRATING state
  2014-07-22 11:45   ` Peter Zijlstra
  2014-07-22 12:24     ` Kirill Tkhai
@ 2014-07-22 12:25     ` Steven Rostedt
  2014-07-22 13:20       ` Kirill Tkhai
  2014-07-24 19:03     ` Oleg Nesterov
  2 siblings, 1 reply; 13+ messages in thread
From: Steven Rostedt @ 2014-07-22 12:25 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Kirill Tkhai, linux-kernel, Mike Galbraith, Tim Chen,
	Nicolas Pitre, Ingo Molnar, Paul Turner, tkhai, Oleg Nesterov

On Tue, 22 Jul 2014 13:45:42 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

 
> > @@ -1491,10 +1491,14 @@ static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
> >  static void
> >  ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
> >  {
> > -	check_preempt_curr(rq, p, wake_flags);
> >  	trace_sched_wakeup(p, true);
> >  
> >  	p->state = TASK_RUNNING;
> > +
> > +	if (!task_queued(p))
> > +		return;
> 
> How can this happen? we're in the middle of a wakeup, we've just added
> the task to the rq and are still holding the appropriate rq->lock.

I believe it can be in the migrating state. A comment would be useful
here.

> 
> > @@ -4623,9 +4629,14 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
> >  	struct rq *rq;
> >  	unsigned int dest_cpu;
> >  	int ret = 0;
> > -
> > +again:
> >  	rq = task_rq_lock(p, &flags);
> >  
> > +	if (unlikely(p->on_rq) == ONRQ_MIGRATING) {
> > +		task_rq_unlock(rq, p, &flags);
> > +		goto again;
> > +	}
> > +
> >  	if (cpumask_equal(&p->cpus_allowed, new_mask))
> >  		goto out;
> >  
> 
> That looks like a non-deterministic spin loop, 'waiting' for the
> migration to finish. Not particularly nice and something I think we
> should avoid for it has bad (TM) worst case behaviour.

As this patch doesn't introduce the MIGRATING getting set yet, I'd be
interested in this too. I'm assuming that the MIGRATING flag is only
set and then cleared within an interrupts disabled section, such that
the time is no more than a spinlock being taken.

I would also add a cpu_relax() there too.

> 
> Also, why only this site and not all task_rq_lock() sites?

I'm assuming that it's because set_cpus_allowed_ptr() is supposed to
return with the task already migrated to the CPUs it is allowed on, and
not before.

-- Steve


* Re: [PATCH 2/5] sched: Teach scheduler to understand ONRQ_MIGRATING state
  2014-07-22 12:25     ` Steven Rostedt
@ 2014-07-22 13:20       ` Kirill Tkhai
  0 siblings, 0 replies; 13+ messages in thread
From: Kirill Tkhai @ 2014-07-22 13:20 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Peter Zijlstra, linux-kernel, Mike Galbraith, Tim Chen,
	Nicolas Pitre, Ingo Molnar, Paul Turner, tkhai, Oleg Nesterov

On Tue, 22/07/2014 at 08:25 -0400, Steven Rostedt wrote:
> On Tue, 22 Jul 2014 13:45:42 +0200
> Peter Zijlstra <peterz@infradead.org> wrote:
> 
>  
> > > @@ -1491,10 +1491,14 @@ static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
> > >  static void
> > >  ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
> > >  {
> > > -	check_preempt_curr(rq, p, wake_flags);
> > >  	trace_sched_wakeup(p, true);
> > >  
> > >  	p->state = TASK_RUNNING;
> > > +
> > > +	if (!task_queued(p))
> > > +		return;
> > 
> > How can this happen? we're in the middle of a wakeup, we've just added
> > the task to the rq and are still holding the appropriate rq->lock.
> 
> I believe it can be in the migrating state. A comment would be useful
> here.

Sure, I'll update. Stupid question: should I resend the whole series, or is
one message in this thread enough?

> 
> > > @@ -4623,9 +4629,14 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
> > >  	struct rq *rq;
> > >  	unsigned int dest_cpu;
> > >  	int ret = 0;
> > > -
> > > +again:
> > >  	rq = task_rq_lock(p, &flags);
> > >  
> > > +	if (unlikely(p->on_rq) == ONRQ_MIGRATING) {
> > > +		task_rq_unlock(rq, p, &flags);
> > > +		goto again;
> > > +	}
> > > +
> > >  	if (cpumask_equal(&p->cpus_allowed, new_mask))
> > >  		goto out;
> > >  
> > 
> > That looks like a non-deterministic spin loop, 'waiting' for the
> > migration to finish. Not particularly nice and something I think we
> > should avoid for it has bad (TM) worst case behaviour.
> 
> As this patch doesn't introduce the MIGRATING getting set yet, I'd be
> interested in this too. I'm assuming that the MIGRATING flag is only
> set and then cleared within an interrupts disabled section, such that
> the time is no more than a spinlock being taken.
> 
> I would also add a cpu_relax() there too.

Sadly, I didn't completely understand what you mean. Could you please explain
what has to be changed?

(I see the wrongly placed unlikely(). It's an error. Is the other thing that
there is no task_migrating() method?)
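
Something like this, then (an untested sketch, with the unlikely() fixed, a
task_migrating() helper added and a cpu_relax() in the loop)?

	/* kernel/sched/sched.h */
	static inline int task_migrating(struct task_struct *p)
	{
		return p->on_rq == ONRQ_MIGRATING;
	}

	/* kernel/sched/core.c: set_cpus_allowed_ptr() */
	again:
		rq = task_rq_lock(p, &flags);

		if (unlikely(task_migrating(p))) {
			task_rq_unlock(rq, p, &flags);
			cpu_relax();
			goto again;
		}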


Thanks,
Kirill



* Re: [PATCH 2/5] sched: Teach scheduler to understand ONRQ_MIGRATING state
  2014-07-22 11:45   ` Peter Zijlstra
  2014-07-22 12:24     ` Kirill Tkhai
  2014-07-22 12:25     ` Steven Rostedt
@ 2014-07-24 19:03     ` Oleg Nesterov
  2014-07-25  7:11       ` Kirill Tkhai
  2 siblings, 1 reply; 13+ messages in thread
From: Oleg Nesterov @ 2014-07-24 19:03 UTC (permalink / raw)
  To: Peter Zijlstra, Kirill Tkhai
  Cc: linux-kernel, Mike Galbraith, Steven Rostedt, Tim Chen,
	Nicolas Pitre, Ingo Molnar, Paul Turner, tkhai

Oh, sorry for the delay, I didn't even try to read most of my emails for
several days.

On 07/22, Peter Zijlstra wrote:
>
> more details is better ;-) Also, I think Oleg enjoys these kind of
> things, so I've added him to the CC.

Thanks. Trust me, at least I like this much more than what I had to
do this week ;)

This change depends on the previous patches and I am too lazy to try
to find/download the full series.

Kirill, since you are going to send v2, could you cc me? No, it is not
that I think I can help or even review (or maybe even understand ;)
But this looks interesting to me, I'd like to know about these changes.

Oleg.



* Re: [PATCH 4/5] sched/fair: Remove double_lock_balance() from active_load_balance_cpu_stop()
  2014-07-22 11:30 ` [PATCH 4/5] sched/fair: Remove double_lock_balance() from active_load_balance_cpu_stop() Kirill Tkhai
@ 2014-07-25  0:04   ` Tim Chen
  2014-07-25  7:05     ` Kirill Tkhai
  0 siblings, 1 reply; 13+ messages in thread
From: Tim Chen @ 2014-07-25  0:04 UTC (permalink / raw)
  To: Kirill Tkhai
  Cc: linux-kernel, Peter Zijlstra, Mike Galbraith, Steven Rostedt,
	Nicolas Pitre, Ingo Molnar, Paul Turner, tkhai

On Tue, 2014-07-22 at 15:30 +0400, Kirill Tkhai wrote:
> Bad situation:
> 
> double_lock_balance() drops busiest_rq lock. The busiest_rq is *busiest*,
> and a lot of tasks and context switches there. We are dropping the lock
> and waiting for it again.
> 
> Let's just detach the task and once finally unlock it!
> 
> > Warning: this allows unlocked use of can_migrate_task(), throttled_lb_pair(),
> and task_hot(). I added comments about that.
> 

Wonder if we should also consider removing double_lock_balance usage
from rt.c and deadline.c? Then those two schedulers will also not
lock both the source and destination queues at the same time
for load balancing.

Tim



* Re: [PATCH 4/5] sched/fair: Remove double_lock_balance() from active_load_balance_cpu_stop()
  2014-07-25  0:04   ` Tim Chen
@ 2014-07-25  7:05     ` Kirill Tkhai
  0 siblings, 0 replies; 13+ messages in thread
From: Kirill Tkhai @ 2014-07-25  7:05 UTC (permalink / raw)
  To: Tim Chen
  Cc: linux-kernel, Peter Zijlstra, Mike Galbraith, Steven Rostedt,
	Nicolas Pitre, Ingo Molnar, Paul Turner, tkhai

On Thu, 24/07/2014 at 17:04 -0700, Tim Chen wrote:
> On Tue, 2014-07-22 at 15:30 +0400, Kirill Tkhai wrote:
> > Bad situation:
> > 
> > double_lock_balance() drops busiest_rq lock. The busiest_rq is *busiest*,
> > and a lot of tasks and context switches there. We are dropping the lock
> > and waiting for it again.
> > 
> > Let's just detach the task and once finally unlock it!
> > 
> > Warning: this allows unlocked use of can_migrate_task(), throttled_lb_pair(),
> > and task_hot(). I added comments about that.
> > 
> 
> Wonder if we should also consider removing double_lock_balance usage
> from rt.c and deadline.c? Then those two schedulers will also not
> lock both the source and destination queues at the same time
> for load balancing.

rt.c and deadline.c are similar, so we can discuss just one of them.

There are two places with double_lock_balance() in rt.c:

1)push_rt_task()->find_lock_lowest_rq()

We can't detach a task before we have locked lowest_rq. It's unknown whether
the task will still be suitable to be attached to lowest_rq once we have
locked it, because the highest priority of lowest_rq may change. There is a
race there (see the sketch below).

2)pull_rt_task()

The same applies here. The situation may change, so we must keep both locks
held to be sure the priorities won't change. For example, somebody may wake
a high-priority task on src_rq, or somebody may pull a task there.

RT balancing is stricter than the fair class's.
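
I.e., if we detached first in push_rt_task(), the broken ordering would look
like this (a sketch):

	/* rq->lock held, lowest_rq chosen by find_lowest_rq() */
	deactivate_task(rq, p, 0);
	p->on_rq = ONRQ_MIGRATING;
	set_task_cpu(p, lowest_rq->cpu);
	raw_spin_unlock(&rq->lock);

	raw_spin_lock(&lowest_rq->lock);
	/*
	 * By now a higher-priority task may have been woken on (or pulled to)
	 * lowest_rq, so p should not be pushed there any more, but we have
	 * already committed to the move.
	 */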

Regards,
	Kirill



* Re: [PATCH 2/5] sched: Teach scheduler to understand ONRQ_MIGRATING state
  2014-07-24 19:03     ` Oleg Nesterov
@ 2014-07-25  7:11       ` Kirill Tkhai
  0 siblings, 0 replies; 13+ messages in thread
From: Kirill Tkhai @ 2014-07-25  7:11 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Peter Zijlstra, linux-kernel, Mike Galbraith, Steven Rostedt,
	Tim Chen, Nicolas Pitre, Ingo Molnar, Paul Turner, tkhai

On Thu, 24/07/2014 at 21:03 +0200, Oleg Nesterov wrote:
> Oh, sorry for the delay, I didn't even try to read most of my emails for
> several days.
> 
> On 07/22, Peter Zijlstra wrote:
> >
> > more details is better ;-) Also, I think Oleg enjoys these kind of
> > things, so I've added him to the CC.
> 
> Thanks. Trust me, at least I like this much more than what I had to
> do this week ;)
> 
> This change depends on the previous patches and I am too lazy to try
> to find/download the full series.
> 
> Kirill, since you are going to send v2, could you cc me? No, it is not
> that I think I can help or even review (or maybe even understand ;)
> But this looks interesting to me, I'd like to know about these changes.

I'm going to update the series over the weekend, and of course you'll
be CCed :)

Regards,
	Kirill


