From mboxrd@z Thu Jan 1 00:00:00 1970
Return-Path: 
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
	id S1764316AbcINQ4D (ORCPT );
	Wed, 14 Sep 2016 12:56:03 -0400
Received: from merlin.infradead.org ([205.233.59.134]:49352 "EHLO
	merlin.infradead.org" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
	with ESMTP id S1760668AbcINQz6 (ORCPT );
	Wed, 14 Sep 2016 12:55:58 -0400
Date: Wed, 14 Sep 2016 18:55:49 +0200
From: Peter Zijlstra 
To: Arve Hjønnevåg 
Cc: Greg Kroah-Hartman , Thomas Gleixner ,
	"devel@driverdev.osuosl.org" , Riley Andrews ,
	LKML , Christoph Hellwig , Todd Kjos ,
	Ingo Molnar 
Subject: Re: [PATCH] android: binder: Disable preemption while holding the
	global binder lock
Message-ID: <20160914165549.GF5020@twins.programming.kicks-ass.net>
References: <20160910161659.GA7987@infradead.org>
	<20160910162210.GK10153@twins.programming.kicks-ass.net>
	<20160910172843.GA17876@kroah.com>
	<20160913073259.GC5008@twins.programming.kicks-ass.net>
	<20160914161103.GC5016@twins.programming.kicks-ass.net>
	<20160914161340.GE5020@twins.programming.kicks-ass.net>
MIME-Version: 1.0
Content-Type: text/plain; charset=iso-8859-1
Content-Disposition: inline
Content-Transfer-Encoding: 8bit
In-Reply-To: <20160914161340.GE5020@twins.programming.kicks-ass.net>
User-Agent: Mutt/1.5.23.1 (2014-03-12)
Sender: linux-kernel-owner@vger.kernel.org
List-ID: 
X-Mailing-List: linux-kernel@vger.kernel.org

On Wed, Sep 14, 2016 at 06:13:40PM +0200, Peter Zijlstra wrote:
> On Wed, Sep 14, 2016 at 06:11:03PM +0200, Peter Zijlstra wrote:
> > On Tue, Sep 13, 2016 at 12:53:27PM -0700, Arve Hjønnevåg wrote:
> > > On Tue, Sep 13, 2016 at 12:32 AM, Peter Zijlstra wrote:
> > > > cgroups should be irrelevant, PI is unaware of them.
> > > 
> > > I don't think cgroups are irrelevant. PI being unaware of them
> > > explains the problem I described. If the task that holds the lock is
> > > in a cgroup that has a low cpu.shares value, then boosting the task's
> > > priority within that group does not necessarily make it any more
> > > likely to run.
> > 
> > Thing is, for FIFO/DL the important parameters (prio and deadline resp.)
> > are not cgroup dependent.
> > 
> > For CFS you're right, and as per usual, cgroups will be a royal pain.
> > While we can compute the total weight in the block chain, getting that
> > back to a task which is stuck in a cgroup is just not going to work
> > well.
> 
> Not to mention the fact that the weight depends on the current running
> state. Having those tasks block insta-changes the actual weight.
> 
> > /me curses @ cgroups.. bloody stupid things.
> 
> More of that.

Something a little like so... completely untested, and has a fair number
of corner cases still left open I think..
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1691,8 +1691,8 @@ struct task_struct {
 	struct seccomp seccomp;
 
 /* Thread group tracking */
-	u32 parent_exec_id;
-	u32 self_exec_id;
+	u32 parent_exec_id;
+	u32 self_exec_id;
 /* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed,
  * mempolicy */
 	spinlock_t alloc_lock;
@@ -1710,6 +1710,11 @@ struct task_struct {
 	struct task_struct *pi_top_task;
 	/* Deadlock detection and priority inheritance handling */
 	struct rt_mutex_waiter *pi_blocked_on;
+	unsigned long pi_weight;
+#ifdef CONFIG_CGROUP_SCHED
+	struct task_group *orig_task_group;
+	unsigned long normal_weight;
+#endif
 #endif
 
 #ifdef CONFIG_DEBUG_MUTEXES
@@ -1977,6 +1982,8 @@ static inline int tsk_nr_cpus_allowed(st
 	return p->nr_cpus_allowed;
 }
 
+extern unsigned long task_pi_weight(struct task_struct *p);
+
 #define TNF_MIGRATED	0x01
 #define TNF_NO_GROUP	0x02
 #define TNF_SHARED	0x04
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -20,6 +20,19 @@
 #include "rtmutex_common.h"
 
 /*
+ * !(rt_prio || dl_prio)
+ */
+static inline bool other_prio(int prio)
+{
+	return prio >= MAX_RT_PRIO;
+}
+
+static inline bool other_task(struct task_struct *task)
+{
+	return other_prio(task->prio);
+}
+
+/*
  * lock->owner state tracking:
  *
  * lock->owner holds the task_struct pointer of the owner. Bit 0
@@ -226,6 +239,11 @@ rt_mutex_enqueue(struct rt_mutex *lock,
 
 	rb_link_node(&waiter->tree_entry, parent, link);
 	rb_insert_color(&waiter->tree_entry, &lock->waiters);
+	/*
+	 * We want the lock->waiters_weight to reflect the sum of all queued
+	 * waiters.
+	 */
+	lock->waiters_weight += waiter->weight;
 }
 
 static void
@@ -239,6 +257,7 @@ rt_mutex_dequeue(struct rt_mutex *lock,
 
 	rb_erase(&waiter->tree_entry, &lock->waiters);
 	RB_CLEAR_NODE(&waiter->tree_entry);
+	lock->waiters_weight -= waiter->weight;
 }
 
 static void
@@ -265,6 +284,18 @@ rt_mutex_enqueue_pi(struct task_struct *
 
 	rb_link_node(&waiter->pi_tree_entry, parent, link);
 	rb_insert_color(&waiter->pi_tree_entry, &task->pi_waiters);
+
+	/*
+	 * Since a task can own multiple locks, and we have one pi_waiter
+	 * for every lock, propagate the lock->waiters_weight, which is the sum
+	 * of all weights queued on that lock, into the top waiter, and add
+	 * that to the owning task's pi_weight.
+	 *
+	 * This results in pi_weight being the sum of all blocked waiters
+	 * on this task.
+	 */
+	waiter->top_weight = waiter->lock->waiters_weight;
+	task->pi_weight += waiter->top_weight;
 }
 
 static void
@@ -278,6 +309,7 @@ rt_mutex_dequeue_pi(struct task_struct *
 
 	rb_erase(&waiter->pi_tree_entry, &task->pi_waiters);
 	RB_CLEAR_NODE(&waiter->pi_tree_entry);
+	task->pi_weight -= waiter->top_weight;
 }
 
 static void rt_mutex_adjust_prio(struct task_struct *p)
@@ -497,7 +529,7 @@ static int rt_mutex_adjust_prio_chain(st
 	 * detection is enabled we continue, but stop the
 	 * requeueing in the chain walk.
 	 */
-	if (top_waiter != task_top_pi_waiter(task)) {
+	if (top_waiter != task_top_pi_waiter(task) && !other_task(task)) {
 		if (!detect_deadlock)
 			goto out_unlock_pi;
 		else
@@ -512,7 +544,7 @@ static int rt_mutex_adjust_prio_chain(st
 	 * enabled we continue, but stop the requeueing in the chain
 	 * walk.
 	 */
-	if (rt_mutex_waiter_equal(waiter, cmp_task(task))) {
+	if (rt_mutex_waiter_equal(waiter, cmp_task(task)) && !other_task(task)) {
 		if (!detect_deadlock)
 			goto out_unlock_pi;
 		else
@@ -627,6 +659,11 @@ static int rt_mutex_adjust_prio_chain(st
 	 */
 	waiter->prio = task->prio;
 	waiter->deadline = task->dl.deadline;
+	/*
+	 * The weight of the task depends on the block chain; since
+	 * we're iterating that, it's likely to have changed.
+	 */
+	waiter->weight = task_pi_weight(task);
 
 	rt_mutex_enqueue(lock, waiter);
 
@@ -685,6 +722,15 @@ static int rt_mutex_adjust_prio_chain(st
 		waiter = rt_mutex_top_waiter(lock);
 		rt_mutex_enqueue_pi(task, waiter);
 		rt_mutex_adjust_prio(task);
+
+	} else if (other_task(task)) {
+		/*
+		 * Propagate the lock->waiters_weight into task->pi_weight
+		 */
+		rt_mutex_dequeue_pi(task, prerequeue_top_waiter);
+		rt_mutex_enqueue_pi(task, prerequeue_top_waiter);
+		rt_mutex_adjust_prio(task);
+
 	} else {
 		/*
 		 * Nothing changed. No need to do any priority
@@ -728,7 +774,7 @@ static int rt_mutex_adjust_prio_chain(st
 	 * then we can stop the chain walk here if we are not in full
 	 * deadlock detection mode.
 	 */
-	if (!detect_deadlock && waiter != top_waiter)
+	if (!detect_deadlock && waiter != top_waiter && !other_task(task))
 		goto out_put_task;
 
 	goto again;
@@ -904,6 +950,7 @@ static int task_blocks_on_rt_mutex(struc
 	waiter->lock = lock;
 	waiter->prio = task->prio;
 	waiter->deadline = task->dl.deadline;
+	waiter->weight = task_pi_weight(task);
 
 	/* Get the top priority waiter on the lock */
 	if (rt_mutex_has_waiters(lock))
@@ -918,7 +965,7 @@ static int task_blocks_on_rt_mutex(struc
 		return 0;
 
 	raw_spin_lock(&owner->pi_lock);
-	if (waiter == rt_mutex_top_waiter(lock)) {
+	if (other_task(owner) || waiter == rt_mutex_top_waiter(lock)) {
 		rt_mutex_dequeue_pi(owner, top_waiter);
 		rt_mutex_enqueue_pi(owner, waiter);
 
@@ -1017,7 +1064,8 @@ static void mark_wakeup_next_waiter(stru
 static void remove_waiter(struct rt_mutex *lock,
 			  struct rt_mutex_waiter *waiter)
 {
-	bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
+	struct rt_mutex_waiter *top_waiter = rt_mutex_top_waiter(lock);
+	bool is_top_waiter = (waiter == top_waiter);
 	struct task_struct *owner = rt_mutex_owner(lock);
 	struct rt_mutex *next_lock;
 
@@ -1032,12 +1080,12 @@ static void remove_waiter(struct rt_mute
 	 * Only update priority if the waiter was the highest priority
 	 * waiter of the lock and there is an owner to update.
 	 */
-	if (!owner || !is_top_waiter)
+	if (!owner || (!other_task(owner) && !is_top_waiter))
 		return;
 
 	raw_spin_lock(&owner->pi_lock);
 
-	rt_mutex_dequeue_pi(owner, waiter);
+	rt_mutex_dequeue_pi(owner, top_waiter);
 
 	if (rt_mutex_has_waiters(lock))
 		rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock));
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -34,6 +34,7 @@ struct rt_mutex_waiter {
 #endif
 	int prio;
 	u64 deadline;
+	unsigned weight, top_weight;
 };
 
 /*
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3725,6 +3725,8 @@ void rt_mutex_setprio(struct task_struct
 		if (rt_prio(oldprio))
 			p->rt.timeout = 0;
 		p->sched_class = &fair_sched_class;
+
+		task_set_pi_weight(p);
 	}
 
 	p->prio = prio;
@@ -7933,6 +7935,12 @@ static void sched_change_group(struct ta
 	tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
			  struct task_group, css);
 	tg = autogroup_task_group(tsk, tg);
+
+#ifdef CONFIG_RT_MUTEXES
+	tsk->orig_task_group = tg;
+
+	if (!tsk->pi_weight)
+#endif
 	tsk->sched_task_group = tg;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6646,6 +6646,32 @@ static void attach_tasks(struct lb_env *
 	raw_spin_unlock(&env->dst_rq->lock);
 }
 
+#ifdef CONFIG_RT_MUTEXES
+/*
+ * See set_load_weight().
+ */
+static inline unsigned long __task_normal_weight(struct task_struct *p)
+{
+	int prio = p->static_prio - MAX_RT_PRIO;
+
+	/*
+	 * SCHED_IDLE tasks get minimal weight:
+	 */
+	if (idle_policy(p->policy))
+		return scale_load(WEIGHT_IDLEPRIO);
+
+	return scale_load(sched_prio_to_weight[prio]);
+}
+
+
+static unsigned long task_normal_weight(struct task_struct *p);
+
+unsigned long task_pi_weight(struct task_struct *p)
+{
+	return task_normal_weight(p) + p->pi_weight;
+}
+#endif
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 static void update_blocked_averages(int cpu)
 {
@@ -6717,7 +6743,80 @@ static unsigned long task_h_load(struct
 	return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
			cfs_rq_load_avg(cfs_rq) + 1);
 }
-#else
+
+#ifdef CONFIG_RT_MUTEXES
+/*
+ * Humongous horrid hack.. because cgroups bloody stink.
+ *
+ * The idea with PI on CFS is to sum the weight of all blocked tasks but with
+ * cgroups the weight of a task depends on the running state of tasks. Blocking
+ * changes the weight.
+ *
+ * Paper over that problem by using the regular averages, and hoping boosting
+ * is short enough to not actually matter much.
+ *
+ * The next problem is that getting that weight back to the boosted task
+ * requires lifting it out of its cgroup. Ideally we'd place it in the first
+ * common parent, but *sheesh* then we'd have to compute that, so bail and stick
+ * it in the root group.
+ */
+
+static unsigned long task_normal_weight(struct task_struct *p)
+{
+	struct cfs_rq *cfs_rq = task_cfs_rq(p);
+
+	/*
+	 * Once we have ->pi_weight, we'll get boosted into the root group
+	 * and the below falls apart.
+	 */
+	if (!p->pi_weight) {
+		update_cfs_rq_h_load(cfs_rq);
+		p->normal_weight =
+			div64_ul(__task_normal_weight(p) * cfs_rq->h_load,
				 cfs_rq_load_avg(cfs_rq) + 1);
+	}
+
+	return p->normal_weight;
+}
+
+static void detach_task_cfs_rq(struct task_struct *p);
+static void attach_task_cfs_rq(struct task_struct *p);
+
+void task_set_pi_weight(struct task_struct *p)
+{
+	unsigned long normal_weight = p->normal_weight;
+	struct task_group *tg = &root_task_group;
+	bool move_group;
+
+	if (!p->pi_weight) {
+		tg = p->orig_task_group;
+		normal_weight = __task_normal_weight(p);
+	}
+
+	move_group = p->sched_task_group != tg;
+
+	if (move_group) {
+		p->sched_task_group = tg;
+
+		detach_task_cfs_rq(p);
+		set_task_rq(p, task_cpu(p));
+
+#ifdef CONFIG_SMP
+		/* Tell se's cfs_rq has been changed -- migrated */
+		p->se.avg.last_update_time = 0;
+#endif
+	}
+
+	update_load_set(&p->se.load, normal_weight + p->pi_weight);
+
+	if (move_group)
+		attach_task_cfs_rq(p);
+}
+
+#endif
+
+#else /* CONFIG_FAIR_GROUP_SCHED */
+
 static inline void update_blocked_averages(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
@@ -6734,8 +6833,21 @@ static unsigned long task_h_load(struct
 {
 	return p->se.avg.load_avg;
 }
+
+#ifdef CONFIG_RT_MUTEXES
+static unsigned long task_normal_weight(struct task_struct *p)
+{
+	return __task_normal_weight(p);
+}
+
+void task_set_pi_weight(struct task_struct *p)
+{
+	update_load_set(&p->se.load, task_pi_weight(p));
+}
 #endif
 
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
 /********** Helpers for find_busiest_group ************************/
 
 enum group_type {
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1827,3 +1827,5 @@ static inline void cpufreq_trigger_updat
 #else /* arch_scale_freq_capacity */
 #define arch_scale_freq_invariant()	(false)
 #endif
+
+extern void task_set_pi_weight(struct task_struct *p);
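
To make the intended bookkeeping a bit more concrete, here is a stand-alone toy
model in plain userspace C. None of it is part of the patch and none of it is
kernel code; the names (toy_task, toy_lock, block_on, unblock_from) are made up
and the fields only mirror the patch. The invariant it illustrates: a lock sums
the weights of its queued waiters in waiters_weight, that sum is propagated into
the owner's pi_weight, and the owner's effective load is its normal weight plus
pi_weight.

/*
 * Toy model of the weight propagation above: plain userspace C, not
 * kernel code; names only mirror the patch for readability.
 */
#include <stdio.h>

struct toy_task {
	const char *name;
	unsigned long normal_weight;	/* weight from nice level / cgroup share */
	unsigned long pi_weight;	/* sum of weights blocked behind this task */
};

struct toy_lock {
	struct toy_task *owner;
	unsigned long waiters_weight;	/* sum of weights of queued waiters */
};

/* effective load: own weight plus everything blocked behind us */
static unsigned long task_pi_weight(struct toy_task *p)
{
	return p->normal_weight + p->pi_weight;
}

/*
 * A waiter blocks on the lock: snapshot its current (possibly already
 * boosted) weight, add it to the lock's sum and propagate it into the
 * owner. Returns the snapshot so the caller can undo exactly this much.
 */
static unsigned long block_on(struct toy_lock *lock, struct toy_task *waiter)
{
	unsigned long w = task_pi_weight(waiter);

	lock->waiters_weight += w;
	lock->owner->pi_weight += w;
	return w;
}

/* The waiter is dequeued again: subtract the same snapshot from both sums. */
static void unblock_from(struct toy_lock *lock, unsigned long w)
{
	lock->waiters_weight -= w;
	lock->owner->pi_weight -= w;
}

int main(void)
{
	struct toy_task owner  = { "owner",  100, 0 };	/* e.g. low cpu.shares group */
	struct toy_task waiter = { "waiter", 1024, 0 };
	struct toy_lock lock   = { &owner, 0 };
	unsigned long w;

	w = block_on(&lock, &waiter);
	printf("%s load while boosted: %lu\n", owner.name, task_pi_weight(&owner));

	unblock_from(&lock, w);
	printf("%s load after release:  %lu\n", owner.name, task_pi_weight(&owner));
	return 0;
}

block_on() returns the amount it added so unblock_from() can subtract exactly
that much; the patch does the equivalent by snapshotting the value in
waiter->weight and waiter->top_weight, since a waiter's own weight can change
while it is blocked.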