* [patch V4 00/10] rtmutex: Code clarification and optimization
@ 2014-06-11 18:44 Thomas Gleixner
  2014-06-11 18:44 ` [patch V4 01/10] rtmutex: Plug slow unlock race Thomas Gleixner
                   ` (10 more replies)
  0 siblings, 11 replies; 27+ messages in thread
From: Thomas Gleixner @ 2014-06-11 18:44 UTC (permalink / raw)
  To: LKML; +Cc: Steven Rostedt, Peter Zijlstra, Ingo Molnar

Hopefully the last iteration of this.

I addressed all the review comments and added the following new patches:

1/n: Plug unlock race.

	The patch was already posted separately in the "safety of
	*mutex_unlock()" thread.

2/n: Document and simplify the trylock path

3/n: Document and simplify try_to_take_rtmutex()

Thanks,

	tglx





* [patch V4 01/10] rtmutex: Plug slow unlock race
  2014-06-11 18:44 [patch V4 00/10] rtmutex: Code clarification and optimization Thomas Gleixner
@ 2014-06-11 18:44 ` Thomas Gleixner
  2014-06-13 15:41   ` Steven Rostedt
  2014-06-16  8:06   ` [tip:locking/urgent] " tip-bot for Thomas Gleixner
  2014-06-11 18:44 ` [patch V4 02/10] rtmutex: Simplify rtmutex_slowtrylock() Thomas Gleixner
                   ` (9 subsequent siblings)
  10 siblings, 2 replies; 27+ messages in thread
From: Thomas Gleixner @ 2014-06-11 18:44 UTC (permalink / raw)
  To: LKML; +Cc: Steven Rostedt, Peter Zijlstra, Ingo Molnar

[-- Attachment #1: rtmutex-plug-slow-unlock-race.patch --]
[-- Type: text/plain, Size: 6376 bytes --]

When the rtmutex fast path is enabled, the slow unlock function can
create the following situation:

spin_lock(foo->m->wait_lock);
foo->m->owner = NULL;
	    			rt_mutex_lock(foo->m); <-- fast path
				free = atomic_dec_and_test(foo->refcnt);
				rt_mutex_unlock(foo->m); <-- fast path
				if (free)
				   kfree(foo);

spin_unlock(foo->m->wait_lock); <--- Use after free.
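
To illustrate, a minimal sketch of the kind of refcounted object the
scenario above assumes (struct foo and put_foo() are made up for
illustration, they are not part of this patch):

	struct foo {
		struct rt_mutex	m;
		atomic_t	refcnt;
	};

	/* The right hand column of the scenario above */
	void put_foo(struct foo *foo)
	{
		bool free;

		rt_mutex_lock(&foo->m);			/* fast path */
		free = atomic_dec_and_test(&foo->refcnt);
		rt_mutex_unlock(&foo->m);		/* fast path */
		if (free)
			kfree(foo);	/* foo->m is gone, but the slow
					 * unlocker on the other CPU still
					 * dereferences m->wait_lock */
	}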

Plug the race by changing the slow unlock to the following scheme:

     while (!rt_mutex_has_waiters(m)) {
     	    /* Clear the waiters bit in m->owner */
	    clear_rt_mutex_waiters(m);
      	    owner = rt_mutex_owner(m);
      	    spin_unlock(m->wait_lock);
      	    if (cmpxchg(m->owner, owner, 0) == owner)
      	       return;
      	    spin_lock(m->wait_lock);
     }

So if a new waiter comes in while the owner takes the slow unlock
path, we have two situations:

 unlock(wait_lock);
					lock(wait_lock);
 cmpxchg(p, owner, 0) == owner
 	    	   			mark_rt_mutex_waiters(lock);
	 				acquire(lock);

Or:

 unlock(wait_lock);
					lock(wait_lock);
	 				mark_rt_mutex_waiters(lock);
 cmpxchg(p, owner, 0) != owner
					enqueue_waiter();
					unlock(wait_lock);
 lock(wait_lock);
 wakeup_next waiter();
 unlock(wait_lock);
					lock(wait_lock);
					acquire(lock);

If the fast path is disabled, then the simple

   m->owner = NULL;
   unlock(m->wait_lock);

is sufficient as all access to m->owner is serialized via
m->wait_lock.

Also document and clarify the wakeup_next_waiter function as suggested
by Oleg Nesterov.

Reported-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/locking/rtmutex.c |  118 ++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 112 insertions(+), 6 deletions(-)

Index: tip/kernel/locking/rtmutex.c
===================================================================
--- tip.orig/kernel/locking/rtmutex.c
+++ tip/kernel/locking/rtmutex.c
@@ -83,6 +83,47 @@ static inline void mark_rt_mutex_waiters
 		owner = *p;
 	} while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner);
 }
+
+/*
+ * Safe fastpath aware unlock:
+ * 1) Clear the waiters bit
+ * 2) Drop lock->wait_lock
+ * 3) Try to unlock the lock with cmpxchg
+ */
+static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
+	__releases(lock->wait_lock)
+{
+	struct task_struct *owner = rt_mutex_owner(lock);
+
+	clear_rt_mutex_waiters(lock);
+	raw_spin_unlock(&lock->wait_lock);
+	/*
+	 * If a new waiter comes in between the unlock and the cmpxchg
+	 * we have two situations:
+	 *
+	 * unlock(wait_lock);
+	 *					lock(wait_lock);
+	 * cmpxchg(p, owner, 0) == owner
+	 *					mark_rt_mutex_waiters(lock);
+	 *					acquire(lock);
+	 * or:
+	 *
+	 * unlock(wait_lock);
+	 *					lock(wait_lock);
+	 *					mark_rt_mutex_waiters(lock);
+	 *
+	 * cmpxchg(p, owner, 0) != owner
+	 *					enqueue_waiter();
+	 *					unlock(wait_lock);
+	 * lock(wait_lock);
+	 * wake waiter();
+	 * unlock(wait_lock);
+	 *					lock(wait_lock);
+	 *					acquire(lock);
+	 */
+	return rt_mutex_cmpxchg(lock, owner, NULL);
+}
+
 #else
 # define rt_mutex_cmpxchg(l,c,n)	(0)
 static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
@@ -90,6 +131,17 @@ static inline void mark_rt_mutex_waiters
 	lock->owner = (struct task_struct *)
 			((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
 }
+
+/*
+ * Simple slow path only version: lock->owner is protected by lock->wait_lock.
+ */
+static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
+	__releases(lock->wait_lock)
+{
+	lock->owner = NULL;
+	raw_spin_unlock(&lock->wait_lock);
+	return true;
+}
 #endif
 
 static inline int
@@ -650,7 +702,8 @@ static int task_blocks_on_rt_mutex(struc
 /*
  * Wake up the next waiter on the lock.
  *
- * Remove the top waiter from the current tasks waiter list and wake it up.
+ * Remove the top waiter from the current tasks pi waiter list and
+ * wake it up.
  *
  * Called with lock->wait_lock held.
  */
@@ -671,10 +724,26 @@ static void wakeup_next_waiter(struct rt
 	 */
 	rt_mutex_dequeue_pi(current, waiter);
 
-	rt_mutex_set_owner(lock, NULL);
+	/*
+	 * This is safe versus the fastpath acquire:
+	 *
+	 * We do not remove the waiter from the lock waiter list
+	 * here. It stays the top waiter.
+	 *
+	 * We set the owner to NULL, but keep the RT_MUTEX_HAS_WAITERS
+	 * bit set, which forces all potential new waiters into the
+	 * slow path. So they are serialized along with all enqueued
+	 * waiters on lock->wait_lock.
+	 */
+	lock->owner = (void *) RT_MUTEX_HAS_WAITERS;
 
 	raw_spin_unlock_irqrestore(&current->pi_lock, flags);
 
+	/*
+	 * It's safe to dereference waiter as it cannot go away as
+	 * long as we hold lock->wait_lock. The waiter task needs to
+	 * acquire it in order to dequeue the waiter.
+	 */
 	wake_up_process(waiter->task);
 }
 
@@ -928,12 +997,49 @@ rt_mutex_slowunlock(struct rt_mutex *loc
 
 	rt_mutex_deadlock_account_unlock(current);
 
-	if (!rt_mutex_has_waiters(lock)) {
-		lock->owner = NULL;
-		raw_spin_unlock(&lock->wait_lock);
-		return;
+	/*
+	 * We must be careful here if the fast path is enabled. If we
+	 * have no waiters queued we cannot set owner to NULL here
+	 * because of:
+	 *
+	 * foo->lock->owner = NULL;
+	 *			rtmutex_lock(foo->lock);   <- fast path
+	 *			free = atomic_dec_and_test(foo->refcnt);
+	 *			rtmutex_unlock(foo->lock); <- fast path
+	 *			if (free)
+	 *				kfree(foo);
+	 * raw_spin_unlock(foo->lock->wait_lock);
+	 *
+	 * So for the fastpath enabled kernel:
+	 *
+	 * Nothing can set the waiters bit as long as we hold
+	 * lock->wait_lock. So we do the following sequence:
+	 *
+	 *	owner = rt_mutex_owner(lock);
+	 *	clear_rt_mutex_waiters(lock);
+	 *	raw_spin_unlock(&lock->wait_lock);
+	 *	if (cmpxchg(&lock->owner, owner, 0) == owner)
+	 *		return;
+	 *	goto retry;
+	 *
+	 * The fastpath disabled variant is simple as all access to
+	 * lock->owner is serialized by lock->wait_lock:
+	 *
+	 *	lock->owner = NULL;
+	 *	raw_spin_unlock(&lock->wait_lock);
+	 */
+	while (!rt_mutex_has_waiters(lock)) {
+		/* Drops lock->wait_lock ! */
+		if (unlock_rt_mutex_safe(lock) == true)
+			return;
+		/* Relock the rtmutex and try again */
+		raw_spin_lock(&lock->wait_lock);
 	}
 
+	/*
+	 * The wakeup next waiter path does not suffer from the above
+	 * race. See the comments there.
+	 */
 	wakeup_next_waiter(lock);
 
 	raw_spin_unlock(&lock->wait_lock);




* [patch V4 02/10] rtmutex: Simplify rtmutex_slowtrylock()
  2014-06-11 18:44 [patch V4 00/10] rtmutex: Code clarification and optimization Thomas Gleixner
  2014-06-11 18:44 ` [patch V4 01/10] rtmutex: Plug slow unlock race Thomas Gleixner
@ 2014-06-11 18:44 ` Thomas Gleixner
  2014-06-12  3:34   ` Lai Jiangshan
  2014-06-13 15:58   ` Steven Rostedt
  2014-06-11 18:44 ` [patch V4 04/10] rtmutex: No need to keep task ref for lock owner check Thomas Gleixner
                   ` (8 subsequent siblings)
  10 siblings, 2 replies; 27+ messages in thread
From: Thomas Gleixner @ 2014-06-11 18:44 UTC (permalink / raw)
  To: LKML; +Cc: Steven Rostedt, Peter Zijlstra, Ingo Molnar, Oleg Nesterov

[-- Attachment #1: rtmutex-simplify-slow-trylock.patch --]
[-- Type: text/plain, Size: 2045 bytes --]

Oleg noticed that rtmutex_slowtrylock() has a pointless check for
rt_mutex_owner(lock) != current.

To avoid calling try_to_take_rtmutex() we really want to check whether
the lock has an owner at all, or whether the trylock failed because the
owner is NULL but the RT_MUTEX_HAS_WAITERS bit is set. This also covers
the case where the lock is owned by the caller.

We can actually do this check lockless. A trylock is taking a chance
anyway, whether or not we take lock->wait_lock for the check.

Add comments to the function while at it.
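
For reference, the resulting slow trylock reads roughly like this
(condensed from the patch below, with the unchanged tail filled in, so
treat it as a sketch):

	static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
	{
		int ret;

		/*
		 * Lockless check: if there is a real owner, the trylock
		 * fails. A NULL owner may still carry the
		 * RT_MUTEX_HAS_WAITERS bit; that is sorted out below
		 * under wait_lock.
		 */
		if (rt_mutex_owner(lock))
			return 0;

		raw_spin_lock(&lock->wait_lock);

		ret = try_to_take_rt_mutex(lock, current, NULL);

		/*
		 * try_to_take_rt_mutex() sets the waiters bit
		 * unconditionally. Clean this up.
		 */
		fixup_rt_mutex_waiters(lock);

		raw_spin_unlock(&lock->wait_lock);

		return ret;
	}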

Reported-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/locking/rtmutex.c |   32 +++++++++++++++++++++-----------
 1 file changed, 21 insertions(+), 11 deletions(-)

Index: tip/kernel/locking/rtmutex.c
===================================================================
--- tip.orig/kernel/locking/rtmutex.c
+++ tip/kernel/locking/rtmutex.c
@@ -963,22 +963,32 @@ rt_mutex_slowlock(struct rt_mutex *lock,
 /*
  * Slow path try-lock function:
  */
-static inline int
-rt_mutex_slowtrylock(struct rt_mutex *lock)
+static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
 {
-	int ret = 0;
+	int ret;
 
+	/*
+	 * trylock is taking a chance. So we don't have to take
+	 * @lock->wait_lock to figure out whether @lock has a real
+	 * owner or whether the owner is NULL and the
+	 * RT_MUTEX_HAS_WAITERS bit is set.
+	 */
+	if (rt_mutex_owner(lock))
+		return 0;
+
+	/*
+	 * The mutex has currently no owner. Lock the wait lock and
+	 * try to acquire the lock.
+	 */
 	raw_spin_lock(&lock->wait_lock);
 
-	if (likely(rt_mutex_owner(lock) != current)) {
+	ret = try_to_take_rt_mutex(lock, current, NULL);
 
-		ret = try_to_take_rt_mutex(lock, current, NULL);
-		/*
-		 * try_to_take_rt_mutex() sets the lock waiters
-		 * bit unconditionally. Clean this up.
-		 */
-		fixup_rt_mutex_waiters(lock);
-	}
+	/*
+	 * try_to_take_rt_mutex() sets the lock waiters bit
+	 * unconditionally. Clean this up.
+	 */
+	fixup_rt_mutex_waiters(lock);
 
 	raw_spin_unlock(&lock->wait_lock);
 




* [patch V4 04/10] rtmutex: No need to keep task ref for lock owner check
  2014-06-11 18:44 [patch V4 00/10] rtmutex: Code clarification and optimization Thomas Gleixner
  2014-06-11 18:44 ` [patch V4 01/10] rtmutex: Plug slow unlock race Thomas Gleixner
  2014-06-11 18:44 ` [patch V4 02/10] rtmutex: Simplify rtmutex_slowtrylock() Thomas Gleixner
@ 2014-06-11 18:44 ` Thomas Gleixner
  2014-06-13 16:30   ` Steven Rostedt
  2014-06-11 18:44 ` [patch V4 03/10] rtmutex: Simplify and document try_to_take_rtmutex() Thomas Gleixner
                   ` (7 subsequent siblings)
  10 siblings, 1 reply; 27+ messages in thread
From: Thomas Gleixner @ 2014-06-11 18:44 UTC (permalink / raw)
  To: LKML; +Cc: Steven Rostedt, Peter Zijlstra, Ingo Molnar

[-- Attachment #1: rtmutex-deobfuscate-chain-walk.patch --]
[-- Type: text/plain, Size: 1092 bytes --]

There is no point in keeping the task reference across the lock owner
check. Drop the reference before the check, so the protection context
is clear.

Found while documenting the chain walk.
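
Condensed from the patch below, the change boils down to:

	/* Before: the task ref is held across the owner check */
	raw_spin_unlock_irqrestore(&task->pi_lock, flags);
	if (!rt_mutex_owner(lock)) {
		/* ... wake the new top waiter if it changed ... */
		raw_spin_unlock(&lock->wait_lock);
		goto out_put_task;	/* ref dropped at the end */
	}
	put_task_struct(task);

	/* After: the ref is dropped right after releasing task->pi_lock */
	raw_spin_unlock_irqrestore(&task->pi_lock, flags);
	put_task_struct(task);

	if (!rt_mutex_owner(lock)) {
		/* ... wake the new top waiter if it changed ... */
		raw_spin_unlock(&lock->wait_lock);
		return 0;
	}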

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/locking/rtmutex.c |    5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

Index: tip/kernel/locking/rtmutex.c
===================================================================
--- tip.orig/kernel/locking/rtmutex.c
+++ tip/kernel/locking/rtmutex.c
@@ -463,6 +463,8 @@ static int rt_mutex_adjust_prio_chain(st
 
 	/* Release the task */
 	raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+	put_task_struct(task);
+
 	if (!rt_mutex_owner(lock)) {
 		/*
 		 * If the requeue above changed the top waiter, then we need
@@ -472,9 +474,8 @@ static int rt_mutex_adjust_prio_chain(st
 		if (top_waiter != rt_mutex_top_waiter(lock))
 			wake_up_process(rt_mutex_top_waiter(lock)->task);
 		raw_spin_unlock(&lock->wait_lock);
-		goto out_put_task;
+		return 0;
 	}
-	put_task_struct(task);
 
 	/* Grab the next task */
 	task = rt_mutex_owner(lock);




* [patch V4 03/10] rtmutex: Simplify and document try_to_take_rtmutex()
  2014-06-11 18:44 [patch V4 00/10] rtmutex: Code clarification and optimization Thomas Gleixner
                   ` (2 preceding siblings ...)
  2014-06-11 18:44 ` [patch V4 04/10] rtmutex: No need to keep task ref for lock owner check Thomas Gleixner
@ 2014-06-11 18:44 ` Thomas Gleixner
  2014-06-13 16:27   ` Steven Rostedt
  2014-06-11 18:44 ` [patch V4 06/10] rtmutex: Document pi chain walk Thomas Gleixner
                   ` (6 subsequent siblings)
  10 siblings, 1 reply; 27+ messages in thread
From: Thomas Gleixner @ 2014-06-11 18:44 UTC (permalink / raw)
  To: LKML; +Cc: Steven Rostedt, Peter Zijlstra, Ingo Molnar

[-- Attachment #1: rtmutx-simplify-and-document-try-to-take-rtmutex.patch --]
[-- Type: text/plain, Size: 5879 bytes --]

The current implementation of try_to_take_rtmutex() is correct, but
requires more than a single brain twist to understand the cleverly
encoded conditionals.

Untangle it and document the cases properly.

It looks less efficient at first glance, but actually reduces the
binary code size on x86_64 by 80 bytes.
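
For reference, the resulting decision flow reads roughly like this
(condensed from the patch below, with the unchanged tail filled in, so
treat it as a sketch):

	static int try_to_take_rt_mutex(struct rt_mutex *lock,
					struct task_struct *task,
					struct rt_mutex_waiter *waiter)
	{
		unsigned long flags;

		/* Force all other tasks into the slow path */
		mark_rt_mutex_waiters(lock);

		/* If @lock has an owner, give up */
		if (rt_mutex_owner(lock))
			return 0;

		if (waiter) {
			/* Only the enqueued top waiter may take @lock */
			if (waiter != rt_mutex_top_waiter(lock))
				return 0;
			rt_mutex_dequeue(lock, waiter);
		} else {
			/* Trylock attempt from the slow paths */
			if (rt_mutex_has_waiters(lock)) {
				/* @task must beat the top waiter */
				if (task->prio >= rt_mutex_top_waiter(lock)->prio)
					return 0;
			} else {
				/* No waiters, no pi_lock dance required */
				goto takeit;
			}
		}

		raw_spin_lock_irqsave(&task->pi_lock, flags);
		task->pi_blocked_on = NULL;
		if (rt_mutex_has_waiters(lock))
			rt_mutex_enqueue_pi(task, rt_mutex_top_waiter(lock));
		raw_spin_unlock_irqrestore(&task->pi_lock, flags);

	takeit:
		debug_rt_mutex_lock(lock);
		/* Preserves or clears the RT_MUTEX_HAS_WAITERS bit */
		rt_mutex_set_owner(lock, task);
		rt_mutex_deadlock_account_lock(lock, task);

		return 1;
	}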

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/locking/rtmutex.c |  131 +++++++++++++++++++++++++++++++----------------
 1 file changed, 87 insertions(+), 44 deletions(-)

Index: tip/kernel/locking/rtmutex.c
===================================================================
--- tip.orig/kernel/locking/rtmutex.c
+++ tip/kernel/locking/rtmutex.c
@@ -535,74 +535,117 @@ static int rt_mutex_adjust_prio_chain(st
  *
  * @lock:   the lock to be acquired.
  * @task:   the task which wants to acquire the lock
- * @waiter: the waiter that is queued to the lock's wait list. (could be NULL)
+ * @waiter: the waiter that is queued to the lock's wait list. (can be
+ *	    NULL, if called due to trylock attempts from
+ *	    rt_mutex_slowlock() or rt_mutex_slowtrylock()).
  */
 static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
-		struct rt_mutex_waiter *waiter)
+				struct rt_mutex_waiter *waiter)
 {
+	unsigned long flags;
+
 	/*
-	 * We have to be careful here if the atomic speedups are
-	 * enabled, such that, when
-	 *  - no other waiter is on the lock
-	 *  - the lock has been released since we did the cmpxchg
-	 * the lock can be released or taken while we are doing the
-	 * checks and marking the lock with RT_MUTEX_HAS_WAITERS.
+	 * Before testing whether we can acquire @lock, we set the
+	 * RT_MUTEX_HAS_WAITERS bit in @lock->owner. This forces all
+	 * other tasks which try to modify @lock into the slow path
+	 * and they serialize on @lock->wait_lock.
+	 *
+	 * The RT_MUTEX_HAS_WAITERS bit can have a transitional state
+	 * as explained at the top of this file if and only if:
 	 *
-	 * The atomic acquire/release aware variant of
-	 * mark_rt_mutex_waiters uses a cmpxchg loop. After setting
-	 * the WAITERS bit, the atomic release / acquire can not
-	 * happen anymore and lock->wait_lock protects us from the
-	 * non-atomic case.
+	 * - There is a lock owner. The caller must fixup the
+	 *   transient state if it does a trylock or leaves the lock
+	 *   function due to a signal or timeout.
 	 *
-	 * Note, that this might set lock->owner =
-	 * RT_MUTEX_HAS_WAITERS in the case the lock is not contended
-	 * any more. This is fixed up when we take the ownership.
-	 * This is the transitional state explained at the top of this file.
+	 * - @task acquires the lock and there are no other
+	 *   waiters. This is undone in rt_mutex_set_owner(@task) at
+	 *   the end of this function.
 	 */
 	mark_rt_mutex_waiters(lock);
 
+	/*
+	 * If @lock has an owner, give up.
+	 */
 	if (rt_mutex_owner(lock))
 		return 0;
 
 	/*
-	 * It will get the lock because of one of these conditions:
-	 * 1) there is no waiter
-	 * 2) higher priority than waiters
-	 * 3) it is top waiter
-	 */
-	if (rt_mutex_has_waiters(lock)) {
-		if (task->prio >= rt_mutex_top_waiter(lock)->prio) {
-			if (!waiter || waiter != rt_mutex_top_waiter(lock))
-				return 0;
-		}
-	}
+	 * If @waiter != NULL, @task has already enqueued the waiter
+	 * into @lock waiter list. If @waiter == NULL then this is a
+	 * trylock attempt.
+	 */
+	if (waiter) {
+		/*
+		 * If waiter is not the highest priority waiter of
+		 * @lock, give up.
+		 */
+		if (waiter != rt_mutex_top_waiter(lock))
+			return 0;
 
-	if (waiter || rt_mutex_has_waiters(lock)) {
-		unsigned long flags;
-		struct rt_mutex_waiter *top;
-
-		raw_spin_lock_irqsave(&task->pi_lock, flags);
-
-		/* remove the queued waiter. */
-		if (waiter) {
-			rt_mutex_dequeue(lock, waiter);
-			task->pi_blocked_on = NULL;
-		}
+		/*
+		 * We can acquire the lock. Remove the waiter from the
+		 * lock waiters list.
+		 */
+		rt_mutex_dequeue(lock, waiter);
 
+	} else {
 		/*
-		 * We have to enqueue the top waiter(if it exists) into
-		 * task->pi_waiters list.
+		 * If the lock has waiters already we check whether @task is
+		 * eligible to take over the lock.
+		 *
+		 * If there are no other waiters, @task can acquire the lock.
+		 * @task->pi_blocked_on is NULL
 		 */
 		if (rt_mutex_has_waiters(lock)) {
-			top = rt_mutex_top_waiter(lock);
-			rt_mutex_enqueue_pi(task, top);
+			/*
+			 * If @task->prio is greater than or equal to
+			 * the top waiter priority (kernel view),
+			 * @task lost.
+			 */
+			if (task->prio >= rt_mutex_top_waiter(lock)->prio)
+				return 0;
+
+			/*
+			 * The current top waiter stays enqueued. We
+			 * don't have to change anything in the lock
+			 * waiters order.
+			 */
+		} else {
+			/*
+			 * No waiters. Take the lock without the
+			 * pi_lock dance. @task->pi_blocked_on is NULL
+			 * and we have no waiters to enqueue in @task
+			 * pi waiters list.
+			 */
+			goto takeit;
 		}
-		raw_spin_unlock_irqrestore(&task->pi_lock, flags);
 	}
 
+	/*
+	 * Clear @task->pi_blocked_on. Requires protection by
+	 * @task->pi_lock. Redundant operation for the @waiter == NULL
+	 * case, but conditionals are more expensive than a redundant
+	 * store.
+	 */
+	raw_spin_lock_irqsave(&task->pi_lock, flags);
+	task->pi_blocked_on = NULL;
+	/*
+	 * Finish the lock acquisition. @task is the new owner. If
+	 * other waiters exist we have to insert the highest priority
+	 * waiter into @task->pi_waiters list.
+	 */
+	if (rt_mutex_has_waiters(lock))
+		rt_mutex_enqueue_pi(task, rt_mutex_top_waiter(lock));
+	raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+
+takeit:
 	/* We got the lock. */
 	debug_rt_mutex_lock(lock);
 
+	/*
+	 * This either preserves the RT_MUTEX_HAS_WAITERS bit if there
+	 * are still waiters or clears it.
+	 */
 	rt_mutex_set_owner(lock, task);
 
 	rt_mutex_deadlock_account_lock(lock, task);




* [patch V4 06/10] rtmutex: Document pi chain walk
  2014-06-11 18:44 [patch V4 00/10] rtmutex: Code clarification and optimization Thomas Gleixner
                   ` (3 preceding siblings ...)
  2014-06-11 18:44 ` [patch V4 03/10] rtmutex: Simplify and document try_to_take_rtmutex() Thomas Gleixner
@ 2014-06-11 18:44 ` Thomas Gleixner
  2014-06-13 16:54   ` Steven Rostedt
  2014-06-11 18:44 ` [patch V4 05/10] rtmutex: Clarify the boost/deboost part Thomas Gleixner
                   ` (5 subsequent siblings)
  10 siblings, 1 reply; 27+ messages in thread
From: Thomas Gleixner @ 2014-06-11 18:44 UTC (permalink / raw)
  To: LKML; +Cc: Steven Rostedt, Peter Zijlstra, Ingo Molnar

[-- Attachment #1: rtmutex-document-chain-walk.patch --]
[-- Type: text/plain, Size: 6664 bytes --]

Add commentary to document the chain walk and the protection
mechanisms and their scope.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/locking/rtmutex.c |  100 ++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 91 insertions(+), 9 deletions(-)

Index: tip/kernel/locking/rtmutex.c
===================================================================
--- tip.orig/kernel/locking/rtmutex.c
+++ tip/kernel/locking/rtmutex.c
@@ -337,6 +337,48 @@ static inline struct rt_mutex *task_bloc
  * @top_task:	the current top waiter
  *
  * Returns 0 or -EDEADLK.
+ *
+ * Chain walk basics and protection scope
+ *
+ * [R] refcount on task
+ * [P] task->pi_lock held
+ * [L] rtmutex->wait_lock held
+ *
+ * Step	Description				Protected by
+ *	function arguments:
+ *	@task					[R]
+ *	@orig_lock if != NULL			@top_task is blocked on it
+ *	@next_lock				Unprotected. Cannot be
+ *						dereferenced. Only used for
+ *						comparison.
+ *	@orig_waiter if != NULL			@top_task is blocked on it
+ *	@top_task				current, or in case of proxy
+ *						locking protected by calling
+ *						code
+ *	again:
+ *	  loop_sanity_check();
+ *	retry:
+ * [1]	  lock(task->pi_lock);			[R] acquire [P]
+ * [2]	  waiter = task->pi_blocked_on;		[P]
+ * [3]	  check_exit_conditions_1();		[P]
+ * [4]	  lock = waiter->lock;			[P]
+ * [5]	  if (!try_lock(lock->wait_lock)) {	[P] try to acquire [L]
+ *	    unlock(task->pi_lock);		release [P]
+ *	    goto retry;
+ *	  }
+ * [6]	  check_exit_conditions_2();		[P] + [L]
+ * [7]	  requeue_lock_waiter(lock, waiter);	[P] + [L]
+ * [8]	  unlock(task->pi_lock);		release [P]
+ *	  put_task_struct(task);		release [R]
+ * [9]	  check_exit_conditions_3();		[L]
+ * [10]	  task = owner(lock);			[L]
+ *	  get_task_struct(task);		[L] acquire [R]
+ *	  lock(task->pi_lock);			[L] acquire [P]
+ * [11]	  requeue_pi_waiter(tsk, waiters(lock));[P] + [L]
+ * [12]	  check_exit_conditions_4();		[P] + [L]
+ * [13]	  unlock(task->pi_lock);		release [P]
+ *	  unlock(lock->wait_lock);		release [L]
+ *	  goto again;
  */
 static int rt_mutex_adjust_prio_chain(struct task_struct *task,
 				      int deadlock_detect,
@@ -361,6 +403,9 @@ static int rt_mutex_adjust_prio_chain(st
 	 * carefully whether things change under us.
 	 */
  again:
+	/*
+	 * We limit the lock chain length for each invocation.
+	 */
 	if (++depth > max_lock_depth) {
 		static int prev_max;
 
@@ -378,13 +423,28 @@ static int rt_mutex_adjust_prio_chain(st
 
 		return -EDEADLK;
 	}
+
+	/*
+	 * We are fully preemptible here and only hold the refcount on
+	 * @task. So everything can have changed under us since the
+	 * caller or our own code below (goto retry/again) dropped all
+	 * locks.
+	 */
  retry:
 	/*
-	 * Task can not go away as we did a get_task() before !
+	 * [1] Task cannot go away as we did a get_task() before !
 	 */
 	raw_spin_lock_irqsave(&task->pi_lock, flags);
 
+	/*
+	 * [2] Get the waiter on which @task is blocked on.
+	 */
 	waiter = task->pi_blocked_on;
+
+	/*
+	 * [3] check_exit_conditions_1() protected by task->pi_lock.
+	 */
+
 	/*
 	 * Check whether the end of the boosting chain has been
 	 * reached or the state of the chain has changed while we
@@ -435,7 +495,15 @@ static int rt_mutex_adjust_prio_chain(st
 	if (!detect_deadlock && waiter->prio == task->prio)
 		goto out_unlock_pi;
 
+	/*
+	 * [4] Get the next lock
+	 */
 	lock = waiter->lock;
+	/*
+	 * [5] We need to trylock here as we are holding task->pi_lock,
+	 * which is the reverse lock order versus the other rtmutex
+	 * operations.
+	 */
 	if (!raw_spin_trylock(&lock->wait_lock)) {
 		raw_spin_unlock_irqrestore(&task->pi_lock, flags);
 		cpu_relax();
@@ -443,6 +511,9 @@ static int rt_mutex_adjust_prio_chain(st
 	}
 
 	/*
+	 * [6] check_exit_conditions_2() protected by task->pi_lock and
+	 * lock->wait_lock.
+	 *
 	 * Deadlock detection. If the lock is the same as the original
 	 * lock which caused us to walk the lock chain or if the
 	 * current lock is owned by the task which initiated the chain
@@ -462,24 +533,27 @@ static int rt_mutex_adjust_prio_chain(st
 	 */
 	prerequeue_top_waiter = rt_mutex_top_waiter(lock);
 
-	/* Requeue the waiter in the lock waiter list. */
+	/* [7] Requeue the waiter in the lock waiter list. */
 	rt_mutex_dequeue(lock, waiter);
 	waiter->prio = task->prio;
 	rt_mutex_enqueue(lock, waiter);
 
-	/* Release the task */
+	/* [8] Release the task */
 	raw_spin_unlock_irqrestore(&task->pi_lock, flags);
 	put_task_struct(task);
 
 	/*
+	 * [9] check_exit_conditions_3 protected by lock->wait_lock.
+	 *
 	 * We must abort the chain walk if there is no lock owner even
 	 * in the dead lock detection case, as we have nothing to
 	 * follow here. This is the end of the chain we are walking.
 	 */
 	if (!rt_mutex_owner(lock)) {
 		/*
-		 * If the requeue above changed the top waiter, then we need
-		 * to wake the new top waiter up to try to get the lock.
+		 * If the requeue [7] above changed the top waiter,
+		 * then we need to wake the new top waiter up to try
+		 * to get the lock.
 		 */
 		if (prerequeue_top_waiter != rt_mutex_top_waiter(lock))
 			wake_up_process(rt_mutex_top_waiter(lock)->task);
@@ -487,11 +561,12 @@ static int rt_mutex_adjust_prio_chain(st
 		return 0;
 	}
 
-	/* Grab the next task, i.e. the owner of @lock */
+	/* [10] Grab the next task, i.e. the owner of @lock */
 	task = rt_mutex_owner(lock);
 	get_task_struct(task);
 	raw_spin_lock_irqsave(&task->pi_lock, flags);
 
+	/* [11] requeue the pi waiters if necessary */
 	if (waiter == rt_mutex_top_waiter(lock)) {
 		/*
 		 * The waiter became the new top (highest priority)
@@ -526,23 +601,30 @@ static int rt_mutex_adjust_prio_chain(st
 	}
 
 	/*
+	 * [12] check_exit_conditions_4() protected by task->pi_lock
+	 * and lock->wait_lock. The actual decisions are made after we
+	 * dropped the locks.
+	 *
 	 * Check whether the task which owns the current lock is pi
 	 * blocked itself. If yes we store a pointer to the lock for
 	 * the lock chain change detection above. After we dropped
 	 * task->pi_lock next_lock cannot be dereferenced anymore.
 	 */
 	next_lock = task_blocked_on_lock(task);
-
-	raw_spin_unlock_irqrestore(&task->pi_lock, flags);
-
 	/*
 	 * Store the top waiter of @lock for the end of chain walk
 	 * decision below.
 	 */
 	top_waiter = rt_mutex_top_waiter(lock);
+
+	/* [13] Drop the locks */
+	raw_spin_unlock_irqrestore(&task->pi_lock, flags);
 	raw_spin_unlock(&lock->wait_lock);
 
 	/*
+	 * Make the actual exit decisions [12], based on the stored
+	 * values.
+	 *
 	 * We reached the end of the lock chain. Stop right here. No
 	 * point to go back just to figure that out.
 	 */




* [patch V4 05/10] rtmutex: Clarify the boost/deboost part
  2014-06-11 18:44 [patch V4 00/10] rtmutex: Code clarification and optimization Thomas Gleixner
                   ` (4 preceding siblings ...)
  2014-06-11 18:44 ` [patch V4 06/10] rtmutex: Document pi chain walk Thomas Gleixner
@ 2014-06-11 18:44 ` Thomas Gleixner
  2014-06-13 16:35   ` Steven Rostedt
  2014-06-11 18:44 ` [patch V4 07/10] rtmutex: Simplify remove_waiter() Thomas Gleixner
                   ` (4 subsequent siblings)
  10 siblings, 1 reply; 27+ messages in thread
From: Thomas Gleixner @ 2014-06-11 18:44 UTC (permalink / raw)
  To: LKML; +Cc: Steven Rostedt, Peter Zijlstra, Ingo Molnar

[-- Attachment #1: rtmutex-clarify-the-lock-chain-walk.patch --]
[-- Type: text/plain, Size: 4249 bytes --]

Add a separate local variable for the boost/deboost logic to make the
code more readable. Add comments where appropriate.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/locking/rtmutex.c |   58 ++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 48 insertions(+), 10 deletions(-)

Index: tip/kernel/locking/rtmutex.c
===================================================================
--- tip.orig/kernel/locking/rtmutex.c
+++ tip/kernel/locking/rtmutex.c
@@ -345,9 +345,10 @@ static int rt_mutex_adjust_prio_chain(st
 				      struct rt_mutex_waiter *orig_waiter,
 				      struct task_struct *top_task)
 {
-	struct rt_mutex *lock;
 	struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter;
+	struct rt_mutex_waiter *prerequeue_top_waiter;
 	int detect_deadlock, ret = 0, depth = 0;
+	struct rt_mutex *lock;
 	unsigned long flags;
 
 	detect_deadlock = debug_rt_mutex_detect_deadlock(orig_waiter,
@@ -454,9 +455,14 @@ static int rt_mutex_adjust_prio_chain(st
 		goto out_unlock_pi;
 	}
 
-	top_waiter = rt_mutex_top_waiter(lock);
+	/*
+	 * Store the current top waiter before doing the requeue
+	 * operation on @lock. We need it for the boost/deboost
+	 * decision below.
+	 */
+	prerequeue_top_waiter = rt_mutex_top_waiter(lock);
 
-	/* Requeue the waiter */
+	/* Requeue the waiter in the lock waiter list. */
 	rt_mutex_dequeue(lock, waiter);
 	waiter->prio = task->prio;
 	rt_mutex_enqueue(lock, waiter);
@@ -465,35 +471,58 @@ static int rt_mutex_adjust_prio_chain(st
 	raw_spin_unlock_irqrestore(&task->pi_lock, flags);
 	put_task_struct(task);
 
+	/*
+	 * We must abort the chain walk if there is no lock owner even
+	 * in the dead lock detection case, as we have nothing to
+	 * follow here. This is the end of the chain we are walking.
+	 */
 	if (!rt_mutex_owner(lock)) {
 		/*
 		 * If the requeue above changed the top waiter, then we need
 		 * to wake the new top waiter up to try to get the lock.
 		 */
-
-		if (top_waiter != rt_mutex_top_waiter(lock))
+		if (prerequeue_top_waiter != rt_mutex_top_waiter(lock))
 			wake_up_process(rt_mutex_top_waiter(lock)->task);
 		raw_spin_unlock(&lock->wait_lock);
 		return 0;
 	}
 
-	/* Grab the next task */
+	/* Grab the next task, i.e. the owner of @lock */
 	task = rt_mutex_owner(lock);
 	get_task_struct(task);
 	raw_spin_lock_irqsave(&task->pi_lock, flags);
 
 	if (waiter == rt_mutex_top_waiter(lock)) {
-		/* Boost the owner */
-		rt_mutex_dequeue_pi(task, top_waiter);
+		/*
+		 * The waiter became the new top (highest priority)
+		 * waiter on the lock. Replace the previous top waiter
+		 * in the owner tasks pi waiters list with this waiter
+		 * and adjust the priority of the owner.
+		 */
+		rt_mutex_dequeue_pi(task, prerequeue_top_waiter);
 		rt_mutex_enqueue_pi(task, waiter);
 		__rt_mutex_adjust_prio(task);
 
-	} else if (top_waiter == waiter) {
-		/* Deboost the owner */
+	} else if (prerequeue_top_waiter == waiter) {
+		/*
+		 * The waiter was the top waiter on the lock, but is
+		 * no longer the top priority waiter. Replace waiter in
+		 * the owner tasks pi waiters list with the new top
+		 * (highest priority) waiter and adjust the priority
+		 * of the owner.
+		 * The new top waiter is stored in @waiter so that
+		 * @waiter == @top_waiter evaluates to true below and
+		 * we continue to deboost the rest of the chain.
+		 */
 		rt_mutex_dequeue_pi(task, waiter);
 		waiter = rt_mutex_top_waiter(lock);
 		rt_mutex_enqueue_pi(task, waiter);
 		__rt_mutex_adjust_prio(task);
+	} else {
+		/*
+		 * Nothing changed. No need to do any priority
+		 * adjustment.
+		 */
 	}
 
 	/*
@@ -506,6 +535,10 @@ static int rt_mutex_adjust_prio_chain(st
 
 	raw_spin_unlock_irqrestore(&task->pi_lock, flags);
 
+	/*
+	 * Store the top waiter of @lock for the end of chain walk
+	 * decision below.
+	 */
 	top_waiter = rt_mutex_top_waiter(lock);
 	raw_spin_unlock(&lock->wait_lock);
 
@@ -516,6 +549,11 @@ static int rt_mutex_adjust_prio_chain(st
 	if (!next_lock)
 		goto out_put_task;
 
+	/*
+	 * If the current waiter is not the top waiter on the lock,
+	 * then we can stop the chain walk here if we are not in full
+	 * deadlock detection mode.
+	 */
 	if (!detect_deadlock && waiter != top_waiter)
 		goto out_put_task;
 




* [patch V4 07/10] rtmutex: Simplify remove_waiter()
  2014-06-11 18:44 [patch V4 00/10] rtmutex: Code clarification and optimization Thomas Gleixner
                   ` (5 preceding siblings ...)
  2014-06-11 18:44 ` [patch V4 05/10] rtmutex: Clarify the boost/deboost part Thomas Gleixner
@ 2014-06-11 18:44 ` Thomas Gleixner
  2014-06-13 16:58   ` Steven Rostedt
  2014-06-11 18:44 ` [patch V4 09/10] rtmutex: Cleanup deadlock detector debug logic Thomas Gleixner
                   ` (3 subsequent siblings)
  10 siblings, 1 reply; 27+ messages in thread
From: Thomas Gleixner @ 2014-06-11 18:44 UTC (permalink / raw)
  To: LKML; +Cc: Steven Rostedt, Peter Zijlstra, Ingo Molnar

[-- Attachment #1: rtmutex-simplify-deboost.patch --]
[-- Type: text/plain, Size: 2230 bytes --]

Exit right away when the removed waiter was not the top priority
waiter on the lock. Get rid of the extra indentation level.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/locking/rtmutex.c |   36 +++++++++++++++++++-----------------
 1 file changed, 19 insertions(+), 17 deletions(-)

Index: tip/kernel/locking/rtmutex.c
===================================================================
--- tip.orig/kernel/locking/rtmutex.c
+++ tip/kernel/locking/rtmutex.c
@@ -920,9 +920,9 @@ static void wakeup_next_waiter(struct rt
 static void remove_waiter(struct rt_mutex *lock,
 			  struct rt_mutex_waiter *waiter)
 {
-	int first = (waiter == rt_mutex_top_waiter(lock));
+	bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
 	struct task_struct *owner = rt_mutex_owner(lock);
-	struct rt_mutex *next_lock = NULL;
+	struct rt_mutex *next_lock;
 	unsigned long flags;
 
 	raw_spin_lock_irqsave(&current->pi_lock, flags);
@@ -930,29 +930,31 @@ static void remove_waiter(struct rt_mute
 	current->pi_blocked_on = NULL;
 	raw_spin_unlock_irqrestore(&current->pi_lock, flags);
 
-	if (!owner)
+	/*
+	 * Only update priority if the waiter was the highest priority
+	 * waiter of the lock and there is an owner to update.
+	 */
+	if (!owner || !is_top_waiter)
 		return;
 
-	if (first) {
+	raw_spin_lock_irqsave(&owner->pi_lock, flags);
 
-		raw_spin_lock_irqsave(&owner->pi_lock, flags);
+	rt_mutex_dequeue_pi(owner, waiter);
 
-		rt_mutex_dequeue_pi(owner, waiter);
+	if (rt_mutex_has_waiters(lock))
+		rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock));
 
-		if (rt_mutex_has_waiters(lock)) {
-			struct rt_mutex_waiter *next;
+	__rt_mutex_adjust_prio(owner);
 
-			next = rt_mutex_top_waiter(lock);
-			rt_mutex_enqueue_pi(owner, next);
-		}
-		__rt_mutex_adjust_prio(owner);
+	/* Store the lock on which owner is blocked or NULL */
+	next_lock = task_blocked_on_lock(owner);
 
-		/* Store the lock on which owner is blocked or NULL */
-		next_lock = task_blocked_on_lock(owner);
-
-		raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
-	}
+	raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
 
+	/*
+	 * Don't walk the chain, if the owner task is not blocked
+	 * itself.
+	 */
 	if (!next_lock)
 		return;
 




* [patch V4 09/10] rtmutex: Cleanup deadlock detector debug logic
  2014-06-11 18:44 [patch V4 00/10] rtmutex: Code clarification and optimization Thomas Gleixner
                   ` (6 preceding siblings ...)
  2014-06-11 18:44 ` [patch V4 07/10] rtmutex: Simplify remove_waiter() Thomas Gleixner
@ 2014-06-11 18:44 ` Thomas Gleixner
  2014-06-13 17:19   ` Steven Rostedt
  2014-06-11 18:44 ` [patch V4 08/10] rtmutex: Confine deadlock logic to futex Thomas Gleixner
                   ` (2 subsequent siblings)
  10 siblings, 1 reply; 27+ messages in thread
From: Thomas Gleixner @ 2014-06-11 18:44 UTC (permalink / raw)
  To: LKML; +Cc: Steven Rostedt, Peter Zijlstra, Ingo Molnar, Lai Jiangshan

[-- Attachment #1: rtmutex-cleanup-deadlock-detector-debug-logic.patch --]
[-- Type: text/plain, Size: 11140 bytes --]

The conditions under which deadlock detection is conducted are unclear
and undocumented.

Add constants instead of using 0/1 and provide a selection function
which hides the additional debug dependency from the calling code.

Add comments where needed.
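
Condensed from the patch below, the new pieces are:

	/* rtmutex_common.h */
	enum rtmutex_chainwalk {
		RT_MUTEX_MIN_CHAINWALK,		/* Stop when no further PI adjustment is needed */
		RT_MUTEX_FULL_CHAINWALK,	/* Full chain walk for deadlock detection */
	};

	/* rtmutex.h (CONFIG_DEBUG_RT_MUTEXES=n): honor the caller's request */
	static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *w,
							  enum rtmutex_chainwalk walk)
	{
		return walk == RT_MUTEX_FULL_CHAINWALK;
	}

	/* rtmutex-debug.h (CONFIG_DEBUG_RT_MUTEXES=y): detect whenever there is a waiter */
	static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter,
							  enum rtmutex_chainwalk walk)
	{
		return waiter != NULL;
	}

	/* rtmutex.c: hides the debug dependency from the callers */
	static bool rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter,
						  enum rtmutex_chainwalk chwalk)
	{
		return debug_rt_mutex_detect_deadlock(waiter, chwalk);
	}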

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Link: http://lkml.kernel.org/r/20140522031949.947264874@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/locking/rtmutex-debug.c  |    5 +-
 kernel/locking/rtmutex-debug.h  |    7 ++--
 kernel/locking/rtmutex.c        |   70 +++++++++++++++++++++++++++-------------
 kernel/locking/rtmutex.h        |    7 +++-
 kernel/locking/rtmutex_common.h |   15 ++++++++
 5 files changed, 76 insertions(+), 28 deletions(-)

Index: tip/kernel/locking/rtmutex-debug.c
===================================================================
--- tip.orig/kernel/locking/rtmutex-debug.c
+++ tip/kernel/locking/rtmutex-debug.c
@@ -66,12 +66,13 @@ void rt_mutex_debug_task_free(struct tas
  * the deadlock. We print when we return. act_waiter can be NULL in
  * case of a remove waiter operation.
  */
-void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter,
+void debug_rt_mutex_deadlock(enum rtmutex_chainwalk chwalk,
+			     struct rt_mutex_waiter *act_waiter,
 			     struct rt_mutex *lock)
 {
 	struct task_struct *task;
 
-	if (!debug_locks || detect || !act_waiter)
+	if (!debug_locks || chwalk == RT_MUTEX_FULL_CHAINWALK || !act_waiter)
 		return;
 
 	task = rt_mutex_owner(act_waiter->lock);
Index: tip/kernel/locking/rtmutex-debug.h
===================================================================
--- tip.orig/kernel/locking/rtmutex-debug.h
+++ tip/kernel/locking/rtmutex-debug.h
@@ -20,14 +20,15 @@ extern void debug_rt_mutex_unlock(struct
 extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock,
 				      struct task_struct *powner);
 extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock);
-extern void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *waiter,
+extern void debug_rt_mutex_deadlock(enum rtmutex_chainwalk chwalk,
+				    struct rt_mutex_waiter *waiter,
 				    struct rt_mutex *lock);
 extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter);
 # define debug_rt_mutex_reset_waiter(w)			\
 	do { (w)->deadlock_lock = NULL; } while (0)
 
-static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter,
-						 int detect)
+static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter,
+						  enum rtmutex_chainwalk walk)
 {
 	return (waiter != NULL);
 }
Index: tip/kernel/locking/rtmutex.c
===================================================================
--- tip.orig/kernel/locking/rtmutex.c
+++ tip/kernel/locking/rtmutex.c
@@ -308,6 +308,25 @@ static void rt_mutex_adjust_prio(struct
 }
 
 /*
+ * Deadlock detection is conditional:
+ *
+ * If CONFIG_DEBUG_RT_MUTEXES=n, deadlock detection is only conducted
+ * if the detect argument is == RT_MUTEX_FULL_CHAINWALK.
+ *
+ * If CONFIG_DEBUG_RT_MUTEXES=y, deadlock detection is always
+ * conducted independent of the detect argument.
+ *
+ * If the waiter argument is NULL this indicates the deboost path and
+ * deadlock detection is disabled independent of the detect argument
+ * and the config settings.
+ */
+static bool rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter,
+					  enum rtmutex_chainwalk chwalk)
+{
+	return debug_rt_mutex_detect_deadlock(waiter, chwalk);
+}
+
+/*
  * Max number of times we'll walk the boosting chain:
  */
 int max_lock_depth = 1024;
@@ -381,7 +400,7 @@ static inline struct rt_mutex *task_bloc
  *	  goto again;
  */
 static int rt_mutex_adjust_prio_chain(struct task_struct *task,
-				      int deadlock_detect,
+				      enum rtmutex_chainwalk chwalk,
 				      struct rt_mutex *orig_lock,
 				      struct rt_mutex *next_lock,
 				      struct rt_mutex_waiter *orig_waiter,
@@ -389,12 +408,12 @@ static int rt_mutex_adjust_prio_chain(st
 {
 	struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter;
 	struct rt_mutex_waiter *prerequeue_top_waiter;
-	int detect_deadlock, ret = 0, depth = 0;
+	int ret = 0, depth = 0;
 	struct rt_mutex *lock;
+	bool detect_deadlock;
 	unsigned long flags;
 
-	detect_deadlock = debug_rt_mutex_detect_deadlock(orig_waiter,
-							 deadlock_detect);
+	detect_deadlock = rt_mutex_cond_detect_deadlock(orig_waiter, chwalk);
 
 	/*
 	 * The (de)boosting is a step by step approach with a lot of
@@ -520,7 +539,7 @@ static int rt_mutex_adjust_prio_chain(st
 	 * walk, we detected a deadlock.
 	 */
 	if (lock == orig_lock || rt_mutex_owner(lock) == top_task) {
-		debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock);
+		debug_rt_mutex_deadlock(chwalk, orig_waiter, lock);
 		raw_spin_unlock(&lock->wait_lock);
 		ret = -EDEADLK;
 		goto out_unlock_pi;
@@ -784,7 +803,7 @@ takeit:
 static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
 				   struct rt_mutex_waiter *waiter,
 				   struct task_struct *task,
-				   int detect_deadlock)
+				   enum rtmutex_chainwalk chwalk)
 {
 	struct task_struct *owner = rt_mutex_owner(lock);
 	struct rt_mutex_waiter *top_waiter = waiter;
@@ -830,7 +849,7 @@ static int task_blocks_on_rt_mutex(struc
 		__rt_mutex_adjust_prio(owner);
 		if (owner->pi_blocked_on)
 			chain_walk = 1;
-	} else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) {
+	} else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
 		chain_walk = 1;
 	}
 
@@ -855,7 +874,7 @@ static int task_blocks_on_rt_mutex(struc
 
 	raw_spin_unlock(&lock->wait_lock);
 
-	res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock,
+	res = rt_mutex_adjust_prio_chain(owner, chwalk, lock,
 					 next_lock, waiter, task);
 
 	raw_spin_lock(&lock->wait_lock);
@@ -963,7 +982,8 @@ static void remove_waiter(struct rt_mute
 
 	raw_spin_unlock(&lock->wait_lock);
 
-	rt_mutex_adjust_prio_chain(owner, 0, lock, next_lock, NULL, current);
+	rt_mutex_adjust_prio_chain(owner, RT_MUTEX_MIN_CHAINWALK, lock,
+				   next_lock, NULL, current);
 
 	raw_spin_lock(&lock->wait_lock);
 }
@@ -993,7 +1013,8 @@ void rt_mutex_adjust_pi(struct task_stru
 	/* gets dropped in rt_mutex_adjust_prio_chain()! */
 	get_task_struct(task);
 
-	rt_mutex_adjust_prio_chain(task, 0, NULL, next_lock, NULL, task);
+	rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL, NULL,
+				   NULL, task);
 }
 
 /**
@@ -1071,7 +1092,7 @@ static void rt_mutex_handle_deadlock(int
 static int __sched
 rt_mutex_slowlock(struct rt_mutex *lock, int state,
 		  struct hrtimer_sleeper *timeout,
-		  int detect_deadlock)
+		  enum rtmutex_chainwalk chwalk)
 {
 	struct rt_mutex_waiter waiter;
 	int ret = 0;
@@ -1097,7 +1118,7 @@ rt_mutex_slowlock(struct rt_mutex *lock,
 			timeout->task = NULL;
 	}
 
-	ret = task_blocks_on_rt_mutex(lock, &waiter, current, detect_deadlock);
+	ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk);
 
 	if (likely(!ret))
 		ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
@@ -1106,7 +1127,7 @@ rt_mutex_slowlock(struct rt_mutex *lock,
 
 	if (unlikely(ret)) {
 		remove_waiter(lock, &waiter);
-		rt_mutex_handle_deadlock(ret, detect_deadlock, &waiter);
+		rt_mutex_handle_deadlock(ret, chwalk, &waiter);
 	}
 
 	/*
@@ -1234,27 +1255,29 @@ static inline int
 rt_mutex_fastlock(struct rt_mutex *lock, int state,
 		  int (*slowfn)(struct rt_mutex *lock, int state,
 				struct hrtimer_sleeper *timeout,
-				int detect_deadlock))
+				enum rtmutex_chainwalk chwalk))
 {
 	if (likely(rt_mutex_cmpxchg(lock, NULL, current))) {
 		rt_mutex_deadlock_account_lock(lock, current);
 		return 0;
 	} else
-		return slowfn(lock, state, NULL, 0);
+		return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
 }
 
 static inline int
 rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
-			struct hrtimer_sleeper *timeout, int detect_deadlock,
+			struct hrtimer_sleeper *timeout,
+			enum rtmutex_chainwalk chwalk,
 			int (*slowfn)(struct rt_mutex *lock, int state,
 				      struct hrtimer_sleeper *timeout,
-				      int detect_deadlock))
+				      enum rtmutex_chainwalk chwalk))
 {
-	if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) {
+	if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
+	    likely(rt_mutex_cmpxchg(lock, NULL, current))) {
 		rt_mutex_deadlock_account_lock(lock, current);
 		return 0;
 	} else
-		return slowfn(lock, state, timeout, detect_deadlock);
+		return slowfn(lock, state, timeout, chwalk);
 }
 
 static inline int
@@ -1316,7 +1339,8 @@ int __rt_mutex_timed_lock(struct rt_mute
 {
 	might_sleep();
 
-	return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, 1,
+	return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
+				       RT_MUTEX_FULL_CHAINWALK,
 				       rt_mutex_slowlock);
 }
 
@@ -1338,7 +1362,8 @@ rt_mutex_timed_lock(struct rt_mutex *loc
 {
 	might_sleep();
 
-	return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, 0,
+	return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
+				       RT_MUTEX_MIN_CHAINWALK,
 				       rt_mutex_slowlock);
 }
 EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
@@ -1467,7 +1492,8 @@ int rt_mutex_start_proxy_lock(struct rt_
 	}
 
 	/* We enforce deadlock detection for futexes */
-	ret = task_blocks_on_rt_mutex(lock, waiter, task, 1);
+	ret = task_blocks_on_rt_mutex(lock, waiter, task,
+				      RT_MUTEX_FULL_CHAINWALK);
 
 	if (ret && !rt_mutex_owner(lock)) {
 		/*
Index: tip/kernel/locking/rtmutex.h
===================================================================
--- tip.orig/kernel/locking/rtmutex.h
+++ tip/kernel/locking/rtmutex.h
@@ -22,10 +22,15 @@
 #define debug_rt_mutex_init(m, n)			do { } while (0)
 #define debug_rt_mutex_deadlock(d, a ,l)		do { } while (0)
 #define debug_rt_mutex_print_deadlock(w)		do { } while (0)
-#define debug_rt_mutex_detect_deadlock(w,d)		(d)
 #define debug_rt_mutex_reset_waiter(w)			do { } while (0)
 
 static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w)
 {
 	WARN(1, "rtmutex deadlock detected\n");
 }
+
+static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *w,
+						  enum rtmutex_chainwalk walk)
+{
+	return walk == RT_MUTEX_FULL_CHAINWALK;
+}
Index: tip/kernel/locking/rtmutex_common.h
===================================================================
--- tip.orig/kernel/locking/rtmutex_common.h
+++ tip/kernel/locking/rtmutex_common.h
@@ -102,6 +102,21 @@ static inline struct task_struct *rt_mut
 }
 
 /*
+ * Constants for rt mutex functions which have a selectable deadlock
+ * detection.
+ *
+ * RT_MUTEX_MIN_CHAINWALK:	Stops the lock chain walk when there are
+ *				no further PI adjustments to be made.
+ *
+ * RT_MUTEX_FULL_CHAINWALK:	Invoke deadlock detection with a full
+ *				walk of the lock chain.
+ */
+enum rtmutex_chainwalk {
+	RT_MUTEX_MIN_CHAINWALK,
+	RT_MUTEX_FULL_CHAINWALK,
+};
+
+/*
  * PI-futex support (proxy locking functions, etc.):
  */
 extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);




* [patch V4 08/10] rtmutex: Confine deadlock logic to futex
  2014-06-11 18:44 [patch V4 00/10] rtmutex: Code clarification and optimization Thomas Gleixner
                   ` (7 preceding siblings ...)
  2014-06-11 18:44 ` [patch V4 09/10] rtmutex: Cleanup deadlock detector debug logic Thomas Gleixner
@ 2014-06-11 18:44 ` Thomas Gleixner
  2014-06-13 17:10   ` Steven Rostedt
  2014-06-11 18:44 ` [patch V4 10/10] rtmutex: Avoid pointless requeueing in the deadlock detection chain walk Thomas Gleixner
  2014-06-22  8:45 ` [patch V4 00/10] rtmutex: Code clarification and optimization Ingo Molnar
  10 siblings, 1 reply; 27+ messages in thread
From: Thomas Gleixner @ 2014-06-11 18:44 UTC (permalink / raw)
  To: LKML; +Cc: Steven Rostedt, Peter Zijlstra, Ingo Molnar

[-- Attachment #1: rtmutex-confine-deadlock-logic-to-futex.patch --]
[-- Type: text/plain, Size: 7881 bytes --]

The deadlock logic is only required for futexes.

Remove the extra arguments from the public functions and also from the
futex-specific ones, which are always called with deadlock detection
enabled.
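
Condensed from the patch below, the interface change boils down to:

	/* Before: deadlock detection selectable by the caller */
	extern int rt_mutex_lock_interruptible(struct rt_mutex *lock,
					       int detect_deadlock);
	extern int rt_mutex_timed_lock(struct rt_mutex *lock,
				       struct hrtimer_sleeper *timeout,
				       int detect_deadlock);

	/* After: no detection argument in the public interfaces ... */
	extern int rt_mutex_lock_interruptible(struct rt_mutex *lock);
	extern int rt_mutex_timed_lock(struct rt_mutex *lock,
				       struct hrtimer_sleeper *timeout);

	/* ... and a futex only variant with full deadlock detection */
	extern int __rt_mutex_timed_lock(struct rt_mutex *l,
					 struct hrtimer_sleeper *to);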

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/rtmutex.h         |    6 +---
 kernel/futex.c                  |   10 +++---
 kernel/locking/rtmutex.c        |   59 ++++++++++++++++++++--------------------
 kernel/locking/rtmutex_common.h |    7 ++--
 4 files changed, 40 insertions(+), 42 deletions(-)

Index: tip/include/linux/rtmutex.h
===================================================================
--- tip.orig/include/linux/rtmutex.h
+++ tip/include/linux/rtmutex.h
@@ -90,11 +90,9 @@ extern void __rt_mutex_init(struct rt_mu
 extern void rt_mutex_destroy(struct rt_mutex *lock);
 
 extern void rt_mutex_lock(struct rt_mutex *lock);
-extern int rt_mutex_lock_interruptible(struct rt_mutex *lock,
-						int detect_deadlock);
+extern int rt_mutex_lock_interruptible(struct rt_mutex *lock);
 extern int rt_mutex_timed_lock(struct rt_mutex *lock,
-					struct hrtimer_sleeper *timeout,
-					int detect_deadlock);
+			       struct hrtimer_sleeper *timeout);
 
 extern int rt_mutex_trylock(struct rt_mutex *lock);
 
Index: tip/kernel/futex.c
===================================================================
--- tip.orig/kernel/futex.c
+++ tip/kernel/futex.c
@@ -1718,7 +1718,7 @@ retry_private:
 			this->pi_state = pi_state;
 			ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
 							this->rt_waiter,
-							this->task, 1);
+							this->task);
 			if (ret == 1) {
 				/* We got the lock. */
 				requeue_pi_wake_futex(this, &key2, hb2);
@@ -2337,9 +2337,9 @@ retry_private:
 	/*
 	 * Block on the PI mutex:
 	 */
-	if (!trylock)
-		ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1);
-	else {
+	if (!trylock) {
+		ret = __rt_mutex_timed_lock(&q.pi_state->pi_mutex, to);
+	} else {
 		ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
 		/* Fixup the trylock return value: */
 		ret = ret ? 0 : -EWOULDBLOCK;
@@ -2669,7 +2669,7 @@ static int futex_wait_requeue_pi(u32 __u
 		 */
 		WARN_ON(!q.pi_state);
 		pi_mutex = &q.pi_state->pi_mutex;
-		ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1);
+		ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter);
 		debug_rt_mutex_free_waiter(&rt_waiter);
 
 		spin_lock(q.lock_ptr);
Index: tip/kernel/locking/rtmutex.c
===================================================================
--- tip.orig/kernel/locking/rtmutex.c
+++ tip/kernel/locking/rtmutex.c
@@ -1232,16 +1232,15 @@ rt_mutex_slowunlock(struct rt_mutex *loc
  */
 static inline int
 rt_mutex_fastlock(struct rt_mutex *lock, int state,
-		  int detect_deadlock,
 		  int (*slowfn)(struct rt_mutex *lock, int state,
 				struct hrtimer_sleeper *timeout,
 				int detect_deadlock))
 {
-	if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) {
+	if (likely(rt_mutex_cmpxchg(lock, NULL, current))) {
 		rt_mutex_deadlock_account_lock(lock, current);
 		return 0;
 	} else
-		return slowfn(lock, state, NULL, detect_deadlock);
+		return slowfn(lock, state, NULL, 0);
 }
 
 static inline int
@@ -1288,54 +1287,59 @@ void __sched rt_mutex_lock(struct rt_mut
 {
 	might_sleep();
 
-	rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, 0, rt_mutex_slowlock);
+	rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock);
 }
 EXPORT_SYMBOL_GPL(rt_mutex_lock);
 
 /**
  * rt_mutex_lock_interruptible - lock a rt_mutex interruptible
  *
- * @lock: 		the rt_mutex to be locked
- * @detect_deadlock:	deadlock detection on/off
+ * @lock:		the rt_mutex to be locked
  *
  * Returns:
- *  0 		on success
- * -EINTR 	when interrupted by a signal
- * -EDEADLK	when the lock would deadlock (when deadlock detection is on)
+ *  0		on success
+ * -EINTR	when interrupted by a signal
  */
-int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock,
-						 int detect_deadlock)
+int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
 {
 	might_sleep();
 
-	return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE,
-				 detect_deadlock, rt_mutex_slowlock);
+	return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock);
 }
 EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
 
+/*
+ * Futex variant with full deadlock detection.
+ */
+int __rt_mutex_timed_lock(struct rt_mutex *lock,
+			  struct hrtimer_sleeper *timeout)
+{
+	might_sleep();
+
+	return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, 1,
+				       rt_mutex_slowlock);
+}
+
 /**
  * rt_mutex_timed_lock - lock a rt_mutex interruptible
  *			the timeout structure is provided
  *			by the caller
  *
- * @lock: 		the rt_mutex to be locked
+ * @lock:		the rt_mutex to be locked
  * @timeout:		timeout structure or NULL (no timeout)
- * @detect_deadlock:	deadlock detection on/off
  *
  * Returns:
- *  0 		on success
- * -EINTR 	when interrupted by a signal
+ *  0		on success
+ * -EINTR	when interrupted by a signal
  * -ETIMEDOUT	when the timeout expired
- * -EDEADLK	when the lock would deadlock (when deadlock detection is on)
  */
 int
-rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout,
-		    int detect_deadlock)
+rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout)
 {
 	might_sleep();
 
-	return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
-				       detect_deadlock, rt_mutex_slowlock);
+	return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, 0,
+				       rt_mutex_slowlock);
 }
 EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
 
@@ -1441,7 +1445,6 @@ void rt_mutex_proxy_unlock(struct rt_mut
  * @lock:		the rt_mutex to take
  * @waiter:		the pre-initialized rt_mutex_waiter
  * @task:		the task to prepare
- * @detect_deadlock:	perform deadlock detection (1) or not (0)
  *
  * Returns:
  *  0 - task blocked on lock
@@ -1452,7 +1455,7 @@ void rt_mutex_proxy_unlock(struct rt_mut
  */
 int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
 			      struct rt_mutex_waiter *waiter,
-			      struct task_struct *task, int detect_deadlock)
+			      struct task_struct *task)
 {
 	int ret;
 
@@ -1510,22 +1513,20 @@ struct task_struct *rt_mutex_next_owner(
  * rt_mutex_finish_proxy_lock() - Complete lock acquisition
  * @lock:		the rt_mutex we were woken on
  * @to:			the timeout, null if none. hrtimer should already have
- * 			been started.
+ *			been started.
  * @waiter:		the pre-initialized rt_mutex_waiter
- * @detect_deadlock:	perform deadlock detection (1) or not (0)
  *
  * Complete the lock acquisition started our behalf by another thread.
  *
  * Returns:
  *  0 - success
- * <0 - error, one of -EINTR, -ETIMEDOUT, or -EDEADLK
+ * <0 - error, one of -EINTR, -ETIMEDOUT
  *
  * Special API call for PI-futex requeue support
  */
 int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
 			       struct hrtimer_sleeper *to,
-			       struct rt_mutex_waiter *waiter,
-			       int detect_deadlock)
+			       struct rt_mutex_waiter *waiter)
 {
 	int ret;
 
Index: tip/kernel/locking/rtmutex_common.h
===================================================================
--- tip.orig/kernel/locking/rtmutex_common.h
+++ tip/kernel/locking/rtmutex_common.h
@@ -111,12 +111,11 @@ extern void rt_mutex_proxy_unlock(struct
 				  struct task_struct *proxy_owner);
 extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
 				     struct rt_mutex_waiter *waiter,
-				     struct task_struct *task,
-				     int detect_deadlock);
+				     struct task_struct *task);
 extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
 				      struct hrtimer_sleeper *to,
-				      struct rt_mutex_waiter *waiter,
-				      int detect_deadlock);
+				      struct rt_mutex_waiter *waiter);
+extern int __rt_mutex_timed_lock(struct rt_mutex *l, struct hrtimer_sleeper *to);
 
 #ifdef CONFIG_DEBUG_RT_MUTEXES
 # include "rtmutex-debug.h"




* [patch V4 10/10] rtmutex: Avoid pointless requeueing in the deadlock detection chain walk
  2014-06-11 18:44 [patch V4 00/10] rtmutex: Code clarification and optimization Thomas Gleixner
                   ` (8 preceding siblings ...)
  2014-06-11 18:44 ` [patch V4 08/10] rtmutex: Confine deadlock logic to futex Thomas Gleixner
@ 2014-06-11 18:44 ` Thomas Gleixner
  2014-06-13 17:28   ` Steven Rostedt
  2014-06-22  8:45 ` [patch V4 00/10] rtmutex: Code clarification and optimization Ingo Molnar
  10 siblings, 1 reply; 27+ messages in thread
From: Thomas Gleixner @ 2014-06-11 18:44 UTC (permalink / raw)
  To: LKML; +Cc: Steven Rostedt, Peter Zijlstra, Ingo Molnar, Lai Jiangshan

[-- Attachment #1: rtmutex-avoid-pointless-requeueing-in-the-deadlock-detection-chain-walk.patch --]
[-- Type: text/plain, Size: 4035 bytes --]

In case the deadlock detector is enabled we follow the lock chain to
the end in rt_mutex_adjust_prio_chain, even if we could stop earlier
due to the priority/waiter constellation.

But once we are not longer the top priority waiter in a certain step
or the task holding the lock already has the same priority, then there
is no point in dequeueing and enqueueing along the lock chain as there is
no change at all.

So stop the queueing at this point.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Link: http://lkml.kernel.org/r/20140522031950.280830190@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/locking/rtmutex.c |   77 ++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 70 insertions(+), 7 deletions(-)

Index: tip/kernel/locking/rtmutex.c
===================================================================
--- tip.orig/kernel/locking/rtmutex.c
+++ tip/kernel/locking/rtmutex.c
@@ -412,6 +412,7 @@ static int rt_mutex_adjust_prio_chain(st
 	struct rt_mutex *lock;
 	bool detect_deadlock;
 	unsigned long flags;
+	bool requeue = true;
 
 	detect_deadlock = rt_mutex_cond_detect_deadlock(orig_waiter, chwalk);
 
@@ -501,18 +502,31 @@ static int rt_mutex_adjust_prio_chain(st
 			goto out_unlock_pi;
 		/*
 		 * If deadlock detection is off, we stop here if we
-		 * are not the top pi waiter of the task.
+		 * are not the top pi waiter of the task. If deadlock
+		 * detection is enabled we continue, but stop the
+		 * requeueing in the chain walk.
 		 */
-		if (!detect_deadlock && top_waiter != task_top_pi_waiter(task))
-			goto out_unlock_pi;
+		if (top_waiter != task_top_pi_waiter(task)) {
+			if (!detect_deadlock)
+				goto out_unlock_pi;
+			else
+				requeue = false;
+		}
 	}
 
 	/*
-	 * When deadlock detection is off then we check, if further
-	 * priority adjustment is necessary.
+	 * If the waiter priority is the same as the task priority
+	 * then there is no further priority adjustment necessary.  If
+	 * deadlock detection is off, we stop the chain walk. If its
+	 * enabled we continue, but stop the requeueing in the chain
+	 * walk.
 	 */
-	if (!detect_deadlock && waiter->prio == task->prio)
-		goto out_unlock_pi;
+	if (waiter->prio == task->prio) {
+		if (!detect_deadlock)
+			goto out_unlock_pi;
+		else
+			requeue = false;
+	}
 
 	/*
 	 * [4] Get the next lock
@@ -546,6 +560,55 @@ static int rt_mutex_adjust_prio_chain(st
 	}
 
 	/*
+	 * If we just follow the lock chain for deadlock detection, no
+	 * need to do all the requeue operations. To avoid a truckload
+	 * of conditionals around the various places below, just do the
+	 * minimum chain walk checks.
+	 */
+	if (!requeue) {
+		/*
+		 * No requeue[7] here. Just release @task [8]
+		 */
+		raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+		put_task_struct(task);
+
+		/*
+		 * [9] check_exit_conditions_3 protected by lock->wait_lock.
+		 * If there is no owner of the lock, end of chain.
+		 */
+		if (!rt_mutex_owner(lock)) {
+			raw_spin_unlock(&lock->wait_lock);
+			return 0;
+		}
+
+		/* [10] Grab the next task, i.e. owner of @lock */
+		task = rt_mutex_owner(lock);
+		get_task_struct(task);
+		raw_spin_lock_irqsave(&task->pi_lock, flags);
+
+		/*
+		 * No requeue [11] here. We just do deadlock detection.
+		 *
+		 * Get the top waiter for the next iteration
+		 */
+		top_waiter = rt_mutex_top_waiter(lock);
+
+		/*
+		 * [12] Store whether owner is blocked
+		 * itself. Decision is made after dropping the locks
+		 */
+		next_lock = task_blocked_on_lock(task);
+		/* [13] Drop locks */
+		raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+		raw_spin_unlock(&lock->wait_lock);
+
+		/* If owner is not blocked, end of chain. */
+		if (!next_lock)
+			goto out_put_task;
+		goto again;
+	}
+
+	/*
 	 * Store the current top waiter before doing the requeue
 	 * operation on @lock. We need it for the boost/deboost
 	 * decision below.



^ permalink raw reply	[flat|nested] 27+ messages in thread
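
A condensed, standalone sketch of the policy this patch introduces may help
when reading the diff above. It is not the kernel code: all names and types
are invented, and the locking, reference counting and the actual requeue
steps are left out. It only models how one chain walk step now picks between
stopping, walking on purely for deadlock detection, and walking with
requeueing.

#include <stdbool.h>
#include <stdio.h>

/* Toy model only; not the rtmutex implementation. */
enum walk_action { STOP_WALK, WALK_ONLY, WALK_AND_REQUEUE };

/* Lower numeric value means higher priority, as in the kernel. */
static enum walk_action chain_step(bool detect_deadlock,
                                   bool still_top_pi_waiter,
                                   int waiter_prio, int task_prio)
{
        /*
         * If this waiter no longer determines the task's priority, or
         * the priorities already match, there is nothing to propagate.
         * Without deadlock detection the walk ends here; with it we
         * keep following the chain but skip the dequeue/enqueue work.
         */
        if (!still_top_pi_waiter || waiter_prio == task_prio)
                return detect_deadlock ? WALK_ONLY : STOP_WALK;

        return WALK_AND_REQUEUE;
}

int main(void)
{
        /* Priorities already equal, deadlock detection on: walk only. */
        printf("%d\n", chain_step(true, true, 10, 10));
        return 0;
}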

* Re: [patch V4 02/10] rtmutex: Simplify rtmutex_slowtrylock()
  2014-06-11 18:44 ` [patch V4 02/10] rtmutex: Simplify rtmutex_slowtrylock() Thomas Gleixner
@ 2014-06-12  3:34   ` Lai Jiangshan
  2014-06-13 15:58   ` Steven Rostedt
  1 sibling, 0 replies; 27+ messages in thread
From: Lai Jiangshan @ 2014-06-12  3:34 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: LKML, Steven Rostedt, Peter Zijlstra, Ingo Molnar, Oleg Nesterov

On 06/12/2014 02:44 AM, Thomas Gleixner wrote:
> Oleg noticed that rtmutex_slowtrylock() has a pointless check for
> rt_mutex_owner(lock) != current.
> 
> To avoid calling try_to_take_rtmutex() we really want to check whether
> the lock has an owner at all or whether the trylock failed because the
> owner is NULL, but the RT_MUTEX_HAS_WAITERS bit is set. This covers
> the 'lock is owned by caller' situation as well.
> 
> We can actually do this check lockless. trylock is taking a chance
> whether we take lock->wait_lock to do the check or not.
> 
> Add comments to the function while at it.
> 
> Reported-by: Oleg Nesterov <oleg@redhat.com>
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> ---

Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com>

Thanks,
Lai

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [patch V4 01/10] rtmutex: Plug slow unlock race
  2014-06-11 18:44 ` [patch V4 01/10] rtmutex: Plug slow unlock race Thomas Gleixner
@ 2014-06-13 15:41   ` Steven Rostedt
  2014-06-16  8:06   ` [tip:locking/urgent] " tip-bot for Thomas Gleixner
  1 sibling, 0 replies; 27+ messages in thread
From: Steven Rostedt @ 2014-06-13 15:41 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: LKML, Peter Zijlstra, Ingo Molnar

On Wed, 11 Jun 2014 18:44:04 -0000
Thomas Gleixner <tglx@linutronix.de> wrote:

> @@ -671,10 +724,26 @@ static void wakeup_next_waiter(struct rt
>  	 */
>  	rt_mutex_dequeue_pi(current, waiter);
>  
> -	rt_mutex_set_owner(lock, NULL);
> +	/*
> +	 * This is safe versus the fastpath acquire:
> +	 *
> +	 * We do not remove the waiter from the lock waiter list
> +	 * here. It stays the top waiter.
> +	 *
> +	 * We set the owner to NULL, but keep the RT_MUTEX_HAS_WAITERS
> +	 * bit set, which forces all potential new waiters into the
> +	 * slow path. So they are serialized along with all enqueued
> +	 * waiters on lock->wait_lock.
> +	 */

The above comment is a little more complex than it needs to be. We can
probably just consider this an optimization. If we are waking up the
next waiter, this lock obviously has waiters, and the deleted
rt_mutex_set_owner(lock, NULL) would do the below anyway.

	/*
	 * As we are waking up the top waiter, and the waiter stays
	 * queued on the lock until it gets the lock, this lock
	 * obviously has waiters. Just set the bit here and this has
	 * the added benefit of forcing all new tasks into the
	 * slow path making sure no task of lower priority than
	 * the top waiter can steal this lock.
	 */

The rest looks good.

Reviewed-by: Steven Rostedt <rostedt@goodmis.org>

-- Steve


> +	lock->owner = (void *) RT_MUTEX_HAS_WAITERS;
>  
>  	raw_spin_unlock_irqrestore(&current->pi_lock, flags);
>  
> +	/*
> +	 * It's safe to dereference waiter as it cannot go away as
> +	 * long as we hold lock->wait_lock. The waiter task needs to
> +	 * acquire it in order to dequeue the waiter.
> +	 */
>  	wake_up_process(waiter->task);
>  }
>  
> @@ -928,12 +997,49 @@ rt_mutex_slowunlock(struct rt_mutex *loc
>  
>  	rt_mutex_deadlock_account_unlock(current);
>  
> -	if (!rt_mutex_has_waiters(lock)) {
> -		lock->owner = NULL;
> -		raw_spin_unlock(&lock->wait_lock);
> -		return;
> +	/*
> +	 * We must be careful here if the fast path is enabled. If we
> +	 * have no waiters queued we cannot set owner to NULL here
> +	 * because of:
> +	 *
> +	 * foo->lock->owner = NULL;
> +	 *			rtmutex_lock(foo->lock);   <- fast path
> +	 *			free = atomic_dec_and_test(foo->refcnt);
> +	 *			rtmutex_unlock(foo->lock); <- fast path
> +	 *			if (free)
> +	 *				kfree(foo);
> +	 * raw_spin_unlock(foo->lock->wait_lock);
> +	 *
> +	 * So for the fastpath enabled kernel:
> +	 *
> +	 * Nothing can set the waiters bit as long as we hold
> +	 * lock->wait_lock. So we do the following sequence:
> +	 *
> +	 *	owner = rt_mutex_owner(lock);
> +	 *	clear_rt_mutex_waiters(lock);
> +	 *	raw_spin_unlock(&lock->wait_lock);
> +	 *	if (cmpxchg(&lock->owner, owner, 0) == owner)
> +	 *		return;
> +	 *	goto retry;
> +	 *
> +	 * The fastpath disabled variant is simple as all access to
> +	 * lock->owner is serialized by lock->wait_lock:
> +	 *
> +	 *	lock->owner = NULL;
> +	 *	raw_spin_unlock(&lock->wait_lock);
> +	 */
> +	while (!rt_mutex_has_waiters(lock)) {
> +		/* Drops lock->wait_lock ! */
> +		if (unlock_rt_mutex_safe(lock) == true)
> +			return;
> +		/* Relock the rtmutex and try again */
> +		raw_spin_lock(&lock->wait_lock);
>  	}
>  
> +	/*
> +	 * The wakeup next waiter path does not suffer from the above
> +	 * race. See the comments there.
> +	 */
>  	wakeup_next_waiter(lock);
>  
>  	raw_spin_unlock(&lock->wait_lock);
> 


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [patch V4 02/10] rtmutex: Simplify rtmutex_slowtrylock()
  2014-06-11 18:44 ` [patch V4 02/10] rtmutex: Simplify rtmutex_slowtrylock() Thomas Gleixner
  2014-06-12  3:34   ` Lai Jiangshan
@ 2014-06-13 15:58   ` Steven Rostedt
  1 sibling, 0 replies; 27+ messages in thread
From: Steven Rostedt @ 2014-06-13 15:58 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: LKML, Peter Zijlstra, Ingo Molnar, Oleg Nesterov

On Wed, 11 Jun 2014 18:44:04 -0000
Thomas Gleixner <tglx@linutronix.de> wrote:

> Oleg noticed that rtmutex_slowtrylock() has a pointless check for
> rt_mutex_owner(lock) != current.
> 
> To avoid calling try_to_take_rtmutex() we really want to check whether
> the lock has an owner at all or whether the trylock failed because the
> owner is NULL, but the RT_MUTEX_HAS_WAITERS bit is set. This covers
> the 'lock is owned by caller' situation as well.
> 
> We can actually do this check lockless. trylock is taking a chance
> whether we take lock->wait_lock to do the check or not.
> 
> Add comments to the function while at it.
> 
> Reported-by: Oleg Nesterov <oleg@redhat.com>
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> ---
>  kernel/locking/rtmutex.c |   32 +++++++++++++++++++++-----------
>  1 file changed, 21 insertions(+), 11 deletions(-)
> 
> Index: tip/kernel/locking/rtmutex.c
> ===================================================================
> --- tip.orig/kernel/locking/rtmutex.c
> +++ tip/kernel/locking/rtmutex.c
> @@ -963,22 +963,32 @@ rt_mutex_slowlock(struct rt_mutex *lock,
>  /*
>   * Slow path try-lock function:
>   */
> -static inline int
> -rt_mutex_slowtrylock(struct rt_mutex *lock)
> +static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
>  {
> -	int ret = 0;
> +	int ret;
>  
> +	/*
> +	 * trylock is taking a chance. So we dont have to take
> +	 * @lock->wait_lock to figure out whether @lock has a real or

"whether @lock has a real"

 real what?

> +	 * if @lock owner is NULL and the RT_MUTEX_HAS_WAITERS bit is
> +	 * set.

I don't understand the above. As rt_mutex_owner() will ignore the
RT_MUTEX_HAS_WAITERS bit.


I think a simple comment is good enough:

	/*
	 * If the lock already has an owner we fail to get the lock.
	 * This can be done without taking the @lock->wait_lock as
	 * it is only being read, and this is a trylock anyway.
> +	 */
> +	if (rt_mutex_owner(lock))
> +		return 0;
> +
> +	/*
> +	 * The mutex has currently no owner. Lock the wait lock and
> +	 * try to acquire the lock.
> +	 */
>  	raw_spin_lock(&lock->wait_lock);
>  
> -	if (likely(rt_mutex_owner(lock) != current)) {
> +	ret = try_to_take_rt_mutex(lock, current, NULL);
>  
> -		ret = try_to_take_rt_mutex(lock, current, NULL);
> -		/*
> -		 * try_to_take_rt_mutex() sets the lock waiters
> -		 * bit unconditionally. Clean this up.
> -		 */
> -		fixup_rt_mutex_waiters(lock);
> -	}
> +	/*
> +	 * try_to_take_rt_mutex() sets the lock waiters bit
> +	 * unconditionally. Clean this up.
> +	 */
> +	fixup_rt_mutex_waiters(lock);

Rest looks good.

Reviewed-by: Steven Rostedt <rostedt@goodmis.org>

-- Steve

>  
>  	raw_spin_unlock(&lock->wait_lock);
>  
> 


^ permalink raw reply	[flat|nested] 27+ messages in thread
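
The "trylock is taking a chance" point from the review above can be seen in
isolation with a small userspace sketch. This is not the kernel code: the
type and names are invented and the slow path behind the wait_lock is
omitted. The point is only that reading the owner word without any lock is
fine, because a stale reading merely turns into a failed trylock, which
every trylock caller has to handle anyway.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

/* Toy lock: owner == 0 means unlocked.  Illustration only. */
struct toy_lock {
        atomic_uintptr_t owner;
};

static bool toy_trylock(struct toy_lock *l, uintptr_t me)
{
        uintptr_t unowned = 0;

        /* Lockless peek: if somebody owns it, just report failure. */
        if (atomic_load_explicit(&l->owner, memory_order_relaxed))
                return false;

        /* Race for it; losing the race is also just a trylock failure. */
        return atomic_compare_exchange_strong(&l->owner, &unowned, me);
}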

* Re: [patch V4 03/10] rtmutex: Simplify and document try_to_take_rtmutex()
  2014-06-11 18:44 ` [patch V4 03/10] rtmutex: Simplify and document try_to_take_rtmutex() Thomas Gleixner
@ 2014-06-13 16:27   ` Steven Rostedt
  0 siblings, 0 replies; 27+ messages in thread
From: Steven Rostedt @ 2014-06-13 16:27 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: LKML, Peter Zijlstra, Ingo Molnar

On Wed, 11 Jun 2014 18:44:05 -0000
Thomas Gleixner <tglx@linutronix.de> wrote:

> The current implementation of try_to_take_rtmutex() is correct, but
> requires more than a single brain twist to understand the clever
> encoded conditionals.
> 
> Untangle it and document the cases proper.
> 
> Looks less efficient at the first glance, but actually reduces the
> binary code size on x8664 by 80 bytes.
> 
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> ---
>  kernel/locking/rtmutex.c |  131 +++++++++++++++++++++++++++++++----------------
>  1 file changed, 87 insertions(+), 44 deletions(-)
> 
> Index: tip/kernel/locking/rtmutex.c
> ===================================================================
> --- tip.orig/kernel/locking/rtmutex.c
> +++ tip/kernel/locking/rtmutex.c
> @@ -535,74 +535,117 @@ static int rt_mutex_adjust_prio_chain(st
>   *
>   * @lock:   the lock to be acquired.
>   * @task:   the task which wants to acquire the lock
> - * @waiter: the waiter that is queued to the lock's wait list. (could be NULL)
> + * @waiter: the waiter that is queued to the lock's wait list. (can be
> + *	    NULL, if called due to trylock attempts from
> + *	    rt_mutex_slowlock() or rt_mutex_slowtrylock().

In other words, waiter is non null iff @task is already queued on the
lock (task_blocked_on_lock() was called).


>   */
>  static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
> -		struct rt_mutex_waiter *waiter)
> +				struct rt_mutex_waiter *waiter)
>  {
> +	unsigned long flags;
> +
>  	/*
> -	 * We have to be careful here if the atomic speedups are
> -	 * enabled, such that, when
> -	 *  - no other waiter is on the lock
> -	 *  - the lock has been released since we did the cmpxchg
> -	 * the lock can be released or taken while we are doing the
> -	 * checks and marking the lock with RT_MUTEX_HAS_WAITERS.
> +	 * Before testing whether we can acquire @lock, we set the
> +	 * RT_MUTEX_HAS_WAITERS bit in @lock->owner. This forces all
> +	 * other tasks which try to modify @lock into the slow path
> +	 * and they serialize on @lock->wait_lock.
> +	 *
> +	 * The RT_MUTEX_HAS_WAITERS bit can have a transitional state
> +	 * as explained at the top of this file if and only if:
>  	 *
> -	 * The atomic acquire/release aware variant of
> -	 * mark_rt_mutex_waiters uses a cmpxchg loop. After setting
> -	 * the WAITERS bit, the atomic release / acquire can not
> -	 * happen anymore and lock->wait_lock protects us from the
> -	 * non-atomic case.
> +	 * - There is a lock owner. The caller must fixup the
> +	 *   transient state if it does a trylock or leaves the lock
> +	 *   function due to a signal or timeout.
>  	 *
> -	 * Note, that this might set lock->owner =
> -	 * RT_MUTEX_HAS_WAITERS in the case the lock is not contended
> -	 * any more. This is fixed up when we take the ownership.
> -	 * This is the transitional state explained at the top of this file.
> +	 * - @task acquires the lock and there are no other
> +	 *   waiters. This is undone in rt_mutex_set_owner(@task) at
> +	 *   the end of this function.
>  	 */
>  	mark_rt_mutex_waiters(lock);
>  
> +	/*
> +	 * If @lock has an owner, give up.
> +	 */
>  	if (rt_mutex_owner(lock))
>  		return 0;
>  
>  	/*
> -	 * It will get the lock because of one of these conditions:
> -	 * 1) there is no waiter
> -	 * 2) higher priority than waiters
> -	 * 3) it is top waiter
> -	 */
> -	if (rt_mutex_has_waiters(lock)) {
> -		if (task->prio >= rt_mutex_top_waiter(lock)->prio) {
> -			if (!waiter || waiter != rt_mutex_top_waiter(lock))
> -				return 0;
> -		}
> -	}
> +	 * If @waiter != NULL, @task has already enqueued the waiter
> +	 * into @lock waiter list. If @waiter == NULL then this is a

Ah, you state my comment here :-)

> +	 * trylock attempt.
> +	 */
> +	if (waiter) {
> +		/*
> +		 * If waiter is not the highest priority waiter of
> +		 * @lock, give up.
> +		 */
> +		if (waiter != rt_mutex_top_waiter(lock))
> +			return 0;
>  
> -	if (waiter || rt_mutex_has_waiters(lock)) {
> -		unsigned long flags;
> -		struct rt_mutex_waiter *top;
> -
> -		raw_spin_lock_irqsave(&task->pi_lock, flags);
> -
> -		/* remove the queued waiter. */
> -		if (waiter) {
> -			rt_mutex_dequeue(lock, waiter);
> -			task->pi_blocked_on = NULL;
> -		}
> +		/*
> +		 * We can acquire the lock. Remove the waiter from the
> +		 * lock waiters list.
> +		 */
> +		rt_mutex_dequeue(lock, waiter);
>  
> +	} else {
>  		/*
> -		 * We have to enqueue the top waiter(if it exists) into
> -		 * task->pi_waiters list.
> +		 * If the lock has waiters already we check whether @task is
> +		 * eligible to take over the lock.
> +		 *
> +		 * If there are no other waiters, @task can acquire the lock.


> +		 * @task->pi_blocked_on is NULL

This is rather out of the blue. What does it mean? Why did you state
this? Maybe you meant to continue to say:

 ... is NULL, so it does not need to be dequeued.


>  		 */
>  		if (rt_mutex_has_waiters(lock)) {
> -			top = rt_mutex_top_waiter(lock);
> -			rt_mutex_enqueue_pi(task, top);
> +			/*
> +			 * If @task->prio is greater than or equal to
> +			 * the top waiter priority (kernel view),
> +			 * @task lost.
> +			 */
> +			if (task->prio >= rt_mutex_top_waiter(lock)->prio)

Didn't we have a function for this test? Oh wait, that's for the -rt
patch ;-)

> +				return 0;
> +
> +			/*
> +			 * The current top waiter stays enqueued. We
> +			 * don't have to change anything in the lock
> +			 * waiters order.
> +			 */
> +		} else {
> +			/*
> +			 * No waiters. Take the lock without the
> +			 * pi_lock dance.@task->pi_blocked_on is NULL

s/\.\@/. @/

> +			 * and we have no waiters to enqueue in @task
> +			 * pi waiters list.
> +			 */
> +			goto takeit;
>  		}
> -		raw_spin_unlock_irqrestore(&task->pi_lock, flags);
>  	}
>  
> +	/*
> +	 * Clear @task->pi_blocked_on. Requires protection by
> +	 * @task->pi_lock. Redundant operation for the @waiter == NULL
> +	 * case, but conditionals are more expensive than a redundant
> +	 * store.
> +	 */
> +	raw_spin_lock_irqsave(&task->pi_lock, flags);
> +	task->pi_blocked_on = NULL;
> +	/*
> +	 * Finish the lock acquisition. @task is the new owner. If
> +	 * other waiters exist we have to insert the highest priority
> +	 * waiter into @task->pi_waiters list.
> +	 */
> +	if (rt_mutex_has_waiters(lock))
> +		rt_mutex_enqueue_pi(task, rt_mutex_top_waiter(lock));
> +	raw_spin_unlock_irqrestore(&task->pi_lock, flags);
> +
> +takeit:
>  	/* We got the lock. */
>  	debug_rt_mutex_lock(lock);
>  
> +	/*
> +	 * This either preserves the RT_MUTEX_HAS_WAITERS bit if there
> +	 * are still waiters or clears it.
> +	 */
>  	rt_mutex_set_owner(lock, task);
>  
>  	rt_mutex_deadlock_account_lock(lock, task);
> 

Much more readable and honestly, it looks more efficient than the
original maze. Good job!

Reviewed-by: Steven Rostedt <rostedt@goodmis.org>

-- Steve


^ permalink raw reply	[flat|nested] 27+ messages in thread
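
For reference, the cases the rewritten try_to_take_rt_mutex() distinguishes
can be boiled down to a toy model. This is not the kernel function: the
waiters-bit handling, the pi_lock dance and the enqueueing are all omitted,
and the names are invented. Lower numeric value means higher priority.

#include <stdbool.h>

/* Toy model only; not struct rt_mutex. */
struct toy_lock {
        bool has_owner;
        bool has_waiters;
        int  top_waiter_prio;   /* only valid if has_waiters */
};

/*
 * @already_queued: the caller is blocked on the lock (waiter != NULL).
 * @is_top_waiter:  only meaningful when @already_queued is true.
 */
static bool may_take(const struct toy_lock *l, int task_prio,
                     bool already_queued, bool is_top_waiter)
{
        /* Somebody owns it: give up. */
        if (l->has_owner)
                return false;

        /* A queued waiter may only take the lock as the top waiter. */
        if (already_queued)
                return is_top_waiter;

        /* Trylock attempt: it must beat the top waiter, if there is one. */
        if (l->has_waiters && task_prio >= l->top_waiter_prio)
                return false;

        return true;
}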

* Re: [patch V4 04/10] rtmutex: No need to keep task ref for lock owner check
  2014-06-11 18:44 ` [patch V4 04/10] rtmutex: No need to keep task ref for lock owner check Thomas Gleixner
@ 2014-06-13 16:30   ` Steven Rostedt
  0 siblings, 0 replies; 27+ messages in thread
From: Steven Rostedt @ 2014-06-13 16:30 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: LKML, Peter Zijlstra, Ingo Molnar

On Wed, 11 Jun 2014 18:44:05 -0000
Thomas Gleixner <tglx@linutronix.de> wrote:

> There is no point to keep the task ref across the check for lock
> owner. Drop the ref before that, so the protection context is clear.

You mean you deobfuscated the code ;-)

(No, patch #3 could have been called "deobfuscate the code")

Reviewed-by: Steven Rostedt <rostedt@goodmis.org>

-- Steve

> 
> Found while documenting the chain walk.
> 
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> ---
>  kernel/locking/rtmutex.c |    5 +++--
>  1 file changed, 3 insertions(+), 2 deletions(-)
> 
> Index: tip/kernel/locking/rtmutex.c
> ===================================================================
> --- tip.orig/kernel/locking/rtmutex.c
> +++ tip/kernel/locking/rtmutex.c
> @@ -463,6 +463,8 @@ static int rt_mutex_adjust_prio_chain(st
>  
>  	/* Release the task */
>  	raw_spin_unlock_irqrestore(&task->pi_lock, flags);
> +	put_task_struct(task);
> +
>  	if (!rt_mutex_owner(lock)) {
>  		/*
>  		 * If the requeue above changed the top waiter, then we need
> @@ -472,9 +474,8 @@ static int rt_mutex_adjust_prio_chain(st
>  		if (top_waiter != rt_mutex_top_waiter(lock))
>  			wake_up_process(rt_mutex_top_waiter(lock)->task);
>  		raw_spin_unlock(&lock->wait_lock);
> -		goto out_put_task;
> +		return 0;
>  	}
> -	put_task_struct(task);
>  
>  	/* Grab the next task */
>  	task = rt_mutex_owner(lock);
> 


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [patch V4 05/10] rtmutex: Clarify the boost/deboost part
  2014-06-11 18:44 ` [patch V4 05/10] rtmutex: Clarify the boost/deboost part Thomas Gleixner
@ 2014-06-13 16:35   ` Steven Rostedt
  0 siblings, 0 replies; 27+ messages in thread
From: Steven Rostedt @ 2014-06-13 16:35 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: LKML, Peter Zijlstra, Ingo Molnar

On Wed, 11 Jun 2014 18:44:06 -0000
Thomas Gleixner <tglx@linutronix.de> wrote:

> Add a separate local variable for the boost/deboost logic to make the
> code more readable. Add comments where appropriate.
> 
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

Already-reviewed-by: Steven Rostedt <rostedt@goodmis.org>

(I added my RB tag to the last version, and noticed this didn't change)

-- Steve

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [patch V4 06/10] rtmutex: Document pi chain walk
  2014-06-11 18:44 ` [patch V4 06/10] rtmutex: Document pi chain walk Thomas Gleixner
@ 2014-06-13 16:54   ` Steven Rostedt
  0 siblings, 0 replies; 27+ messages in thread
From: Steven Rostedt @ 2014-06-13 16:54 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: LKML, Peter Zijlstra, Ingo Molnar

On Wed, 11 Jun 2014 18:44:06 -0000
Thomas Gleixner <tglx@linutronix.de> wrote:

> Add commentary to document the chain walk and the protection mechanisms
> and their scope.
> 
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> ---

Reviewed-by: Steven Rostedt <rostedt@goodmis.org>

-- Steve

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [patch V4 07/10] rtmutex: Simplify remove_waiter()
  2014-06-11 18:44 ` [patch V4 07/10] rtmutex: Simplify remove_waiter() Thomas Gleixner
@ 2014-06-13 16:58   ` Steven Rostedt
  0 siblings, 0 replies; 27+ messages in thread
From: Steven Rostedt @ 2014-06-13 16:58 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: LKML, Peter Zijlstra, Ingo Molnar

On Wed, 11 Jun 2014 18:44:07 -0000
Thomas Gleixner <tglx@linutronix.de> wrote:

> Exit right away, when the removed waiter was not the top priority
> waiter on the lock. Get rid of the extra indent level.
> 
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

Reviewed-by: Steven Rostedt <rostedt@goodmis.org>

-- Steve


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [patch V4 08/10] rtmutex: Confine deadlock logic to futex
  2014-06-11 18:44 ` [patch V4 08/10] rtmutex: Confine deadlock logic to futex Thomas Gleixner
@ 2014-06-13 17:10   ` Steven Rostedt
  0 siblings, 0 replies; 27+ messages in thread
From: Steven Rostedt @ 2014-06-13 17:10 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: LKML, Peter Zijlstra, Ingo Molnar

On Wed, 11 Jun 2014 18:44:08 -0000
Thomas Gleixner <tglx@linutronix.de> wrote:
  
> +/*
> + * Futex variant with full deadlock detection.
> + */
> +int __rt_mutex_timed_lock(struct rt_mutex *lock,
> +			  struct hrtimer_sleeper *timeout)

I hate underscores. Although it's commented, it's still rather awkward.

What about calling it rt_mutex_timed_futex_lock() or something else
with futex in its name?

Rest looks good.

Reviewed-by: Steven Rostedt <rostedt@goodmis.org>

-- Steve

> +{
> +	might_sleep();
> +
> +	return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, 1,
> +				       rt_mutex_slowlock);
> +}
> +


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [patch V4 09/10] rtmutex: Cleanup deadlock detector debug logic
  2014-06-11 18:44 ` [patch V4 09/10] rtmutex: Cleanup deadlock detector debug logic Thomas Gleixner
@ 2014-06-13 17:19   ` Steven Rostedt
  2014-06-13 19:43     ` Thomas Gleixner
  0 siblings, 1 reply; 27+ messages in thread
From: Steven Rostedt @ 2014-06-13 17:19 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: LKML, Peter Zijlstra, Ingo Molnar, Lai Jiangshan

On Wed, 11 Jun 2014 18:44:08 -0000
Thomas Gleixner <tglx@linutronix.de> wrote:

> The conditions under which deadlock detection is conducted are unclear
> and undocumented.
> 
> Add constants instead of using 0/1 and provide a selection function
> which hides the additional debug dependency from the calling code.
> 
> Add comments where needed.
> 
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
> Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
> Link: http://lkml.kernel.org/r/20140522031949.947264874@linutronix.de
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> ---
>  kernel/locking/rtmutex-debug.c  |    5 +-
>  kernel/locking/rtmutex-debug.h  |    7 ++--
>  kernel/locking/rtmutex.c        |   70 +++++++++++++++++++++++++++-------------
>  kernel/locking/rtmutex.h        |    7 +++-
>  kernel/locking/rtmutex_common.h |   15 ++++++++
>  5 files changed, 76 insertions(+), 28 deletions(-)
> 
> Index: tip/kernel/locking/rtmutex-debug.c
> ===================================================================
> --- tip.orig/kernel/locking/rtmutex-debug.c
> +++ tip/kernel/locking/rtmutex-debug.c
> @@ -66,12 +66,13 @@ void rt_mutex_debug_task_free(struct tas
>   * the deadlock. We print when we return. act_waiter can be NULL in
>   * case of a remove waiter operation.
>   */
> -void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter,
> +void debug_rt_mutex_deadlock(enum rtmutex_chainwalk chwalk,
> +			     struct rt_mutex_waiter *act_waiter,
>  			     struct rt_mutex *lock)
>  {
>  	struct task_struct *task;
>  
> -	if (!debug_locks || detect || !act_waiter)
> +	if (!debug_locks || chwalk == RT_MUTEX_FULL_CHAINWALK || !act_waiter)
>  		return;
>  
>  	task = rt_mutex_owner(act_waiter->lock);
> Index: tip/kernel/locking/rtmutex-debug.h
> ===================================================================
> --- tip.orig/kernel/locking/rtmutex-debug.h
> +++ tip/kernel/locking/rtmutex-debug.h
> @@ -20,14 +20,15 @@ extern void debug_rt_mutex_unlock(struct
>  extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock,
>  				      struct task_struct *powner);
>  extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock);
> -extern void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *waiter,
> +extern void debug_rt_mutex_deadlock(enum rtmutex_chainwalk chwalk,
> +				    struct rt_mutex_waiter *waiter,
>  				    struct rt_mutex *lock);
>  extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter);
>  # define debug_rt_mutex_reset_waiter(w)			\
>  	do { (w)->deadlock_lock = NULL; } while (0)
>  
> -static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter,
> -						 int detect)
> +static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter,
> +						  enum rtmutex_chainwalk walk)
>  {
>  	return (waiter != NULL);
>  }
> Index: tip/kernel/locking/rtmutex.c
> ===================================================================
> --- tip.orig/kernel/locking/rtmutex.c
> +++ tip/kernel/locking/rtmutex.c
> @@ -308,6 +308,25 @@ static void rt_mutex_adjust_prio(struct
>  }
>  
>  /*
> + * Deadlock detection is conditional:
> + *
> + * If CONFIG_DEBUG_RT_MUTEXES=n, deadlock detection is only conducted
> + * if the detect argument is == RT_MUTEX_FULL_CHAINWALK.
> + *
> + * If CONFIG_DEBUG_RT_MUTEXES=y, deadlock detection is always
> + * conducted independent of the detect argument.
> + *
> + * If the waiter argument is NULL this indicates the deboost path and
> + * deadlock detection is disabled independent of the detect argument
> + * and the config settings.
> + */
> +static bool rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter,
> +					  enum rtmutex_chainwalk chwalk)
> +{

	/*
	 * This is just a wrapper function for the following call,
	 * because debug_rt_mutex_detect_deadlock() smells like a magic
	 * debug feature and I wanted to keep the cond function in the
	 * main source file along with the comments instead of having
	 * two of the same in the headers.
	 */

;-)

(found an error below)

> +	return debug_rt_mutex_detect_deadlock(waiter, chwalk);
> +}
> +
> +/*
>   * Max number of times we'll walk the boosting chain:
>   */
>  int max_lock_depth = 1024;
> @@ -381,7 +400,7 @@ static inline struct rt_mutex *task_bloc
>   *	  goto again;
>   */
>  static int rt_mutex_adjust_prio_chain(struct task_struct *task,
> -				      int deadlock_detect,
> +				      enum rtmutex_chainwalk chwalk,
>  				      struct rt_mutex *orig_lock,
>  				      struct rt_mutex *next_lock,
>  				      struct rt_mutex_waiter *orig_waiter,
> @@ -389,12 +408,12 @@ static int rt_mutex_adjust_prio_chain(st
>  {
>  	struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter;
>  	struct rt_mutex_waiter *prerequeue_top_waiter;
> -	int detect_deadlock, ret = 0, depth = 0;
> +	int ret = 0, depth = 0;
>  	struct rt_mutex *lock;
> +	bool detect_deadlock;
>  	unsigned long flags;
>  
> -	detect_deadlock = debug_rt_mutex_detect_deadlock(orig_waiter,
> -							 deadlock_detect);
> +	detect_deadlock = rt_mutex_cond_detect_deadlock(orig_waiter, chwalk);
>  
>  	/*
>  	 * The (de)boosting is a step by step approach with a lot of
> @@ -520,7 +539,7 @@ static int rt_mutex_adjust_prio_chain(st
>  	 * walk, we detected a deadlock.
>  	 */
>  	if (lock == orig_lock || rt_mutex_owner(lock) == top_task) {
> -		debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock);
> +		debug_rt_mutex_deadlock(chwalk, orig_waiter, lock);
>  		raw_spin_unlock(&lock->wait_lock);
>  		ret = -EDEADLK;
>  		goto out_unlock_pi;
> @@ -784,7 +803,7 @@ takeit:
>  static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
>  				   struct rt_mutex_waiter *waiter,
>  				   struct task_struct *task,
> -				   int detect_deadlock)
> +				   enum rtmutex_chainwalk chwalk)
>  {
>  	struct task_struct *owner = rt_mutex_owner(lock);
>  	struct rt_mutex_waiter *top_waiter = waiter;
> @@ -830,7 +849,7 @@ static int task_blocks_on_rt_mutex(struc
>  		__rt_mutex_adjust_prio(owner);
>  		if (owner->pi_blocked_on)
>  			chain_walk = 1;
> -	} else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) {
> +	} else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
>  		chain_walk = 1;
>  	}
>  
> @@ -855,7 +874,7 @@ static int task_blocks_on_rt_mutex(struc
>  
>  	raw_spin_unlock(&lock->wait_lock);
>  
> -	res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock,
> +	res = rt_mutex_adjust_prio_chain(owner, chwalk, lock,
>  					 next_lock, waiter, task);
>  
>  	raw_spin_lock(&lock->wait_lock);
> @@ -963,7 +982,8 @@ static void remove_waiter(struct rt_mute
>  
>  	raw_spin_unlock(&lock->wait_lock);
>  
> -	rt_mutex_adjust_prio_chain(owner, 0, lock, next_lock, NULL, current);
> +	rt_mutex_adjust_prio_chain(owner, RT_MUTEX_MIN_CHAINWALK, lock,
> +				   next_lock, NULL, current);
>  
>  	raw_spin_lock(&lock->wait_lock);
>  }
> @@ -993,7 +1013,8 @@ void rt_mutex_adjust_pi(struct task_stru
>  	/* gets dropped in rt_mutex_adjust_prio_chain()! */
>  	get_task_struct(task);
>  
> -	rt_mutex_adjust_prio_chain(task, 0, NULL, next_lock, NULL, task);
> +	rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL, NULL,

You dropped next_lock here. Was that intentional? I doubt it.

-- Steve


> +				   NULL, task);
>  }
>  
>  /**
> @@ -1071,7 +1092,7 @@ static void rt_mutex_handle_deadlock(int
>  static int __sched
>  rt_mutex_slowlock(struct rt_mutex *lock, int state,
>  		  struct hrtimer_sleeper *timeout,
> -		  int detect_deadlock)
> +		  enum rtmutex_chainwalk chwalk)
>  {
>  	struct rt_mutex_waiter waiter;
>  	int ret = 0;
> @@ -1097,7 +1118,7 @@ rt_mutex_slowlock(struct rt_mutex *lock,
>  			timeout->task = NULL;
>  	}
>  
> -	ret = task_blocks_on_rt_mutex(lock, &waiter, current, detect_deadlock);
> +	ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk);
>  
>  	if (likely(!ret))
>  		ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
> @@ -1106,7 +1127,7 @@ rt_mutex_slowlock(struct rt_mutex *lock,
>  
>  	if (unlikely(ret)) {
>  		remove_waiter(lock, &waiter);
> -		rt_mutex_handle_deadlock(ret, detect_deadlock, &waiter);
> +		rt_mutex_handle_deadlock(ret, chwalk, &waiter);
>  	}
>  
>  	/*
> @@ -1234,27 +1255,29 @@ static inline int
>  rt_mutex_fastlock(struct rt_mutex *lock, int state,
>  		  int (*slowfn)(struct rt_mutex *lock, int state,
>  				struct hrtimer_sleeper *timeout,
> -				int detect_deadlock))
> +				enum rtmutex_chainwalk chwalk))
>  {
>  	if (likely(rt_mutex_cmpxchg(lock, NULL, current))) {
>  		rt_mutex_deadlock_account_lock(lock, current);
>  		return 0;
>  	} else
> -		return slowfn(lock, state, NULL, 0);
> +		return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
>  }
>  
>  static inline int
>  rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
> -			struct hrtimer_sleeper *timeout, int detect_deadlock,
> +			struct hrtimer_sleeper *timeout,
> +			enum rtmutex_chainwalk chwalk,
>  			int (*slowfn)(struct rt_mutex *lock, int state,
>  				      struct hrtimer_sleeper *timeout,
> -				      int detect_deadlock))
> +				      enum rtmutex_chainwalk chwalk))
>  {
> -	if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) {
> +	if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
> +	    likely(rt_mutex_cmpxchg(lock, NULL, current))) {
>  		rt_mutex_deadlock_account_lock(lock, current);
>  		return 0;
>  	} else
> -		return slowfn(lock, state, timeout, detect_deadlock);
> +		return slowfn(lock, state, timeout, chwalk);
>  }
>  
>  static inline int
> @@ -1316,7 +1339,8 @@ int __rt_mutex_timed_lock(struct rt_mute
>  {
>  	might_sleep();
>  
> -	return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, 1,
> +	return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
> +				       RT_MUTEX_FULL_CHAINWALK,
>  				       rt_mutex_slowlock);
>  }
>  
> @@ -1338,7 +1362,8 @@ rt_mutex_timed_lock(struct rt_mutex *loc
>  {
>  	might_sleep();
>  
> -	return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, 0,
> +	return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
> +				       RT_MUTEX_MIN_CHAINWALK,
>  				       rt_mutex_slowlock);
>  }
>  EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
> @@ -1467,7 +1492,8 @@ int rt_mutex_start_proxy_lock(struct rt_
>  	}
>  
>  	/* We enforce deadlock detection for futexes */
> -	ret = task_blocks_on_rt_mutex(lock, waiter, task, 1);
> +	ret = task_blocks_on_rt_mutex(lock, waiter, task,
> +				      RT_MUTEX_FULL_CHAINWALK);
>  
>  	if (ret && !rt_mutex_owner(lock)) {
>  		/*
> Index: tip/kernel/locking/rtmutex.h
> ===================================================================
> --- tip.orig/kernel/locking/rtmutex.h
> +++ tip/kernel/locking/rtmutex.h
> @@ -22,10 +22,15 @@
>  #define debug_rt_mutex_init(m, n)			do { } while (0)
>  #define debug_rt_mutex_deadlock(d, a ,l)		do { } while (0)
>  #define debug_rt_mutex_print_deadlock(w)		do { } while (0)
> -#define debug_rt_mutex_detect_deadlock(w,d)		(d)
>  #define debug_rt_mutex_reset_waiter(w)			do { } while (0)
>  
>  static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w)
>  {
>  	WARN(1, "rtmutex deadlock detected\n");
>  }
> +
> +static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *w,
> +						  enum rtmutex_chainwalk walk)
> +{
> +	return walk == RT_MUTEX_FULL_CHAINWALK;
> +}
> Index: tip/kernel/locking/rtmutex_common.h
> ===================================================================
> --- tip.orig/kernel/locking/rtmutex_common.h
> +++ tip/kernel/locking/rtmutex_common.h
> @@ -102,6 +102,21 @@ static inline struct task_struct *rt_mut
>  }
>  
>  /*
> + * Constants for rt mutex functions which have a selectable deadlock
> + * detection.
> + *
> + * RT_MUTEX_MIN_CHAINWALK:	Stops the lock chain walk when there are
> + *				no further PI adjustments to be made.
> + *
> + * RT_MUTEX_FULL_CHAINWALK:	Invoke deadlock detection with a full
> + *				walk of the lock chain.
> + */
> +enum rtmutex_chainwalk {
> +	RT_MUTEX_MIN_CHAINWALK,
> +	RT_MUTEX_FULL_CHAINWALK,
> +};
> +
> +/*
>   * PI-futex support (proxy locking functions, etc.):
>   */
>  extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
> 


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [patch V4 10/10] rtmutex: Avoid pointless requeueing in the deadlock detection chain walk
  2014-06-11 18:44 ` [patch V4 10/10] rtmutex: Avoid pointless requeueing in the deadlock detection chain walk Thomas Gleixner
@ 2014-06-13 17:28   ` Steven Rostedt
  2014-06-13 19:46     ` Thomas Gleixner
  0 siblings, 1 reply; 27+ messages in thread
From: Steven Rostedt @ 2014-06-13 17:28 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: LKML, Peter Zijlstra, Ingo Molnar, Lai Jiangshan

On Wed, 11 Jun 2014 18:44:09 -0000
Thomas Gleixner <tglx@linutronix.de> wrote:

> In case the deadlock detector is enabled we follow the lock chain to
> the end in rt_mutex_adjust_prio_chain, even if we could stop earlier
> due to the priority/waiter constellation.
> 
> But once we are not longer the top priority waiter in a certain step

s/not/no/

> or the task holding the lock already has the same priority, then there
> is no point in dequeueing and enqueueing along the lock chain as there is
> no change at all.
> 
> So stop the queueing at this point.
> 
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
> Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
> Link: http://lkml.kernel.org/r/20140522031950.280830190@linutronix.de
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> ---
>  kernel/locking/rtmutex.c |   77 ++++++++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 70 insertions(+), 7 deletions(-)
> 
> Index: tip/kernel/locking/rtmutex.c
> ===================================================================
> --- tip.orig/kernel/locking/rtmutex.c
> +++ tip/kernel/locking/rtmutex.c
> @@ -412,6 +412,7 @@ static int rt_mutex_adjust_prio_chain(st
>  	struct rt_mutex *lock;
>  	bool detect_deadlock;
>  	unsigned long flags;
> +	bool requeue = true;
>  
>  	detect_deadlock = rt_mutex_cond_detect_deadlock(orig_waiter, chwalk);
>  
> @@ -501,18 +502,31 @@ static int rt_mutex_adjust_prio_chain(st
>  			goto out_unlock_pi;
>  		/*
>  		 * If deadlock detection is off, we stop here if we
> -		 * are not the top pi waiter of the task.
> +		 * are not the top pi waiter of the task. If deadlock
> +		 * detection is enabled we continue, but stop the
> +		 * requeueing in the chain walk.
>  		 */
> -		if (!detect_deadlock && top_waiter != task_top_pi_waiter(task))
> -			goto out_unlock_pi;
> +		if (top_waiter != task_top_pi_waiter(task)) {
> +			if (!detect_deadlock)
> +				goto out_unlock_pi;
> +			else
> +				requeue = false;
> +		}
>  	}
>  
>  	/*
> -	 * When deadlock detection is off then we check, if further
> -	 * priority adjustment is necessary.
> +	 * If the waiter priority is the same as the task priority
> +	 * then there is no further priority adjustment necessary.  If
> +	 * deadlock detection is off, we stop the chain walk. If its
> +	 * enabled we continue, but stop the requeueing in the chain
> +	 * walk.
>  	 */
> -	if (!detect_deadlock && waiter->prio == task->prio)
> -		goto out_unlock_pi;
> +	if (waiter->prio == task->prio) {
> +		if (!detect_deadlock)
> +			goto out_unlock_pi;
> +		else
> +			requeue = false;
> +	}
>  
>  	/*
>  	 * [4] Get the next lock
> @@ -546,6 +560,55 @@ static int rt_mutex_adjust_prio_chain(st
>  	}
>  
>  	/*
> +	 * If we just follow the lock chain for deadlock detection, no
> +	 * need to do all the requeue operations. To avoid a truckload
> +	 * of conditionals around the various places below, just do the
> +	 * minimum chain walk checks.
> +	 */
> +	if (!requeue) {
> +		/*
> +		 * No requeue[7] here. Just release @task [8]
> +		 */
> +		raw_spin_unlock_irqrestore(&task->pi_lock, flags);
> +		put_task_struct(task);
> +
> +		/*
> +		 * [9] check_exit_conditions_3 protected by lock->wait_lock.
> +		 * If there is no owner of the lock, end of chain.
> +		 */
> +		if (!rt_mutex_owner(lock)) {
> +			raw_spin_unlock(&lock->wait_lock);
> +			return 0;
> +		}
> +
> +		/* [10] Grab the next task, i.e. owner of @lock */
> +		task = rt_mutex_owner(lock);
> +		get_task_struct(task);
> +		raw_spin_lock_irqsave(&task->pi_lock, flags);
> +
> +		/*
> +		 * No requeue [11] here. We just do deadlock detection.
> +		 *
> +		 * Get the top waiter for the next iteration
> +		 */
> +		top_waiter = rt_mutex_top_waiter(lock);
> +
> +		/*
> +		 * [12] Store whether owner is blocked
> +		 * itself. Decision is made after dropping the locks
> +		 */
> +		next_lock = task_blocked_on_lock(task);

Nit pick, but we should be consistent.

The requeue path does:

        next_lock = task_blocked_on_lock(task);
        /*
         * Store the top waiter of @lock for the end of chain walk
         * decision below.
         */
        top_waiter = rt_mutex_top_waiter(lock);

I know the order is not important, but we should keep the two the same,
helps in review and making sure changes to one implementation still
make it to the other, without confusing those making the changes
(like you in a few years ;)

-- Steve


> +		/* [13] Drop locks */
> +		raw_spin_unlock_irqrestore(&task->pi_lock, flags);
> +		raw_spin_unlock(&lock->wait_lock);
> +
> +		/* If owner is not blocked, end of chain. */
> +		if (!next_lock)
> +			goto out_put_task;
> +		goto again;
> +	}
> +
> +	/*
>  	 * Store the current top waiter before doing the requeue
>  	 * operation on @lock. We need it for the boost/deboost
>  	 * decision below.
> 


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [patch V4 09/10] rtmutex: Cleanup deadlock detector debug logic
  2014-06-13 17:19   ` Steven Rostedt
@ 2014-06-13 19:43     ` Thomas Gleixner
  0 siblings, 0 replies; 27+ messages in thread
From: Thomas Gleixner @ 2014-06-13 19:43 UTC (permalink / raw)
  To: Steven Rostedt; +Cc: LKML, Peter Zijlstra, Ingo Molnar, Lai Jiangshan

On Fri, 13 Jun 2014, Steven Rostedt wrote:
> On Wed, 11 Jun 2014 18:44:08 -0000
> Thomas Gleixner <tglx@linutronix.de> wrote:
> > -	rt_mutex_adjust_prio_chain(task, 0, NULL, next_lock, NULL, task);
> > +	rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL, NULL,
> 
> You dropped next_lock here. Was that intentional? I doubt it.

Definitely not. Thanks for spotting it.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [patch V4 10/10] rtmutex: Avoid pointless requeueing in the deadlock detection chain walk
  2014-06-13 17:28   ` Steven Rostedt
@ 2014-06-13 19:46     ` Thomas Gleixner
  0 siblings, 0 replies; 27+ messages in thread
From: Thomas Gleixner @ 2014-06-13 19:46 UTC (permalink / raw)
  To: Steven Rostedt; +Cc: LKML, Peter Zijlstra, Ingo Molnar, Lai Jiangshan

On Fri, 13 Jun 2014, Steven Rostedt wrote:
> On Wed, 11 Jun 2014 18:44:09 -0000
> Thomas Gleixner <tglx@linutronix.de> wrote:
> Nit pick, but we should be consistent.
> 
> The requeue path does:
> 
>         next_lock = task_blocked_on_lock(task);
>         /*
>          * Store the top waiter of @lock for the end of chain walk
>          * decision below.
>          */
>         top_waiter = rt_mutex_top_waiter(lock);
> 
> I know the order is not important, but we should keep the two the same,
> helps in review and making sure changes to one implementation still
> make it to the other, without confusing those making the changes
> (like you in a few years ;)

The timespan for forgetting about that is measured in weeks, if at
all. No sane brain will keep itself exposed to this longer than
absolutely necessary.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [tip:locking/urgent] rtmutex: Plug slow unlock race
  2014-06-11 18:44 ` [patch V4 01/10] rtmutex: Plug slow unlock race Thomas Gleixner
  2014-06-13 15:41   ` Steven Rostedt
@ 2014-06-16  8:06   ` tip-bot for Thomas Gleixner
  1 sibling, 0 replies; 27+ messages in thread
From: tip-bot for Thomas Gleixner @ 2014-06-16  8:06 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: linux-kernel, hpa, mingo, rostedt, peterz, tglx

Commit-ID:  27e35715df54cbc4f2d044f681802ae30479e7fb
Gitweb:     http://git.kernel.org/tip/27e35715df54cbc4f2d044f681802ae30479e7fb
Author:     Thomas Gleixner <tglx@linutronix.de>
AuthorDate: Wed, 11 Jun 2014 18:44:04 +0000
Committer:  Thomas Gleixner <tglx@linutronix.de>
CommitDate: Mon, 16 Jun 2014 10:03:09 +0200

rtmutex: Plug slow unlock race

When the rtmutex fast path is enabled the slow unlock function can
create the following situation:

spin_lock(foo->m->wait_lock);
foo->m->owner = NULL;
	    			rt_mutex_lock(foo->m); <-- fast path
				free = atomic_dec_and_test(foo->refcnt);
				rt_mutex_unlock(foo->m); <-- fast path
				if (free)
				   kfree(foo);

spin_unlock(foo->m->wait_lock); <--- Use after free.

Plug the race by changing the slow unlock to the following scheme:

     while (!rt_mutex_has_waiters(m)) {
     	    /* Clear the waiters bit in m->owner */
	    clear_rt_mutex_waiters(m);
      	    owner = rt_mutex_owner(m);
      	    spin_unlock(m->wait_lock);
      	    if (cmpxchg(m->owner, owner, 0) == owner)
      	       return;
      	    spin_lock(m->wait_lock);
     }

So in case of a new waiter incoming while the owner tries the slow
path unlock we have two situations:

 unlock(wait_lock);
					lock(wait_lock);
 cmpxchg(p, owner, 0) == owner
 	    	   			mark_rt_mutex_waiters(lock);
	 				acquire(lock);

Or:

 unlock(wait_lock);
					lock(wait_lock);
	 				mark_rt_mutex_waiters(lock);
 cmpxchg(p, owner, 0) != owner
					enqueue_waiter();
					unlock(wait_lock);
 lock(wait_lock);
 wakeup_next waiter();
 unlock(wait_lock);
					lock(wait_lock);
					acquire(lock);

If the fast path is disabled, then the simple

   m->owner = NULL;
   unlock(m->wait_lock);

is sufficient as all access to m->owner is serialized via
m->wait_lock;

Also document and clarify the wakeup_next_waiter function as suggested
by Oleg Nesterov.

Reported-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20140611183852.937945560@linutronix.de
Cc: stable@vger.kernel.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/locking/rtmutex.c | 115 ++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 109 insertions(+), 6 deletions(-)

diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index a8a83a2..fc60594 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -83,6 +83,47 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
 		owner = *p;
 	} while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner);
 }
+
+/*
+ * Safe fastpath aware unlock:
+ * 1) Clear the waiters bit
+ * 2) Drop lock->wait_lock
+ * 3) Try to unlock the lock with cmpxchg
+ */
+static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
+	__releases(lock->wait_lock)
+{
+	struct task_struct *owner = rt_mutex_owner(lock);
+
+	clear_rt_mutex_waiters(lock);
+	raw_spin_unlock(&lock->wait_lock);
+	/*
+	 * If a new waiter comes in between the unlock and the cmpxchg
+	 * we have two situations:
+	 *
+	 * unlock(wait_lock);
+	 *					lock(wait_lock);
+	 * cmpxchg(p, owner, 0) == owner
+	 *					mark_rt_mutex_waiters(lock);
+	 *					acquire(lock);
+	 * or:
+	 *
+	 * unlock(wait_lock);
+	 *					lock(wait_lock);
+	 *					mark_rt_mutex_waiters(lock);
+	 *
+	 * cmpxchg(p, owner, 0) != owner
+	 *					enqueue_waiter();
+	 *					unlock(wait_lock);
+	 * lock(wait_lock);
+	 * wake waiter();
+	 * unlock(wait_lock);
+	 *					lock(wait_lock);
+	 *					acquire(lock);
+	 */
+	return rt_mutex_cmpxchg(lock, owner, NULL);
+}
+
 #else
 # define rt_mutex_cmpxchg(l,c,n)	(0)
 static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
@@ -90,6 +131,17 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
 	lock->owner = (struct task_struct *)
 			((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
 }
+
+/*
+ * Simple slow path only version: lock->owner is protected by lock->wait_lock.
+ */
+static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
+	__releases(lock->wait_lock)
+{
+	lock->owner = NULL;
+	raw_spin_unlock(&lock->wait_lock);
+	return true;
+}
 #endif
 
 static inline int
@@ -650,7 +702,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
 /*
  * Wake up the next waiter on the lock.
  *
- * Remove the top waiter from the current tasks waiter list and wake it up.
+ * Remove the top waiter from the current tasks pi waiter list and
+ * wake it up.
  *
  * Called with lock->wait_lock held.
  */
@@ -671,10 +724,23 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
 	 */
 	rt_mutex_dequeue_pi(current, waiter);
 
-	rt_mutex_set_owner(lock, NULL);
+	/*
+	 * As we are waking up the top waiter, and the waiter stays
+	 * queued on the lock until it gets the lock, this lock
+	 * obviously has waiters. Just set the bit here and this has
+	 * the added benefit of forcing all new tasks into the
+	 * slow path making sure no task of lower priority than
+	 * the top waiter can steal this lock.
+	 */
+	lock->owner = (void *) RT_MUTEX_HAS_WAITERS;
 
 	raw_spin_unlock_irqrestore(&current->pi_lock, flags);
 
+	/*
+	 * It's safe to dereference waiter as it cannot go away as
+	 * long as we hold lock->wait_lock. The waiter task needs to
+	 * acquire it in order to dequeue the waiter.
+	 */
 	wake_up_process(waiter->task);
 }
 
@@ -928,12 +994,49 @@ rt_mutex_slowunlock(struct rt_mutex *lock)
 
 	rt_mutex_deadlock_account_unlock(current);
 
-	if (!rt_mutex_has_waiters(lock)) {
-		lock->owner = NULL;
-		raw_spin_unlock(&lock->wait_lock);
-		return;
+	/*
+	 * We must be careful here if the fast path is enabled. If we
+	 * have no waiters queued we cannot set owner to NULL here
+	 * because of:
+	 *
+	 * foo->lock->owner = NULL;
+	 *			rtmutex_lock(foo->lock);   <- fast path
+	 *			free = atomic_dec_and_test(foo->refcnt);
+	 *			rtmutex_unlock(foo->lock); <- fast path
+	 *			if (free)
+	 *				kfree(foo);
+	 * raw_spin_unlock(foo->lock->wait_lock);
+	 *
+	 * So for the fastpath enabled kernel:
+	 *
+	 * Nothing can set the waiters bit as long as we hold
+	 * lock->wait_lock. So we do the following sequence:
+	 *
+	 *	owner = rt_mutex_owner(lock);
+	 *	clear_rt_mutex_waiters(lock);
+	 *	raw_spin_unlock(&lock->wait_lock);
+	 *	if (cmpxchg(&lock->owner, owner, 0) == owner)
+	 *		return;
+	 *	goto retry;
+	 *
+	 * The fastpath disabled variant is simple as all access to
+	 * lock->owner is serialized by lock->wait_lock:
+	 *
+	 *	lock->owner = NULL;
+	 *	raw_spin_unlock(&lock->wait_lock);
+	 */
+	while (!rt_mutex_has_waiters(lock)) {
+		/* Drops lock->wait_lock ! */
+		if (unlock_rt_mutex_safe(lock) == true)
+			return;
+		/* Relock the rtmutex and try again */
+		raw_spin_lock(&lock->wait_lock);
 	}
 
+	/*
+	 * The wakeup next waiter path does not suffer from the above
+	 * race. See the comments there.
+	 */
 	wakeup_next_waiter(lock);
 
 	raw_spin_unlock(&lock->wait_lock);

^ permalink raw reply related	[flat|nested] 27+ messages in thread
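
The rule the fix enforces is that the unlocking thread's last access to the
lock must be the ownership handoff itself. That rule can be sketched outside
the kernel as well: the toy below uses C11 atomics, all names are invented,
and the waiter handling plus the retry loop of the real implementation are
left out.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

/* Toy refcounted object with an embedded fastpath lock.  Not kernel code. */
struct foo {
        atomic_uintptr_t owner;         /* 0 == unlocked */
        atomic_int refcnt;
};

/* Fastpath acquire: a single cmpxchg on the owner word, nothing else. */
static bool foo_trylock(struct foo *f, uintptr_t me)
{
        uintptr_t unowned = 0;

        return atomic_compare_exchange_strong(&f->owner, &unowned, me);
}

/*
 * Unlock: the cmpxchg handing the lock over is the LAST access this
 * thread makes to *f.  Clearing the owner first and touching *f
 * afterwards would reopen the use-after-free from the changelog,
 * because the new owner may drop the final reference in the meantime.
 */
static void foo_unlock(struct foo *f, uintptr_t me)
{
        uintptr_t expected = me;

        (void)atomic_compare_exchange_strong(&f->owner, &expected, 0);
}

/* Drop a reference; the last reference frees the object. */
static void foo_put(struct foo *f)
{
        if (atomic_fetch_sub(&f->refcnt, 1) == 1)
                free(f);
}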

* Re: [patch V4 00/10] rtmutex: Code clarification and optimization
  2014-06-11 18:44 [patch V4 00/10] rtmutex: Code clarification and optimization Thomas Gleixner
                   ` (9 preceding siblings ...)
  2014-06-11 18:44 ` [patch V4 10/10] rtmutex: Avoid pointless requeueing in the deadlock detection chain walk Thomas Gleixner
@ 2014-06-22  8:45 ` Ingo Molnar
  2014-06-22  9:20   ` Thomas Gleixner
  10 siblings, 1 reply; 27+ messages in thread
From: Ingo Molnar @ 2014-06-22  8:45 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: LKML, Steven Rostedt, Peter Zijlstra


* Thomas Gleixner <tglx@linutronix.de> wrote:

> Hopefully the last iteration of this.
> 
> I addressed all the review comments and added the following new ones:
> 
> 1/n: plug unlock race.
>    
> 	Patch was separately posted already in the safety of
>    	*mutex_unlock() thread.
> 
> 2/n: Document and simplify the trylock path
> 
> 3/n: Document and simplify try_to_take_rtmutex()

JFYI, the Blackfin cross-build fails with:

/home/mingo/tip/kernel/locking/rtmutex.c: In function 'rt_mutex_adjust_prio_chain':
/home/mingo/tip/kernel/locking/rtmutex.c:739: error: unable to find a register to spill in class 'CCREGS'
/home/mingo/tip/kernel/locking/rtmutex.c:739: error: this is the insn:
(insn 82 75 107 2 /home/mingo/tip/kernel/locking/rtmutex.c:487 (set 
(reg:BI 186)
        (eq:BI (reg/v/f:SI 9 P1 [orig:85 top_waiter ] [85])
            (const_int 0 [0x0]))) 111 {compare_eq} (nil))
/home/mingo/tip/kernel/locking/rtmutex.c:739: confused by earlier errors, bailing out

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [patch V4 00/10] rtmutex: Code clarification and optimization
  2014-06-22  8:45 ` [patch V4 00/10] rtmutex: Code clarification and optimization Ingo Molnar
@ 2014-06-22  9:20   ` Thomas Gleixner
  0 siblings, 0 replies; 27+ messages in thread
From: Thomas Gleixner @ 2014-06-22  9:20 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: LKML, Steven Rostedt, Peter Zijlstra

On Sun, 22 Jun 2014, Ingo Molnar wrote:
> * Thomas Gleixner <tglx@linutronix.de> wrote:
> 
> > Hopefully the last iteration of this.
> > 
> > I addressed all the review comments and added the following new ones:
> > 
> > 1/n: plug unlock race.
> >    
> > 	Patch was separately posted already in the safety of
> >    	*mutex_unlock() thread.
> > 
> > 2/n: Document and simplify the trylock path
> > 
> > 3/n: Document and simplify try_to_take_rtmutex()
> 
> JFYI, the Blackfin cross-build fails with:
> 
> /home/mingo/tip/kernel/locking/rtmutex.c: In function 'rt_mutex_adjust_prio_chain':
> /home/mingo/tip/kernel/locking/rtmutex.c:739: error: unable to find a register to spill in class 'CCREGS'
> /home/mingo/tip/kernel/locking/rtmutex.c:739: error: this is the insn:
> (insn 82 75 107 2 /home/mingo/tip/kernel/locking/rtmutex.c:487 (set 
> (reg:BI 186)
>         (eq:BI (reg/v/f:SI 9 P1 [orig:85 top_waiter ] [85])
>             (const_int 0 [0x0]))) 111 {compare_eq} (nil))
> /home/mingo/tip/kernel/locking/rtmutex.c:739: confused by earlier errors, bailing out

https://www.kernel.org/pub/tools/crosstool/files/bin/x86_64/4.6.3/x86_64-gcc-4.6.3-nolibc_bfin-uclinux.tar.xz

compiles that file nicely.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 27+ messages in thread
