From: Jason Low <jason.low2@hpe.com>
To: Peter Zijlstra <peterz@infradead.org>,
	Ingo Molnar <mingo@redhat.com>,
	imre.deak@intel.com
Cc: linux-kernel@vger.kernel.org, Jason Low <jason.low2@hp.com>,
	Jason Low <jason.low2@hpe.com>, Waiman Long <Waiman.Long@hpe.com>,
	Davidlohr Bueso <dave@stgolabs.net>,
	Tim Chen <tim.c.chen@linux.intel.com>,
	terry.rudd@hpe.com, "Paul E. McKenney" <paulmck@us.ibm.com>
Subject: [PATCH v2] locking/mutex: Prevent lock starvation when spinning is enabled
Date: Wed, 10 Aug 2016 11:44:08 -0700
Message-ID: <1470854648.17361.9.camel@j-VirtualBox>

Imre reported an issue where threads trying to acquire a mutex can get
starved. A thread sleeping on a mutex can be delayed for an arbitrarily
long time because other threads can continually steal the lock in the
fastpath and/or through optimistic spinning.

Waiman has developed patches that allow waiters to return to optimistic
spinning, which reduces the probability that starvation occurs. However,
Imre still sees the starvation problem in his workloads when optimistic
spinning is disabled.

This patch adds an additional boolean to the mutex that is used in the
CONFIG_SMP && !CONFIG_MUTEX_SPIN_ON_OWNER case. The flag signifies
whether other threads need to yield to a waiter, and it gets set when a
waiter spends too much time waiting for the mutex. The threshold is
currently set to 16 wakeups; once a waiter has been woken up that many
times without acquiring the lock, other threads must yield to the top
waiter. The flag gets cleared immediately after the top waiter acquires
the mutex.

This prevents waiters from getting starved without sacrificing much
performance, as lock stealing is still allowed and is only temporarily
disabled when it is detected that a waiter has been waiting for too
long.
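
As a rough user-space illustration of the idea (this is not the kernel
code; a trivial test-and-set lock stands in for the mutex fastpath and
sched_yield() stands in for sleeping on the wait list):

#include <sched.h>
#include <stdatomic.h>
#include <stdbool.h>

#define WAKEUP_THRESHOLD	16

struct demo_mutex {
	atomic_int  locked;		/* 1 = locked, 0 = unlocked */
	atomic_bool yield_to_waiter;	/* temporarily forbid lock stealing */
};

static bool demo_trylock(struct demo_mutex *m)
{
	return atomic_exchange(&m->locked, 1) == 0;
}

static void demo_lock(struct demo_mutex *m)
{
	int wakeups = 0;

	/* Fastpath/steal attempt, skipped while a waiter is starving. */
	if (!atomic_load(&m->yield_to_waiter) && demo_trylock(m))
		return;

	for (;;) {
		/*
		 * A waiter that has already been woken more than once may
		 * take the lock even while the yield flag is set, and it
		 * clears the flag once it succeeds.
		 */
		if ((!atomic_load(&m->yield_to_waiter) || wakeups > 1) &&
		    demo_trylock(m)) {
			if (wakeups > 1)
				atomic_store(&m->yield_to_waiter, false);
			return;
		}

		sched_yield();	/* stand-in for sleeping until woken */

		/* Woken too many times without the lock: make others yield. */
		if (++wakeups >= WAKEUP_THRESHOLD)
			atomic_store(&m->yield_to_waiter, true);
	}
}

static void demo_unlock(struct demo_mutex *m)
{
	atomic_store(&m->locked, 0);
}

The patch below implements the equivalent checks in the mutex fastpath
and slowpath.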

Reported-by: Imre Deak <imre.deak@intel.com>
Signed-off-by: Jason Low <jason.low2@hpe.com>
---
v1->v2:
- Addressed Waiman's suggestion that the yield_to_waiter flag is only
  needed in the CONFIG_SMP case.

- Make sure to only clear the flag if the thread is the top waiter.

- Refactor code to clear flag into an inline function.

- Rename the 'loops' variable to 'wakeups'.

 include/linux/mutex.h  |  2 ++
 kernel/locking/mutex.c | 83 +++++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 74 insertions(+), 11 deletions(-)

diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 2cb7531..5643a233 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -57,6 +57,8 @@ struct mutex {
 #endif
 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
 	struct optimistic_spin_queue osq; /* Spinner MCS lock */
+#elif defined(CONFIG_SMP)
+	bool yield_to_waiter; /* Prevent starvation when spinning disabled */
 #endif
 #ifdef CONFIG_DEBUG_MUTEXES
 	void			*magic;
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index a70b90d..faf31a0 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -55,6 +55,8 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
 	mutex_clear_owner(lock);
 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
 	osq_lock_init(&lock->osq);
+#elif defined(CONFIG_SMP)
+	lock->yield_to_waiter = false;
 #endif
 
 	debug_mutex_init(lock, name, key);
@@ -71,6 +73,9 @@ EXPORT_SYMBOL(__mutex_init);
  */
 __visible void __sched __mutex_lock_slowpath(atomic_t *lock_count);
 
+
+static inline bool need_yield_to_waiter(struct mutex *lock);
+
 /**
  * mutex_lock - acquire the mutex
  * @lock: the mutex to be acquired
@@ -99,7 +104,10 @@ void __sched mutex_lock(struct mutex *lock)
 	 * The locking fastpath is the 1->0 transition from
 	 * 'unlocked' into 'locked' state.
 	 */
-	__mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath);
+	if (!need_yield_to_waiter(lock))
+		__mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath);
+	else
+		__mutex_lock_slowpath(&lock->count);
 	mutex_set_owner(lock);
 }
 
@@ -398,12 +406,51 @@ done:
 
 	return false;
 }
+
+static inline void do_yield_to_waiter(struct mutex *lock, int *wakeups)
+{
+	return;
+}
+
+static inline void clear_yield_to_waiter(struct mutex *lock)
+{
+	return;
+}
+
+static inline bool need_yield_to_waiter(struct mutex *lock)
+{
+	return false;
+}
+
 #else
 static bool mutex_optimistic_spin(struct mutex *lock,
 				  struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx)
 {
 	return false;
 }
+
+#define MUTEX_WAKEUP_THRESHOLD 16
+
+static inline void do_yield_to_waiter(struct mutex *lock, int *wakeups)
+{
+	*wakeups += 1;
+
+	if (*wakeups < MUTEX_WAKEUP_THRESHOLD)
+		return;
+
+	if (!lock->yield_to_waiter)
+		lock->yield_to_waiter = true;
+}
+
+static inline void clear_yield_to_waiter(struct mutex *lock)
+{
+	lock->yield_to_waiter = false;
+}
+
+static inline bool need_yield_to_waiter(struct mutex *lock)
+{
+	return lock->yield_to_waiter;
+}
 #endif
 
 __visible __used noinline
@@ -510,6 +557,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 	struct mutex_waiter waiter;
 	unsigned long flags;
 	int ret;
+	int wakeups = 0;
 
 	if (use_ww_ctx) {
 		struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
@@ -532,7 +580,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 	 * Once more, try to acquire the lock. Only try-lock the mutex if
 	 * it is unlocked to reduce unnecessary xchg() operations.
 	 */
-	if (!mutex_is_locked(lock) &&
+	if (!need_yield_to_waiter(lock) && !mutex_is_locked(lock) &&
 	    (atomic_xchg_acquire(&lock->count, 0) == 1))
 		goto skip_wait;
 
@@ -556,8 +604,12 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 		 * other waiters. We only attempt the xchg if the count is
 		 * non-negative in order to avoid unnecessary xchg operations:
 		 */
-		if (atomic_read(&lock->count) >= 0 &&
-		    (atomic_xchg_acquire(&lock->count, -1) == 1))
+		if ((!need_yield_to_waiter(lock) || wakeups > 1) &&
+		    atomic_read(&lock->count) >= 0 &&
+		    (atomic_xchg_acquire(&lock->count, -1) == 1)) {
+			if (wakeups > 1)
+				clear_yield_to_waiter(lock);
 			break;
+		}
 
 		/*
@@ -581,6 +633,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 		spin_unlock_mutex(&lock->wait_lock, flags);
 		schedule_preempt_disabled();
 		spin_lock_mutex(&lock->wait_lock, flags);
+		do_yield_to_waiter(lock, &wakeups);
 	}
 	__set_task_state(task, TASK_RUNNING);
 
@@ -789,10 +842,13 @@ __mutex_lock_interruptible_slowpath(struct mutex *lock);
  */
 int __sched mutex_lock_interruptible(struct mutex *lock)
 {
-	int ret;
+	int ret = 1;
 
 	might_sleep();
-	ret =  __mutex_fastpath_lock_retval(&lock->count);
+
+	if (!need_yield_to_waiter(lock))
+		ret =  __mutex_fastpath_lock_retval(&lock->count);
+
 	if (likely(!ret)) {
 		mutex_set_owner(lock);
 		return 0;
@@ -804,10 +860,13 @@ EXPORT_SYMBOL(mutex_lock_interruptible);
 
 int __sched mutex_lock_killable(struct mutex *lock)
 {
-	int ret;
+	int ret = 1;
 
 	might_sleep();
-	ret = __mutex_fastpath_lock_retval(&lock->count);
+
+	if (!need_yield_to_waiter(lock))
+		ret = __mutex_fastpath_lock_retval(&lock->count);
+
 	if (likely(!ret)) {
 		mutex_set_owner(lock);
 		return 0;
@@ -917,11 +976,12 @@ EXPORT_SYMBOL(mutex_trylock);
 int __sched
 __ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
 {
-	int ret;
+	int ret = 1;
 
 	might_sleep();
 
-	ret = __mutex_fastpath_lock_retval(&lock->base.count);
+	if (!need_yield_to_waiter(&lock->base))
+		ret = __mutex_fastpath_lock_retval(&lock->base.count);
 
 	if (likely(!ret)) {
 		ww_mutex_set_context_fastpath(lock, ctx);
@@ -935,11 +995,12 @@ EXPORT_SYMBOL(__ww_mutex_lock);
 int __sched
 __ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
 {
-	int ret;
+	int ret = 1;
 
 	might_sleep();
 
-	ret = __mutex_fastpath_lock_retval(&lock->base.count);
+	if (!need_yield_to_waiter(&lock->base))
+		ret = __mutex_fastpath_lock_retval(&lock->base.count);
 
 	if (likely(!ret)) {
 		ww_mutex_set_context_fastpath(lock, ctx);
-- 
2.1.4
