linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Davidlohr Bueso <dave@stgolabs.net>
To: Peter Zijlstra <peterz@infradead.org>,
	Thomas Gleixner <tglx@linutronix.de>,
	Ingo Molnar <mingo@redhat.com>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>,
	Linus Torvalds <torvalds@linux-foundation.org>,
	Chris Mason <clm@fb.com>, Steven Rostedt <rostedt@goodmis.org>,
	Manfred Spraul <manfred@colorfullife.com>,
	George Spelvin <linux@horizon.com>,
	linux-kernel@vger.kernel.org, Davidlohr Bueso <dave@stgolabs.net>,
	Davidlohr Bueso <dbueso@suse.de>
Subject: [PATCH 2/3] futex: lockless wakeups
Date: Fri,  1 May 2015 08:27:51 -0700	[thread overview]
Message-ID: <1430494072-30283-3-git-send-email-dave@stgolabs.net> (raw)
In-Reply-To: <1430494072-30283-1-git-send-email-dave@stgolabs.net>

Given the overall futex architecture, any chance of reducing
hb->lock contention is welcome. In this particular case, using
wake-queues to enable lockless wakeups addresses very much real
world performance concerns, even cases of soft-lockups in cases
of large amounts of blocked tasks (which is not hard to find in
large boxes, using but just a handful of futex).

At the lowest level, this patch can reduce latency of a single thread
attempting to acquire hb->lock in highly contended scenarios by a
up to 2x. At lower counts of nr_wake there are no regressions,
confirming, of course, that the wake_q handling overhead is practically
non existent. For instance, while a fair amount of variation,
the extended pef-bench wakeup benchmark shows for a 20 core machine
the following avg per-thread time to wakeup its share of tasks:

nr_thr	ms-before	ms-after
16 	0.0590		0.0215
32 	0.0396		0.0220
48 	0.0417		0.0182
64 	0.0536		0.0236
80 	0.0414		0.0097
96 	0.0672		0.0152

Naturally, this can cause spurious wakeups. However there is no core code
that cannot handle them afaict, and furthermore tglx does have the point
that other events can already trigger them anyway.

Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
---
 kernel/futex.c | 33 +++++++++++++++++----------------
 1 file changed, 17 insertions(+), 16 deletions(-)

diff --git a/kernel/futex.c b/kernel/futex.c
index 2579e40..f9984c3 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1090,9 +1090,11 @@ static void __unqueue_futex(struct futex_q *q)
 
 /*
  * The hash bucket lock must be held when this is called.
- * Afterwards, the futex_q must not be accessed.
+ * Afterwards, the futex_q must not be accessed. Callers
+ * must ensure to later call wake_up_q() for the actual
+ * wakeups to occur.
  */
-static void wake_futex(struct futex_q *q)
+static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
 {
 	struct task_struct *p = q->task;
 
@@ -1100,14 +1102,10 @@ static void wake_futex(struct futex_q *q)
 		return;
 
 	/*
-	 * We set q->lock_ptr = NULL _before_ we wake up the task. If
-	 * a non-futex wake up happens on another CPU then the task
-	 * might exit and p would dereference a non-existing task
-	 * struct. Prevent this by holding a reference on p across the
-	 * wake up.
+	 * Queue the task for later wakeup for after we've released
+	 * the hb->lock. wake_q_add() grabs reference to p.
 	 */
-	get_task_struct(p);
-
+	wake_q_add(wake_q, p);
 	__unqueue_futex(q);
 	/*
 	 * The waiting task can free the futex_q as soon as
@@ -1117,9 +1115,6 @@ static void wake_futex(struct futex_q *q)
 	 */
 	smp_wmb();
 	q->lock_ptr = NULL;
-
-	wake_up_state(p, TASK_NORMAL);
-	put_task_struct(p);
 }
 
 static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
@@ -1217,6 +1212,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
 	struct futex_q *this, *next;
 	union futex_key key = FUTEX_KEY_INIT;
 	int ret;
+	WAKE_Q(wake_q);
 
 	if (!bitset)
 		return -EINVAL;
@@ -1244,13 +1240,14 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
 			if (!(this->bitset & bitset))
 				continue;
 
-			wake_futex(this);
+			mark_wake_futex(&wake_q, this);
 			if (++ret >= nr_wake)
 				break;
 		}
 	}
 
 	spin_unlock(&hb->lock);
+	wake_up_q(&wake_q);
 out_put_key:
 	put_futex_key(&key);
 out:
@@ -1269,6 +1266,7 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
 	struct futex_hash_bucket *hb1, *hb2;
 	struct futex_q *this, *next;
 	int ret, op_ret;
+	WAKE_Q(wake_q);
 
 retry:
 	ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
@@ -1320,7 +1318,7 @@ retry_private:
 				ret = -EINVAL;
 				goto out_unlock;
 			}
-			wake_futex(this);
+			mark_wake_futex(&wake_q, this);
 			if (++ret >= nr_wake)
 				break;
 		}
@@ -1334,7 +1332,7 @@ retry_private:
 					ret = -EINVAL;
 					goto out_unlock;
 				}
-				wake_futex(this);
+				mark_wake_futex(&wake_q, this);
 				if (++op_ret >= nr_wake2)
 					break;
 			}
@@ -1344,6 +1342,7 @@ retry_private:
 
 out_unlock:
 	double_unlock_hb(hb1, hb2);
+	wake_up_q(&wake_q);
 out_put_keys:
 	put_futex_key(&key2);
 out_put_key1:
@@ -1503,6 +1502,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
 	struct futex_pi_state *pi_state = NULL;
 	struct futex_hash_bucket *hb1, *hb2;
 	struct futex_q *this, *next;
+	WAKE_Q(wake_q);
 
 	if (requeue_pi) {
 		/*
@@ -1679,7 +1679,7 @@ retry_private:
 		 * woken by futex_unlock_pi().
 		 */
 		if (++task_count <= nr_wake && !requeue_pi) {
-			wake_futex(this);
+			mark_wake_futex(&wake_q, this);
 			continue;
 		}
 
@@ -1719,6 +1719,7 @@ retry_private:
 out_unlock:
 	free_pi_state(pi_state);
 	double_unlock_hb(hb1, hb2);
+	wake_up_q(&wake_q);
 	hb_waiters_dec(hb2);
 
 	/*
-- 
2.1.4


  parent reply	other threads:[~2015-05-01 15:28 UTC|newest]

Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-05-01 15:27 [PATCH v2 0/3] kernel: lockless wake-queues Davidlohr Bueso
2015-05-01 15:27 ` [PATCH 1/3] sched: " Davidlohr Bueso
2015-05-08 13:23   ` [tip:sched/core] sched: Implement " tip-bot for Peter Zijlstra
2015-05-01 15:27 ` Davidlohr Bueso [this message]
2015-05-08 13:23   ` [tip:sched/core] futex: Implement lockless wakeups tip-bot for Davidlohr Bueso
2015-05-01 15:27 ` [PATCH 3/3] ipc/mqueue: lockless pipelined wakeups Davidlohr Bueso
2015-05-01 21:52   ` George Spelvin
2015-05-02  0:35     ` Davidlohr Bueso
2015-05-05  0:37       ` George Spelvin
2015-05-04 14:02   ` [PATCH v2 " Davidlohr Bueso
2015-05-08 13:24     ` [tip:sched/core] ipc/mqueue: Implement " tip-bot for Davidlohr Bueso

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1430494072-30283-3-git-send-email-dave@stgolabs.net \
    --to=dave@stgolabs.net \
    --cc=bigeasy@linutronix.de \
    --cc=clm@fb.com \
    --cc=dbueso@suse.de \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux@horizon.com \
    --cc=manfred@colorfullife.com \
    --cc=mingo@redhat.com \
    --cc=peterz@infradead.org \
    --cc=rostedt@goodmis.org \
    --cc=tglx@linutronix.de \
    --cc=torvalds@linux-foundation.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).