* [RFC patch 0/5] futex: Allow lockless empty check of hashbucket plist in futex_wake()
@ 2013-11-25 20:58 Thomas Gleixner
  2013-11-25 20:58 ` [RFC patch 1/5] futex: Misc cleanups Thomas Gleixner
                   ` (5 more replies)
  0 siblings, 6 replies; 31+ messages in thread
From: Thomas Gleixner @ 2013-11-25 20:58 UTC (permalink / raw)
  To: LKML
  Cc: Davidlohr Bueso, Jason Low, Ingo Molnar, Darren Hart,
	Peter Zijlstra, Mike Galbraith, Jeff Mahoney, Linus Torvalds,
	Scott Norton, Tom Vaden, Aswin Chandramouleeswaran, Waiman Long,
	Paul E. McKenney

The patch set from Davidlohr [1] attempted the same via an
atomic counter of waiters in a hash bucket. The atomic counter access
provided enough serialization for x86 so that a failure is not
observable in testing, but does not provide any guarantees.

The same can be achieved with a smp_mb() pair including proper
guarantees for all architectures.
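
In rough C, the pairing looks like this (a simplified illustration, not
the actual kernel code; the real changes are in patches 4/5 and 5/5.
waiter_side()/waker_side() are made-up names, while the plist_*,
smp_mb() and get_futex_value_locked() calls are the existing
kernel/futex.c primitives):

static int waiter_side(u32 __user *uaddr, u32 val,
		       struct futex_hash_bucket *hb, struct futex_q *q)
{
	u32 uval;

	spin_lock(&hb->lock);
	plist_node_init(&q->list, 0);
	plist_add(&q->list, &hb->chain);  /* become visible to wakers */
	smp_mb();                         /* A: pairs with B below */

	if (get_futex_value_locked(&uval, uaddr) || uval != val) {
		/* value changed (or fault): unqueue and bail */
		plist_del(&q->list, &hb->chain);
		spin_unlock(&hb->lock);
		return -EWOULDBLOCK;
	}
	set_current_state(TASK_INTERRUPTIBLE);
	spin_unlock(&hb->lock);
	schedule();     /* the waker removes us from the plist on wakeup */
	return 0;
}

static void waker_side(struct futex_hash_bucket *hb)
{
	/* user space already stored the new futex value */
	smp_mb();                         /* B: pairs with A above */

	if (plist_head_empty(&hb->chain))
		return;                   /* no waiter queued: skip the lock */

	spin_lock(&hb->lock);
	/* wake the waiters queued on this futex */
	spin_unlock(&hb->lock);
}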

The following series provides an incremental approach to this and adds
documentation of the ordering guarantees of futexes.

Note, this is RFC and needs a lot of review, testing and proper
performance numbers for the following scenarios:

1) Test case where a single waiter is about to queue itself, i.e. the
   test case Davidlohr used to gather his numbers.

2) Test case where the hash bucket is never empty. This allows us
   to determine the smp_mb() overhead for cases which are not
   optimized by the single waiter per bucket case.

These tests need to be done on x86 AND on other architectures where
the smp_mb() might be more expensive than on x86.

Thanks,

	tglx

---
[1] http://lkml.kernel.org/r/1385168197-8612-5-git-send-email-davidlohr@hp.com




* [RFC patch 1/5] futex: Misc cleanups
  2013-11-25 20:58 [RFC patch 0/5] futex: Allow lockless empty check of hashbucket plist in futex_wake() Thomas Gleixner
@ 2013-11-25 20:58 ` Thomas Gleixner
  2013-11-25 20:58 ` [RFC patch 2/5] futex: Document ordering guarantees Thomas Gleixner
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 31+ messages in thread
From: Thomas Gleixner @ 2013-11-25 20:58 UTC (permalink / raw)
  To: LKML
  Cc: Davidlohr Bueso, Jason Low, Ingo Molnar, Darren Hart,
	Peter Zijlstra, Mike Galbraith, Jeff Mahoney, Linus Torvalds,
	Scott Norton, Tom Vaden, Aswin Chandramouleeswaran, Waiman Long,
	Paul E. McKenney

[-- Attachment #1: futex-misc-cleanups.patch --]
[-- Type: text/plain, Size: 4855 bytes --]

From: Jason Low <jason.low2@hp.com>

- Remove unnecessary head variables.
- Delete unused parameter in queue_unlock().

Signed-off-by: Jason Low <jason.low2@hp.com>
Signed-off-by: Davidlohr Bueso <davidlohr@hp.com>
Link: http://lkml.kernel.org/r/1385168197-8612-2-git-send-email-davidlohr@hp.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/futex.c |   40 ++++++++++++----------------------------
 1 file changed, 12 insertions(+), 28 deletions(-)

Index: linux-2.6/kernel/futex.c
===================================================================
--- linux-2.6.orig/kernel/futex.c
+++ linux-2.6/kernel/futex.c
@@ -597,13 +597,10 @@ lookup_pi_state(u32 uval, struct futex_h
 {
 	struct futex_pi_state *pi_state = NULL;
 	struct futex_q *this, *next;
-	struct plist_head *head;
 	struct task_struct *p;
 	pid_t pid = uval & FUTEX_TID_MASK;
 
-	head = &hb->chain;
-
-	plist_for_each_entry_safe(this, next, head, list) {
+	plist_for_each_entry_safe(this, next, &hb->chain, list) {
 		if (match_futex(&this->key, key)) {
 			/*
 			 * Another waiter already exists - bump up
@@ -985,7 +982,6 @@ futex_wake(u32 __user *uaddr, unsigned i
 {
 	struct futex_hash_bucket *hb;
 	struct futex_q *this, *next;
-	struct plist_head *head;
 	union futex_key key = FUTEX_KEY_INIT;
 	int ret;
 
@@ -998,9 +994,8 @@ futex_wake(u32 __user *uaddr, unsigned i
 
 	hb = hash_futex(&key);
 	spin_lock(&hb->lock);
-	head = &hb->chain;
 
-	plist_for_each_entry_safe(this, next, head, list) {
+	plist_for_each_entry_safe(this, next, &hb->chain, list) {
 		if (match_futex (&this->key, &key)) {
 			if (this->pi_state || this->rt_waiter) {
 				ret = -EINVAL;
@@ -1033,7 +1028,6 @@ futex_wake_op(u32 __user *uaddr1, unsign
 {
 	union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
 	struct futex_hash_bucket *hb1, *hb2;
-	struct plist_head *head;
 	struct futex_q *this, *next;
 	int ret, op_ret;
 
@@ -1081,9 +1075,7 @@ retry_private:
 		goto retry;
 	}
 
-	head = &hb1->chain;
-
-	plist_for_each_entry_safe(this, next, head, list) {
+	plist_for_each_entry_safe(this, next, &hb1->chain, list) {
 		if (match_futex (&this->key, &key1)) {
 			if (this->pi_state || this->rt_waiter) {
 				ret = -EINVAL;
@@ -1096,10 +1088,8 @@ retry_private:
 	}
 
 	if (op_ret > 0) {
-		head = &hb2->chain;
-
 		op_ret = 0;
-		plist_for_each_entry_safe(this, next, head, list) {
+		plist_for_each_entry_safe(this, next, &hb2->chain, list) {
 			if (match_futex (&this->key, &key2)) {
 				if (this->pi_state || this->rt_waiter) {
 					ret = -EINVAL;
@@ -1269,7 +1259,6 @@ static int futex_requeue(u32 __user *uad
 	int drop_count = 0, task_count = 0, ret;
 	struct futex_pi_state *pi_state = NULL;
 	struct futex_hash_bucket *hb1, *hb2;
-	struct plist_head *head1;
 	struct futex_q *this, *next;
 	u32 curval2;
 
@@ -1392,8 +1381,7 @@ retry_private:
 		}
 	}
 
-	head1 = &hb1->chain;
-	plist_for_each_entry_safe(this, next, head1, list) {
+	plist_for_each_entry_safe(this, next, &hb1->chain, list) {
 		if (task_count - nr_wake >= nr_requeue)
 			break;
 
@@ -1492,8 +1480,7 @@ static inline struct futex_hash_bucket *
 	return hb;
 }
 
-static inline void
-queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
+static inline void queue_unlock(struct futex_hash_bucket *hb)
 	__releases(&hb->lock)
 {
 	spin_unlock(&hb->lock);
@@ -1866,7 +1853,7 @@ retry_private:
 	ret = get_futex_value_locked(&uval, uaddr);
 
 	if (ret) {
-		queue_unlock(q, *hb);
+		queue_unlock(*hb);
 
 		ret = get_user(uval, uaddr);
 		if (ret)
@@ -1880,7 +1867,7 @@ retry_private:
 	}
 
 	if (uval != val) {
-		queue_unlock(q, *hb);
+		queue_unlock(*hb);
 		ret = -EWOULDBLOCK;
 	}
 
@@ -2028,7 +2015,7 @@ retry_private:
 			 * Task is exiting and we just wait for the
 			 * exit to complete.
 			 */
-			queue_unlock(&q, hb);
+			queue_unlock(hb);
 			put_futex_key(&q.key);
 			cond_resched();
 			goto retry;
@@ -2080,7 +2067,7 @@ retry_private:
 	goto out_put_key;
 
 out_unlock_put_key:
-	queue_unlock(&q, hb);
+	queue_unlock(hb);
 
 out_put_key:
 	put_futex_key(&q.key);
@@ -2090,7 +2077,7 @@ out:
 	return ret != -EINTR ? ret : -ERESTARTNOINTR;
 
 uaddr_faulted:
-	queue_unlock(&q, hb);
+	queue_unlock(hb);
 
 	ret = fault_in_user_writeable(uaddr);
 	if (ret)
@@ -2112,7 +2099,6 @@ static int futex_unlock_pi(u32 __user *u
 {
 	struct futex_hash_bucket *hb;
 	struct futex_q *this, *next;
-	struct plist_head *head;
 	union futex_key key = FUTEX_KEY_INIT;
 	u32 uval, vpid = task_pid_vnr(current);
 	int ret;
@@ -2152,9 +2138,7 @@ retry:
 	 * Ok, other tasks may need to be woken up - check waiters
 	 * and do the wakeup if necessary:
 	 */
-	head = &hb->chain;
-
-	plist_for_each_entry_safe(this, next, head, list) {
+	plist_for_each_entry_safe(this, next, &hb->chain, list) {
 		if (!match_futex (&this->key, &key))
 			continue;
 		ret = wake_futex_pi(uaddr, uval, this);




* [RFC patch 2/5] futex: Document ordering guarantees
  2013-11-25 20:58 [RFC patch 0/5] futex: Allow lockless empty check of hashbucket plist in futex_wake() Thomas Gleixner
  2013-11-25 20:58 ` [RFC patch 1/5] futex: Misc cleanups Thomas Gleixner
@ 2013-11-25 20:58 ` Thomas Gleixner
  2013-11-25 20:58 ` [RFC patch 3/5] futex: Split out unlock from queue_me() Thomas Gleixner
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 31+ messages in thread
From: Thomas Gleixner @ 2013-11-25 20:58 UTC (permalink / raw)
  To: LKML
  Cc: Davidlohr Bueso, Jason Low, Ingo Molnar, Darren Hart,
	Peter Zijlstra, Mike Galbraith, Jeff Mahoney, Linus Torvalds,
	Scott Norton, Tom Vaden, Aswin Chandramouleeswaran, Waiman Long,
	Paul E. McKenney

[-- Attachment #1: futex-document-serialization.patch --]
[-- Type: text/plain, Size: 2414 bytes --]

That's essential if you want to hack on futexes.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/futex.c |   57 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)

Index: linux-2.6/kernel/futex.c
===================================================================
--- linux-2.6.orig/kernel/futex.c
+++ linux-2.6/kernel/futex.c
@@ -68,6 +68,63 @@
 
 #include "locking/rtmutex_common.h"
 
+/*
+ * Basic futex operation and ordering guarantees:
+ *
+ * The waiter reads the futex value in user space and calls
+ * futex_wait(). It computes the hash bucket and acquires the hash
+ * bucket lock. After that it reads the futex user space value again
+ * and verifies that the data has not changed. If it has not changed
+ * it enqueues itself into the hash bucket, releases the hash
+ * bucket lock and schedules.
+ *
+ * The waker side modifies the user space value of the futex and calls
+ * futex_wake(). It computes the hash bucket and acquires the hash
+ * bucket lock. Then it looks for waiters on that futex in the hash
+ * bucket and wakes them.
+ *
+ * Note, that the spin_lock serializes waiters and wakers, so that the
+ * following scenario is avoided:
+ *
+ * CPU 0				CPU 1
+ * val = *futex;
+ * sys_futex(WAIT, futex, val);
+ *   futex_wait(futex, val);
+ *   uval = *futex;
+ *					*futex = newval;
+ *					sys_futex(WAKE, futex);
+ *					  futex_wake(futex);
+ *					  if (queue_empty())
+ *					    return;
+ *   if (uval == val)
+ *      lock(hash_bucket(futex));
+ *      queue();
+ *	unlock(hash_bucket(futex));
+ *	schedule();
+ *
+ * This would cause the waiter on CPU 0 to wait forever because it
+ * missed the transition of the user space value from val to newval
+ * and the waker did not find the waiter in the hash bucket queue.
+ * The spinlock serializes that:
+ *
+ * CPU 0				CPU 1
+ * val = *futex;
+ * sys_futex(WAIT, futex, val);
+ *   futex_wait(futex, val);
+ *   lock(hash_bucket(futex));
+ *   uval = *futex;
+ *					*futex = newval;
+ *					sys_futex(WAKE, futex);
+ *					  futex_wake(futex);
+ *					  lock(hash_bucket(futex));
+ *   if (uval == val)
+ *      queue();
+ *	unlock(hash_bucket(futex));
+ *	schedule();			  if (!queue_empty())
+ *					    wake_waiters(futex);
+ *					  unlock(hash_bucket(futex));
+ */
+
 int __read_mostly futex_cmpxchg_enabled;
 
 #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)




* [RFC patch 3/5] futex: Split out unlock from queue_me()
  2013-11-25 20:58 [RFC patch 0/5] futex: Allow lockless empty check of hashbucket plist in futex_wake() Thomas Gleixner
  2013-11-25 20:58 ` [RFC patch 1/5] futex: Misc cleanups Thomas Gleixner
  2013-11-25 20:58 ` [RFC patch 2/5] futex: Document ordering guarantees Thomas Gleixner
@ 2013-11-25 20:58 ` Thomas Gleixner
  2013-11-25 20:58 ` [RFC patch 4/5] futex: Enqueue waiter before user space check Thomas Gleixner
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 31+ messages in thread
From: Thomas Gleixner @ 2013-11-25 20:58 UTC (permalink / raw)
  To: LKML
  Cc: Davidlohr Bueso, Jason Low, Ingo Molnar, Darren Hart,
	Peter Zijlstra, Mike Galbraith, Jeff Mahoney, Linus Torvalds,
	Scott Norton, Tom Vaden, Aswin Chandramouleeswaran, Waiman Long,
	Paul E. McKenney

[-- Attachment #1: futex-split-out-unlock-from-queue-me.patch --]
[-- Type: text/plain, Size: 2324 bytes --]

Preparatory patch for a lockless empty check of the hash bucket plist
in futex_wake().

No functional change.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/futex.c |   24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

Index: linux-2.6/kernel/futex.c
===================================================================
--- linux-2.6.orig/kernel/futex.c
+++ linux-2.6/kernel/futex.c
@@ -1548,15 +1548,14 @@ static inline void queue_unlock(struct f
  * @q:	The futex_q to enqueue
  * @hb:	The destination hash bucket
  *
- * The hb->lock must be held by the caller, and is released here. A call to
- * queue_me() is typically paired with exactly one call to unqueue_me().  The
- * exceptions involve the PI related operations, which may use unqueue_me_pi()
- * or nothing if the unqueue is done as part of the wake process and the unqueue
- * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
- * an example).
+ * The hb->lock must be held by the caller. A call to queue_me() is
+ * typically paired with exactly one call to unqueue_me(). The
+ * exceptions involve the PI related operations, which may use
+ * unqueue_me_pi() or nothing if the unqueue is done as part of the
+ * wake process and the unqueue state is implicit in the state of
+ * woken task (see futex_wait_requeue_pi() for an example).
  */
 static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
-	__releases(&hb->lock)
 {
 	int prio;
 
@@ -1573,7 +1572,6 @@ static inline void queue_me(struct futex
 	plist_node_init(&q->list, prio);
 	plist_add(&q->list, &hb->chain);
 	q->task = current;
-	spin_unlock(&hb->lock);
 }
 
 /**
@@ -1834,6 +1832,11 @@ static void futex_wait_queue_me(struct f
 	 */
 	set_current_state(TASK_INTERRUPTIBLE);
 	queue_me(q, hb);
+	/*
+	 * Unlock _AFTER_ we queued ourself. See the comment describing
+	 * the futex ordering guarantees on top of this file.
+	 */
+	queue_unlock(hb);
 
 	/* Arm the timer */
 	if (timeout) {
@@ -2085,6 +2088,11 @@ retry_private:
 	 * Only actually queue now that the atomic ops are done:
 	 */
 	queue_me(&q, hb);
+	/*
+	 * Unlock _AFTER_ we queued ourself. See the comment describing
+	 * the futex ordering guarantees on top of this file.
+	 */
+	queue_unlock(hb);
 
 	WARN_ON(!q.pi_state);
 	/*




* [RFC patch 4/5] futex: Enqueue waiter before user space check
  2013-11-25 20:58 [RFC patch 0/5] futex: Allow lockless empty check of hashbucket plist in futex_wake() Thomas Gleixner
                   ` (2 preceding siblings ...)
  2013-11-25 20:58 ` [RFC patch 3/5] futex: Split out unlock from queue_me() Thomas Gleixner
@ 2013-11-25 20:58 ` Thomas Gleixner
  2013-11-26  0:20   ` Darren Hart
  2013-11-25 20:58 ` [RFC patch 5/5] futex: Allow lockless empty check of hash bucket plist Thomas Gleixner
  2013-11-26  8:12 ` [RFC patch 0/5] futex: Allow lockless empty check of hashbucket plist in futex_wake() Davidlohr Bueso
  5 siblings, 1 reply; 31+ messages in thread
From: Thomas Gleixner @ 2013-11-25 20:58 UTC (permalink / raw)
  To: LKML
  Cc: Davidlohr Bueso, Jason Low, Ingo Molnar, Darren Hart,
	Peter Zijlstra, Mike Galbraith, Jeff Mahoney, Linus Torvalds,
	Scott Norton, Tom Vaden, Aswin Chandramouleeswaran, Waiman Long,
	Paul E. McKenney

[-- Attachment #1: futex-queue-early.patch --]
[-- Type: text/plain, Size: 6325 bytes --]

This changes the queue ordering on the waiter side from 

     lock(hash_bucket);
     validate user space value();
     queue();
     unlock(hash_bucket);

to
     lock(hash_bucket);
     queue();
     validate user space value();
     unlock(hash_bucket);

This is a preparatory patch for a lockless empty check of the hash
bucket plist.

No functional implication. All futex operations are still serialized
via the hash bucket lock.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/futex.c |   84 +++++++++++++++++++++++++++++++++------------------------
 1 file changed, 49 insertions(+), 35 deletions(-)

Index: linux-2.6/kernel/futex.c
===================================================================
--- linux-2.6.orig/kernel/futex.c
+++ linux-2.6/kernel/futex.c
@@ -107,22 +107,26 @@
  * and the waker did not find the waiter in the hash bucket queue.
  * The spinlock serializes that:
  *
+ *
  * CPU 0				CPU 1
  * val = *futex;
  * sys_futex(WAIT, futex, val);
  *   futex_wait(futex, val);
  *   lock(hash_bucket(futex));
+ *   queue();
  *   uval = *futex;
  *					*futex = newval;
  *					sys_futex(WAKE, futex);
  *					  futex_wake(futex);
  *					  lock(hash_bucket(futex));
  *   if (uval == val)
- *      queue();
  *	unlock(hash_bucket(futex));
  *	schedule();			  if (!queue_empty())
  *					    wake_waiters(futex);
  *					  unlock(hash_bucket(futex));
+ *
+ * The futex_lock_pi ordering is similar to that, but it has the queue
+ * operation right before unlocking hash bucket lock and scheduling.
  */
 
 int __read_mostly futex_cmpxchg_enabled;
@@ -1575,6 +1579,19 @@ static inline void queue_me(struct futex
 }
 
 /**
+ * unqueue_and_unlock() - Dequeue the futex_q and release hash bucket lock
+ * @q:	The futex_q to dequeue
+ * @hb:	The hash bucket
+ */
+static inline void
+unqueue_and_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
+	__releases(&hb->lock)
+{
+	plist_del(&q->list, &hb->chain);
+	spin_unlock(&hb->lock);
+}
+
+/**
  * unqueue_me() - Remove the futex_q from its futex_hash_bucket
  * @q:	The futex_q to unqueue
  *
@@ -1816,28 +1833,12 @@ out:
 }
 
 /**
- * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal
- * @hb:		the futex hash bucket, must be locked by the caller
+ * __futex_wait() - wait for wakeup, timeout, or signal
  * @q:		the futex_q to queue up on
  * @timeout:	the prepared hrtimer_sleeper, or null for no timeout
  */
-static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
-				struct hrtimer_sleeper *timeout)
+static void __futex_wait(struct futex_q *q, struct hrtimer_sleeper *timeout)
 {
-	/*
-	 * The task state is guaranteed to be set before another task can
-	 * wake it. set_current_state() is implemented using set_mb() and
-	 * queue_me() calls spin_unlock() upon completion, both serializing
-	 * access to the hash list and forcing another memory barrier.
-	 */
-	set_current_state(TASK_INTERRUPTIBLE);
-	queue_me(q, hb);
-	/*
-	 * Unlock _AFTER_ we queued ourself. See the comment describing
-	 * the futex ordering guarantees on top of this file.
-	 */
-	queue_unlock(hb);
-
 	/* Arm the timer */
 	if (timeout) {
 		hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
@@ -1897,10 +1898,6 @@ static int futex_wait_setup(u32 __user *
 	 * would open a race condition where we could block indefinitely with
 	 * cond(var) false, which would violate the guarantee.
 	 *
-	 * On the other hand, we insert q and release the hash-bucket only
-	 * after testing *uaddr.  This guarantees that futex_wait() will NOT
-	 * absorb a wakeup if *uaddr does not match the desired values
-	 * while the syscall executes.
 	 */
 retry:
 	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, VERIFY_READ);
@@ -1910,10 +1907,15 @@ retry:
 retry_private:
 	*hb = queue_lock(q);
 
+	/*
+	 * We queue the futex before validating the user space value.
+	 */
+	queue_me(q, *hb);
+
 	ret = get_futex_value_locked(&uval, uaddr);
 
 	if (ret) {
-		queue_unlock(*hb);
+		unqueue_and_unlock(q, *hb);
 
 		ret = get_user(uval, uaddr);
 		if (ret)
@@ -1927,13 +1929,25 @@ retry_private:
 	}
 
 	if (uval != val) {
-		queue_unlock(*hb);
+		unqueue_and_unlock(q, *hb);
 		ret = -EWOULDBLOCK;
 	}
 
 out:
-	if (ret)
+	if (!ret) {
+		/*
+		 * If we successfully queued ourself, set the state to
+		 * TASK_INTERRUPTIBLE. A potential waker cannot wake
+		 * us yet, as it waits for the hb->lock to be released
+		 * by us. A potential timeout timer is not yet armed
+		 * and a signal wakeup which happened before this is
+		 * going to be reissued by the scheduler.
+		 */
+		set_current_state(TASK_INTERRUPTIBLE);
+		queue_unlock(*hb);
+	} else {
 		put_futex_key(&q->key);
+	}
 	return ret;
 }
 
@@ -1963,15 +1977,15 @@ static int futex_wait(u32 __user *uaddr,
 
 retry:
 	/*
-	 * Prepare to wait on uaddr. On success, holds hb lock and increments
-	 * q.key refs.
+	 * Prepare to wait on uaddr. On success, increments q.key (key1) ref
+	 * count and q enqueued on hb.
 	 */
 	ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
 	if (ret)
 		goto out;
 
-	/* queue_me and wait for wakeup, timeout, or a signal. */
-	futex_wait_queue_me(hb, &q, to);
+	/* Wait for wakeup, timeout, or a signal. */
+	__futex_wait(&q, to);
 
 	/* If we were woken (and unqueued), we succeeded, whatever. */
 	ret = 0;
@@ -2308,8 +2322,8 @@ int handle_early_requeue_pi_wakeup(struc
  * without one, the pi logic would not know which task to boost/deboost, if
  * there was a need to.
  *
- * We call schedule in futex_wait_queue_me() when we enqueue and return there
- * via the following--
+ * We call schedule in __futex_wait() and return there via the
+ * following:
  * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
  * 2) wakeup on uaddr2 after a requeue
  * 3) signal
@@ -2376,14 +2390,14 @@ static int futex_wait_requeue_pi(u32 __u
 
 	/*
 	 * Prepare to wait on uaddr. On success, increments q.key (key1) ref
-	 * count.
+	 * count and q enqueued on hb.
 	 */
 	ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
 	if (ret)
 		goto out_key2;
 
-	/* Queue the futex_q, drop the hb lock, wait for wakeup. */
-	futex_wait_queue_me(hb, &q, to);
+	/* Wait for wakeup. */
+	__futex_wait(&q, to);
 
 	spin_lock(&hb->lock);
 	ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);




* [RFC patch 5/5] futex: Allow lockless empty check of hash bucket plist
  2013-11-25 20:58 [RFC patch 0/5] futex: Allow lockless empty check of hashbucket plist in futex_wake() Thomas Gleixner
                   ` (3 preceding siblings ...)
  2013-11-25 20:58 ` [RFC patch 4/5] futex: Enqueue waiter before user space check Thomas Gleixner
@ 2013-11-25 20:58 ` Thomas Gleixner
  2013-11-26  8:12 ` [RFC patch 0/5] futex: Allow lockless empty check of hashbucket plist in futex_wake() Davidlohr Bueso
  5 siblings, 0 replies; 31+ messages in thread
From: Thomas Gleixner @ 2013-11-25 20:58 UTC (permalink / raw)
  To: LKML
  Cc: Davidlohr Bueso, Jason Low, Ingo Molnar, Darren Hart,
	Peter Zijlstra, Mike Galbraith, Jeff Mahoney, Linus Torvalds,
	Scott Norton, Tom Vaden, Aswin Chandramouleeswaran, Waiman Long,
	Paul E. McKenney

[-- Attachment #1: futex-allow-lockless-check-of-hash-bucket-list.patch --]
[-- Type: text/plain, Size: 5508 bytes --]

With the early enqueue of the waiter into the hash bucket list in
place, we can guarantee the futex ordering with an smp_mb() pair
and allow the lockless empty check of the hash bucket plist in
futex_wake().

This changes the order to:

 CPU 0				CPU 1
 val = *futex;
 sys_futex(WAIT, futex, val);
   futex_wait(futex, val);
   lock(hash_bucket(futex));
   queue();

   smp_mb(); <-- paired with ---
				|
   uval = *futex;		|
				|	*futex = newval;
				|	sys_futex(WAKE, futex);
				|	  futex_wake(futex);
				|
				------>	  smp_mb();

					  if (!queue_empty)
					    return;
					  lock(hash_bucket(futex));
   if (uval == val)
	unlock(hash_bucket(futex));
	schedule();			  if (!queue_empty())
					    wake_waiters(futex);
					  unlock(hash_bucket(futex));

This does preserve the futex ordering guarantee, which ensures that a
waiter either observes the changed user space value before blocking or
is woken by a concurrent waker. See the related discussion at:

 http://lkml.kernel.org/r/alpine.DEB.2.02.1311231206380.30673@ionos.tec.linutronix.de

Thanks to Peter Zijlstra for noticing that we require a full mb()
instead of a wmb/rmb() pair.

Note that this change needs a real justification with numbers from
various workloads aside from the single big database workload which
inspired the HP folks to poke at this. This also wants to be verified on
!x86.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/futex.c |   97 +++++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 91 insertions(+), 6 deletions(-)

Index: linux-2.6/kernel/futex.c
===================================================================
--- linux-2.6.orig/kernel/futex.c
+++ linux-2.6/kernel/futex.c
@@ -114,10 +114,18 @@
  *   futex_wait(futex, val);
  *   lock(hash_bucket(futex));
  *   queue();
- *   uval = *futex;
- *					*futex = newval;
- *					sys_futex(WAKE, futex);
- *					  futex_wake(futex);
+ *
+ *   smp_mb(); <-- paired with --
+ *				|
+ *   uval = *futex;		|
+ *				|	*futex = newval;
+ *				|	sys_futex(WAKE, futex);
+ *				|	  futex_wake(futex);
+ *				|
+ *				------>	  smp_mb();
+ *
+ *					  if (!queue_empty)
+ *					    return;
  *					  lock(hash_bucket(futex));
  *   if (uval == val)
  *	unlock(hash_bucket(futex));
@@ -125,8 +133,32 @@
  *					    wake_waiters(futex);
  *					  unlock(hash_bucket(futex));
  *
- * The futex_lock_pi ordering is similar to that, but it has the queue
- * operation right before unlocking hash bucket lock and scheduling.
+ * The smp_mb() pair allows a quick check on the waker side whether
+ * the hash bucket queue is empty or not. All other operations are
+ * still proper serialized via the hash bucket lock.
+ *
+ * The futex_lock_pi functionality is similar to that, but it relies
+ * on the full spinlock serialization:
+ *
+ * CPU 0				CPU 1
+ * val = *futex;
+ * sys_futex(LOCK_PI, futex, val);
+ *   futex_lock_pi(futex, val);
+ *   lock(hash_bucket(futex));
+ *
+ *   atomic_cmpxchg(futex, val, uval);
+ *					*futex = newval;
+ *					sys_futex(UNLOCK_PI, futex);
+ *					  futex_unlock_pi(futex);
+ *
+ *					  lock(hash_bucket(futex));
+ *
+ *   if (checks(val, uval))
+ *	queue();
+ *	unlock(hash_bucket(futex));
+ *	schedule();			  if (!queue_empty())
+ *					    wake_waiters(futex);
+ *					  unlock(hash_bucket(futex));
  */
 
 int __read_mostly futex_cmpxchg_enabled;
@@ -1054,6 +1086,43 @@ futex_wake(u32 __user *uaddr, unsigned i
 		goto out;
 
 	hb = hash_futex(&key);
+
+	/*
+	 * The following smp_mb() pairs with the smp_mb() in
+	 * futex_wait_setup().
+	 *
+	 * It ensures the proper ordering of the plist operations with
+	 * the operations on *uaddr.
+	 *
+	 * Abstract problem description:
+	 *
+	 * W[x]		|	W[y]
+	 * mb		|	mb
+	 * R[y]		|	R[x]
+	 *
+	 * i.e.:
+	 *
+	 * Waiter	|	Waker
+	 *
+	 * plist_add()	|	*uaddr = newval
+	 * smp_mb()	|	smp_mb()
+	 * test(*uaddr)	|	plist_head_empty()
+	 *
+	 * So it is guaranteed that:
+	 *
+	 * The waiter observes the change to the uaddr value after it
+	 * added itself to the plist.
+	 *
+	 * The waker observes plist not empty if the change to uaddr
+	 * was made after the waiter checked the value.
+	 *
+	 * See also the comment about ordering guarantees at the top
+	 * of this file.
+	 */
+	smp_mb();
+	if (plist_head_empty(&hb->chain))
+		goto out_put_keys;
+
 	spin_lock(&hb->lock);
 
 	plist_for_each_entry_safe(this, next, &hb->chain, list) {
@@ -1074,6 +1143,7 @@ futex_wake(u32 __user *uaddr, unsigned i
 	}
 
 	spin_unlock(&hb->lock);
+out_put_keys:
 	put_futex_key(&key);
 out:
 	return ret;
@@ -1588,6 +1658,15 @@ unqueue_and_unlock(struct futex_q *q, st
 	__releases(&hb->lock)
 {
 	plist_del(&q->list, &hb->chain);
+	/*
+	 * Note, we do not need a mb() here. This is called in error
+	 * handling paths under the hb->lock. A potential waker which
+	 * sees the transient state that we are still enqueued is
+	 * going to take hb->lock and then notice that we are
+	 * gone. Not much we can do about that. The lockless check in
+	 * futex_wake() is optimized for the common case, not for the
+	 * error handling transients.
+	 */
 	spin_unlock(&hb->lock);
 }
 
@@ -1911,6 +1990,12 @@ retry_private:
 	 * We queue the futex before validating the user space value.
 	 */
 	queue_me(q, *hb);
+	/*
+	 * Pairs with the smp_mb() in futex_wake() to guarantee the
+	 * ordering between the queueing and the user space value
+	 * test. See detailed explanation of the barrier there.
+	 */
+	smp_mb();
 
 	ret = get_futex_value_locked(&uval, uaddr);
 




* Re: [RFC patch 4/5] futex: Enqueue waiter before user space check
  2013-11-25 20:58 ` [RFC patch 4/5] futex: Enqueue waiter before user space check Thomas Gleixner
@ 2013-11-26  0:20   ` Darren Hart
  0 siblings, 0 replies; 31+ messages in thread
From: Darren Hart @ 2013-11-26  0:20 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: LKML, Davidlohr Bueso, Jason Low, Ingo Molnar, Peter Zijlstra,
	Mike Galbraith, Jeff Mahoney, Linus Torvalds, Scott Norton,
	Tom Vaden, Aswin Chandramouleeswaran, Waiman Long,
	Paul E. McKenney

On Mon, 2013-11-25 at 20:58 +0000, Thomas Gleixner wrote:


> This changes the queue ordering on the waiter side from
> 
>      lock(hash_bucket);
>      validate user space value();
>      queue();
>      unlock(hash_bucket);
> 
> to
>      lock(hash_bucket);
>      queue();
>      validate user space value();
>      unlock(hash_bucket);
> 
> This is a preparatory patch for a lockless empty check of the hash
> bucket plist.
> 
> No functional implication. All futex operations are still serialized
> via the hash bucket lock.
> 
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> 

Reviewed-by: Darren Hart <dvhart@linux.intel.com>

...

> + *
> + * The futex_lock_pi ordering is similar to that, but it has the queue
> + * operation right before unlocking hash bucket lock and scheduling.
> 

There is also futex_wait_requeue_pi(), as you mentioned earlier.

I spent some time looking into this as I thought I had something in
place to prevent futex_wake from waking pending requeue_pi futex_qs.

futex_wait_requeue_pi() sets up a q.rt_waiter pointer which
distinguishes requeue_pi waiters from other waiters.

futex_wake() will abort if it detects the rt_waiter, returning EINVAL as
it is an invalid op pairing.
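
For reference, the relevant part of the futex_wake() loop is roughly the
following (trimmed from the code touched by patch 1/5):

	plist_for_each_entry_safe(this, next, &hb->chain, list) {
		if (match_futex(&this->key, &key)) {
			/* PI and requeue_pi waiters must not be woken here */
			if (this->pi_state || this->rt_waiter) {
				ret = -EINVAL;
				break;
			}
			wake_futex(this);
			if (++ret >= nr_wake)
				break;
		}
	}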

In this case, the spin_lock is adequate for futex_wait_requeue_pi()
because the wake wouldn't be performed regardless, so while the race
still exists, it isn't relevant. Worst case, the futex_wake returns 0,
indicating the list was empty and it didn't wake anything, rather than
-EINVAL, indicating the invalid op pairing. This is a delta from today's
behavior, but it is in a rare error path, and the result is no more
damaging than the existing behavior.


The approach appears sound to me. Now I'll get to looking at testing.

-- 
Darren Hart
Intel Open Source Technology Center
Yocto Project - Linux Kernel



* Re: [RFC patch 0/5] futex: Allow lockless empty check of hashbucket plist in futex_wake()
  2013-11-25 20:58 [RFC patch 0/5] futex: Allow lockless empty check of hashbucket plist in futex_wake() Thomas Gleixner
                   ` (4 preceding siblings ...)
  2013-11-25 20:58 ` [RFC patch 5/5] futex: Allow lockless empty check of hash bucket plist Thomas Gleixner
@ 2013-11-26  8:12 ` Davidlohr Bueso
  2013-11-26  8:52   ` Peter Zijlstra
  5 siblings, 1 reply; 31+ messages in thread
From: Davidlohr Bueso @ 2013-11-26  8:12 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: LKML, Jason Low, Ingo Molnar, Darren Hart, Peter Zijlstra,
	Mike Galbraith, Jeff Mahoney, Linus Torvalds, Scott Norton,
	Tom Vaden, Aswin Chandramouleeswaran, Waiman Long,
	Paul E. McKenney

On Mon, 2013-11-25 at 20:58 +0000, Thomas Gleixner wrote:
> The patch set from Davidlohr [1] attempted the same via an
> atomic counter of waiters in a hash bucket. The atomic counter access
> provided enough serialization for x86 so that a failure is not
> observable in testing, but does not provide any guarantees.
> 
> The same can be achieved with a smp_mb() pair including proper
> guarantees for all architectures.

I am becoming hesitant about this approach. The following are some
results, from my quad-core laptop, measuring the latency of nthread
wakeups (1 at a time). In addition, failed wait calls never occur -- so
we don't end up including the (otherwise minimal) overhead of the list
queue+dequeue, only measuring the smp_mb() usage when !empty list never
occurs.

+---------+--------------------+--------+-------------------+--------+----------+
| threads | baseline time (ms) | stddev | patched time (ms) | stddev | overhead |
+---------+--------------------+--------+-------------------+--------+----------+
|     512 | 4.2410             | 0.9762 | 12.3660           | 5.1020 | +191.58% |
|     256 | 2.7750             | 0.3997 | 7.0220            | 2.9436 | +153.04% |
|     128 | 1.4910             | 0.4188 | 3.7430            | 0.8223 | +151.03% |
|      64 | 0.8970             | 0.3455 | 2.5570            | 0.3710 | +185.06% |
|      32 | 0.3620             | 0.2242 | 1.1300            | 0.4716 | +212.15% |
+---------+--------------------+--------+-------------------+--------+----------+

While the variation is quite a bit in the patched version for higher
nthreads, the overhead is significant in all cases. Now, this is a very
specific program and far from what occurs in the real world, but I
believe it's good data to have to make a future decision about this kind
of approach.
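
For reference, the measurement boils down to something like the
following (a simplified standalone sketch, not the actual test program,
which I'll post separately): N threads block on a futex and the parent
wakes them one at a time while timing the wakeups.

/* build: gcc -O2 -pthread futex_wake_sketch.c */
#include <linux/futex.h>
#include <sys/syscall.h>
#include <pthread.h>
#include <unistd.h>
#include <stdio.h>
#include <time.h>

#define NTHREADS 512

static int futex_word;	/* stays 0, so FUTEX_WAIT always blocks */

static long sys_futex(int *uaddr, int op, int val)
{
	return syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0);
}

static void *waiter(void *arg)
{
	sys_futex(&futex_word, FUTEX_WAIT, 0);
	return NULL;
}

int main(void)
{
	pthread_t tid[NTHREADS];
	struct timespec t0, t1;
	int i, woken = 0;

	for (i = 0; i < NTHREADS; i++)
		pthread_create(&tid[i], NULL, waiter, NULL);
	sleep(1);	/* crude: give all waiters time to block */

	clock_gettime(CLOCK_MONOTONIC, &t0);
	for (i = 0; i < NTHREADS; i++)
		woken += sys_futex(&futex_word, FUTEX_WAKE, 1);
	clock_gettime(CLOCK_MONOTONIC, &t1);

	/* catch any waiter which queued up after the timed wakes */
	sys_futex(&futex_word, FUTEX_WAKE, NTHREADS);
	for (i = 0; i < NTHREADS; i++)
		pthread_join(tid[i], NULL);

	printf("Woke up %d of %d threads in %.4f ms\n", woken, NTHREADS,
	       (t1.tv_sec - t0.tv_sec) * 1e3 +
	       (t1.tv_nsec - t0.tv_nsec) / 1e6);
	return 0;
}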

Thanks,
Davidlohr



* Re: [RFC patch 0/5] futex: Allow lockless empty check of hashbucket plist in futex_wake()
  2013-11-26  8:12 ` [RFC patch 0/5] futex: Allow lockless empty check of hashbucket plist in futex_wake() Davidlohr Bueso
@ 2013-11-26  8:52   ` Peter Zijlstra
  2013-11-26 11:21     ` Ingo Molnar
  2013-11-26 19:25     ` Davidlohr Bueso
  0 siblings, 2 replies; 31+ messages in thread
From: Peter Zijlstra @ 2013-11-26  8:52 UTC (permalink / raw)
  To: Davidlohr Bueso
  Cc: Thomas Gleixner, LKML, Jason Low, Ingo Molnar, Darren Hart,
	Mike Galbraith, Jeff Mahoney, Linus Torvalds, Scott Norton,
	Tom Vaden, Aswin Chandramouleeswaran, Waiman Long,
	Paul E. McKenney

On Tue, Nov 26, 2013 at 12:12:31AM -0800, Davidlohr Bueso wrote:

> I am becoming hesitant about this approach. The following are some
> results, from my quad-core laptop, measuring the latency of nthread
> wakeups (1 at a time). In addition, failed wait calls never occur -- so
> we don't end up including the (otherwise minimal) overhead of the list
> queue+dequeue, only measuring the smp_mb() usage when !empty list never
> occurs.
> 
> +---------+--------------------+--------+-------------------+--------+----------+
> | threads | baseline time (ms) | stddev | patched time (ms) | stddev | overhead |
> +---------+--------------------+--------+-------------------+--------+----------+
> |     512 | 4.2410             | 0.9762 | 12.3660           | 5.1020 | +191.58% |
> |     256 | 2.7750             | 0.3997 | 7.0220            | 2.9436 | +153.04% |
> |     128 | 1.4910             | 0.4188 | 3.7430            | 0.8223 | +151.03% |
> |      64 | 0.8970             | 0.3455 | 2.5570            | 0.3710 | +185.06% |
> |      32 | 0.3620             | 0.2242 | 1.1300            | 0.4716 | +212.15% |
> +---------+--------------------+--------+-------------------+--------+----------+
> 

Whee, this is far more overhead than I would have expected... pretty
impressive really for a simple mfence ;-)


* Re: [RFC patch 0/5] futex: Allow lockless empty check of hashbucket plist in futex_wake()
  2013-11-26  8:52   ` Peter Zijlstra
@ 2013-11-26 11:21     ` Ingo Molnar
  2013-11-26 11:56       ` Peter Zijlstra
  2013-11-26 19:25     ` Davidlohr Bueso
  1 sibling, 1 reply; 31+ messages in thread
From: Ingo Molnar @ 2013-11-26 11:21 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Davidlohr Bueso, Thomas Gleixner, LKML, Jason Low, Darren Hart,
	Mike Galbraith, Jeff Mahoney, Linus Torvalds, Scott Norton,
	Tom Vaden, Aswin Chandramouleeswaran, Waiman Long,
	Paul E. McKenney


* Peter Zijlstra <peterz@infradead.org> wrote:

> On Tue, Nov 26, 2013 at 12:12:31AM -0800, Davidlohr Bueso wrote:
> 
> > I am becoming hesitant about this approach. The following are some
> > results, from my quad-core laptop, measuring the latency of nthread
> > wakeups (1 at a time). In addition, failed wait calls never occur -- so
> > we don't end up including the (otherwise minimal) overhead of the list
> > queue+dequeue, only measuring the smp_mb() usage when !empty list never
> > occurs.
> > 
> > +---------+--------------------+--------+-------------------+--------+----------+
> > | threads | baseline time (ms) | stddev | patched time (ms) | stddev | overhead |
> > +---------+--------------------+--------+-------------------+--------+----------+
> > |     512 | 4.2410             | 0.9762 | 12.3660           | 5.1020 | +191.58% |
> > |     256 | 2.7750             | 0.3997 | 7.0220            | 2.9436 | +153.04% |
> > |     128 | 1.4910             | 0.4188 | 3.7430            | 0.8223 | +151.03% |
> > |      64 | 0.8970             | 0.3455 | 2.5570            | 0.3710 | +185.06% |
> > |      32 | 0.3620             | 0.2242 | 1.1300            | 0.4716 | +212.15% |
> > +---------+--------------------+--------+-------------------+--------+----------+
> > 
> 
> Whee, this is far more overhead than I would have expected... pretty
> impressive really for a simple mfence ;-)

I'm somewhat reluctant to chalk it up to a single mfence - maybe 
timings/behavior changed in some substantial way?

Thanks,

	Ingo


* Re: [RFC patch 0/5] futex: Allow lockless empty check of hashbucket plist in futex_wake()
  2013-11-26 11:21     ` Ingo Molnar
@ 2013-11-26 11:56       ` Peter Zijlstra
  2013-11-26 12:34         ` Thomas Gleixner
  2013-11-26 14:49         ` Davidlohr Bueso
  0 siblings, 2 replies; 31+ messages in thread
From: Peter Zijlstra @ 2013-11-26 11:56 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Davidlohr Bueso, Thomas Gleixner, LKML, Jason Low, Darren Hart,
	Mike Galbraith, Jeff Mahoney, Linus Torvalds, Scott Norton,
	Tom Vaden, Aswin Chandramouleeswaran, Waiman Long,
	Paul E. McKenney

On Tue, Nov 26, 2013 at 12:21:40PM +0100, Ingo Molnar wrote:
> I'm somewhat reluctant to chalk it up to a single mfence - maybe 
> timings/behavior changed in some substantial way?

Ah indeed! We also changed the case where an enqueueing futex sees the
uval change and bails. It is now far more expensive due to having to
both queue and unqueue, whereas before it wouldn't queue at all.
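
Concretely, with patch 4/5 the (uval != val) bail-out in
futex_wait_setup() went from

	queue_unlock(*hb);		/* the waiter was never on the plist */

to

	unqueue_and_unlock(q, *hb);	/* plist_del() + unlock */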

I suppose the idea was to offset that by not requiring locking on the
wake side.


* Re: [RFC patch 0/5] futex: Allow lockless empty check of hashbucket plist in futex_wake()
  2013-11-26 11:56       ` Peter Zijlstra
@ 2013-11-26 12:34         ` Thomas Gleixner
  2013-11-26 15:38           ` Davidlohr Bueso
  2013-11-26 14:49         ` Davidlohr Bueso
  1 sibling, 1 reply; 31+ messages in thread
From: Thomas Gleixner @ 2013-11-26 12:34 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, Davidlohr Bueso, LKML, Jason Low, Darren Hart,
	Mike Galbraith, Jeff Mahoney, Linus Torvalds, Scott Norton,
	Tom Vaden, Aswin Chandramouleeswaran, Waiman Long,
	Paul E. McKenney

On Tue, 26 Nov 2013, Peter Zijlstra wrote:

> On Tue, Nov 26, 2013 at 12:21:40PM +0100, Ingo Molnar wrote:
> > I'm somewhat reluctant to chalk it up to a single mfence - maybe 
> > timings/behavior changed in some substantial way?
> 
> Ah indeed! We also changed the case where an enqueueing futex sees the
> uval change and bails. It is now far more expensive due to having to
> both queue and unqueue, whereas before it wouldn't queue at all.
> 
> I suppose the idea was to offset that by not requiring locking on the
> wake side.

Aside from that I really would be interested in an explanation for the
STDDEV going up by a factor of 5. That's a clear indicator of fishiness.

Thanks,

	tglx


* Re: [RFC patch 0/5] futex: Allow lockless empty check of hashbucket plist in futex_wake()
  2013-11-26 11:56       ` Peter Zijlstra
  2013-11-26 12:34         ` Thomas Gleixner
@ 2013-11-26 14:49         ` Davidlohr Bueso
  1 sibling, 0 replies; 31+ messages in thread
From: Davidlohr Bueso @ 2013-11-26 14:49 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, Thomas Gleixner, LKML, Jason Low, Darren Hart,
	Mike Galbraith, Jeff Mahoney, Linus Torvalds, Scott Norton,
	Tom Vaden, Aswin Chandramouleeswaran, Waiman Long,
	Paul E. McKenney

On Tue, 2013-11-26 at 12:56 +0100, Peter Zijlstra wrote:
> On Tue, Nov 26, 2013 at 12:21:40PM +0100, Ingo Molnar wrote:
> > I'm somewhat reluctant to chalk it up to a single mfence - maybe 
> > timings/behavior changed in some substantial way?

That would be nthread mfence calls.

> 
> Ah indeed! We also changed the case where an enqueueing futex sees the
> uval change and bails. It is now far more expensive due to having to
> both queue and unqueue, whereas before it wouldn't queue at all.

Right, but those particular numbers do not measure that path.

Thanks,
Davidlohr



* Re: [RFC patch 0/5] futex: Allow lockless empty check of hashbucket plist in futex_wake()
  2013-11-26 12:34         ` Thomas Gleixner
@ 2013-11-26 15:38           ` Davidlohr Bueso
  0 siblings, 0 replies; 31+ messages in thread
From: Davidlohr Bueso @ 2013-11-26 15:38 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Peter Zijlstra, Ingo Molnar, LKML, Jason Low, Darren Hart,
	Mike Galbraith, Jeff Mahoney, Linus Torvalds, Scott Norton,
	Tom Vaden, Aswin Chandramouleeswaran, Waiman Long,
	Paul E. McKenney

On Tue, 2013-11-26 at 13:34 +0100, Thomas Gleixner wrote:
> On Tue, 26 Nov 2013, Peter Zijlstra wrote:
> 
> > On Tue, Nov 26, 2013 at 12:21:40PM +0100, Ingo Molnar wrote:
> > > I'm somewhat reluctant to chalk it up to a single mfence - maybe 
> > > timings/behavior changed in some substantial way?
> > 
> > Ah indeed! We also changed the case where an enqueueing futex sees the
> > uval change and bails. It is now far more expensive due to having to
> > both queue and unqueue, whereas before it wouldn't queue at all.
> > 
> > I suppose the idea was to offset that by not requiring locking on the
> > wake side.
> 
> Aside of that I really would be interrested in an explanation for the
> STDDEV going up by factor 5. That's a clear indicator for fishyness.

FWIW here's another run for the patched kernel; stddev went down to ~3,
yet the overhead is quite similar to the original results:

Run summary [PID 17678]: blocking on 512 threads (at futex 0x60314c), waking up 1 at a time.

[Run 1]: Wokeup 512 of 512 threads (100.00%) in 14.0360 ms
[Run 2]: Wokeup 512 of 512 threads (100.00%) in 15.6660 ms
[Run 3]: Wokeup 512 of 512 threads (100.00%) in 13.9330 ms
[Run 4]: Wokeup 512 of 512 threads (100.00%) in 18.5790 ms
[Run 5]: Wokeup 512 of 512 threads (100.00%) in 14.1480 ms
[Run 6]: Wokeup 512 of 512 threads (100.00%) in 14.5930 ms
[Run 7]: Wokeup 512 of 512 threads (100.00%) in 13.4360 ms
[Run 8]: Wokeup 512 of 512 threads (100.00%) in 10.0730 ms
[Run 9]: Wokeup 512 of 512 threads (100.00%) in 16.9040 ms
[Run 10]: Wokeup 512 of 512 threads (100.00%) in 19.3800 ms

Wokeup 512 of 512 threads (100.00%) in 15.0700 ms

Thanks,
Davidlohr



* Re: [RFC patch 0/5] futex: Allow lockless empty check of hashbucket plist in futex_wake()
  2013-11-26  8:52   ` Peter Zijlstra
  2013-11-26 11:21     ` Ingo Molnar
@ 2013-11-26 19:25     ` Davidlohr Bueso
  2013-11-26 20:51       ` Davidlohr Bueso
  1 sibling, 1 reply; 31+ messages in thread
From: Davidlohr Bueso @ 2013-11-26 19:25 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Thomas Gleixner, LKML, Jason Low, Ingo Molnar, Darren Hart,
	Mike Galbraith, Jeff Mahoney, Linus Torvalds, Scott Norton,
	Tom Vaden, Aswin Chandramouleeswaran, Waiman Long,
	Paul E. McKenney

On Tue, 2013-11-26 at 09:52 +0100, Peter Zijlstra wrote:
> On Tue, Nov 26, 2013 at 12:12:31AM -0800, Davidlohr Bueso wrote:
> 
> > I am becoming hesitant about this approach. The following are some
> > results, from my quad-core laptop, measuring the latency of nthread
> > wakeups (1 at a time). In addition, failed wait calls never occur -- so
> > we don't end up including the (otherwise minimal) overhead of the list
> > queue+dequeue, only measuring the smp_mb() usage when !empty list never
> > occurs.
> > 
> > +---------+--------------------+--------+-------------------+--------+----------+
> > | threads | baseline time (ms) | stddev | patched time (ms) | stddev | overhead |
> > +---------+--------------------+--------+-------------------+--------+----------+
> > |     512 | 4.2410             | 0.9762 | 12.3660           | 5.1020 | +191.58% |
> > |     256 | 2.7750             | 0.3997 | 7.0220            | 2.9436 | +153.04% |
> > |     128 | 1.4910             | 0.4188 | 3.7430            | 0.8223 | +151.03% |
> > |      64 | 0.8970             | 0.3455 | 2.5570            | 0.3710 | +185.06% |
> > |      32 | 0.3620             | 0.2242 | 1.1300            | 0.4716 | +212.15% |
> > +---------+--------------------+--------+-------------------+--------+----------+
> > 
> 
> Whee, this is far more overhead than I would have expected... pretty
> impressive really for a simple mfence ;-)

*sigh* I just realized I had some extra debugging options in the .config
I ran for the patched kernel. This probably explains why the huge
overhead. I'll rerun and report shortly.



* Re: [RFC patch 0/5] futex: Allow lockless empty check of hashbucket plist in futex_wake()
  2013-11-26 19:25     ` Davidlohr Bueso
@ 2013-11-26 20:51       ` Davidlohr Bueso
  2013-11-26 23:56         ` Thomas Gleixner
  0 siblings, 1 reply; 31+ messages in thread
From: Davidlohr Bueso @ 2013-11-26 20:51 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Thomas Gleixner, LKML, Jason Low, Ingo Molnar, Darren Hart,
	Mike Galbraith, Jeff Mahoney, Linus Torvalds, Scott Norton,
	Tom Vaden, Aswin Chandramouleeswaran, Waiman Long,
	Paul E. McKenney

On Tue, 2013-11-26 at 11:25 -0800, Davidlohr Bueso wrote:
> On Tue, 2013-11-26 at 09:52 +0100, Peter Zijlstra wrote:
> > On Tue, Nov 26, 2013 at 12:12:31AM -0800, Davidlohr Bueso wrote:
> > 
> > > I am becoming hesitant about this approach. The following are some
> > > results, from my quad-core laptop, measuring the latency of nthread
> > > wakeups (1 at a time). In addition, failed wait calls never occur -- so
> > > we don't end up including the (otherwise minimal) overhead of the list
> > > queue+dequeue, only measuring the smp_mb() usage when !empty list never
> > > occurs.
> > > 
> > > +---------+--------------------+--------+-------------------+--------+----------+
> > > | threads | baseline time (ms) | stddev | patched time (ms) | stddev | overhead |
> > > +---------+--------------------+--------+-------------------+--------+----------+
> > > |     512 | 4.2410             | 0.9762 | 12.3660           | 5.1020 | +191.58% |
> > > |     256 | 2.7750             | 0.3997 | 7.0220            | 2.9436 | +153.04% |
> > > |     128 | 1.4910             | 0.4188 | 3.7430            | 0.8223 | +151.03% |
> > > |      64 | 0.8970             | 0.3455 | 2.5570            | 0.3710 | +185.06% |
> > > |      32 | 0.3620             | 0.2242 | 1.1300            | 0.4716 | +212.15% |
> > > +---------+--------------------+--------+-------------------+--------+----------+
> > > 
> > 
> > Whee, this is far more overhead than I would have expected... pretty
> > impressive really for a simple mfence ;-)
> 
> *sigh* I just realized I had some extra debugging options in the .config
> I ran for the patched kernel. This probably explains why the huge
> overhead. I'll rerun and report shortly.

I'm very sorry about the false alarm -- after midnight my brain starts
to melt. After re-running everything on my laptop (yes, with the
correct .config file), I can see that the differences are rather minimal
and variation also goes down, as expected. I've also included the
results for the original atomic ops approach, which mostly measures the
atomic_dec when we dequeue the woken task. Results are in the noise
range and virtually the same for both approaches (at least on a smaller
x86_64 system).

+---------+-----------------------------+----------------------------+------------------------------+
| threads | baseline time (ms) [stddev] | barrier time (ms) [stddev] | atomicops time (ms) [stddev] |
+---------+-----------------------------+----------------------------+------------------------------+
|     512 | 2.8360 [0.5168]             | 4.4100 [1.1150]            | 3.8150 [1.3293]              |
|     256 | 2.5080 [0.6375]             | 2.3070 [0.5112]            | 2.5980 [0.9079]              |
|     128 | 1.0200 [0.4264]             | 1.3980 [0.3391]            | 1.5180 [0.4902]              |
|      64 | 0.7890 [0.2667]             | 0.6970 [0.3374]            | 0.4020 [0.2447]              |
|      32 | 0.1150 [0.0184]             | 0.1870 [0.1428]            | 0.1490 [0.1156]              |
+---------+-----------------------------+----------------------------+------------------------------+

FYI I've uploaded the test program:
https://github.com/davidlohr/futex-stress/blob/master/futex_wake.c

I will now start running bigger, more realistic, workloads like the ones
described in the original patchset to get the big picture.

Thanks,
Davidlohr



* Re: [RFC patch 0/5] futex: Allow lockless empty check of hashbucket plist in futex_wake()
  2013-11-26 20:51       ` Davidlohr Bueso
@ 2013-11-26 23:56         ` Thomas Gleixner
  2013-11-28  7:44           ` Davidlohr Bueso
  0 siblings, 1 reply; 31+ messages in thread
From: Thomas Gleixner @ 2013-11-26 23:56 UTC (permalink / raw)
  To: Davidlohr Bueso
  Cc: Peter Zijlstra, LKML, Jason Low, Ingo Molnar, Darren Hart,
	Mike Galbraith, Jeff Mahoney, Linus Torvalds, Scott Norton,
	Tom Vaden, Aswin Chandramouleeswaran, Waiman Long,
	Paul E. McKenney

On Tue, 26 Nov 2013, Davidlohr Bueso wrote:
> On Tue, 2013-11-26 at 11:25 -0800, Davidlohr Bueso wrote:
> > *sigh* I just realized I had some extra debugging options in the .config
> > I ran for the patched kernel. This probably explains why the huge
> > overhead. I'll rerun and report shortly.

So you pulled a FUTEX, i.e. F*d Up That EXperiment :)

> I'm very sorry about the false alarm -- after midnight my brain starts
> to melt. After re-running everything on my laptop (yes, with the
> correct .config file), I can see that the differences are rather minimal
> and variation also goes down, as expected. I've also included the
> results for the original atomic ops approach, which mostly measures the
> atomic_dec when we dequeue the woken task. Results are in the noise
> range and virtually the same for both approaches (at least on a smaller
> x86_64 system).
>
> +---------+-----------------------------+----------------------------+------------------------------+
> | threads | baseline time (ms) [stddev] | barrier time (ms) [stddev] | atomicops time (ms) [stddev] |
> +---------+-----------------------------+----------------------------+------------------------------+
> |     512 | 2.8360 [0.5168]             | 4.4100 [1.1150]            | 3.8150 [1.3293]              |
> |     256 | 2.5080 [0.6375]             | 2.3070 [0.5112]            | 2.5980 [0.9079]              |
> |     128 | 1.0200 [0.4264]             | 1.3980 [0.3391]            | 1.5180 [0.4902]              |
> |      64 | 0.7890 [0.2667]             | 0.6970 [0.3374]            | 0.4020 [0.2447]              |
> |      32 | 0.1150 [0.0184]             | 0.1870 [0.1428]            | 0.1490 [0.1156]              |
> +---------+-----------------------------+----------------------------+------------------------------+

That probably wants more than 10 repeated runs to converge into stable
numbers. Thanks for providing the atomicops comparison! That's very
helpful.

It would be interesting to measure the overhead on the waiter side as
well for both approaches (mb and atomic_inc), but I'm sure that at
least for x86 it's going to be in the same ballpark.

So I discovered earlier today that your atomic ops variant is working
because the atomic_inc() in get_futex_key_refs() is accidentally
providing the required memory barrier on the waker side (on x86 and
all other architectures which have an implicit mb in atomic_inc()).

For !fshared ones it's even a full mb on all architectures today, see
ihold().

Aside from that, get_user_pages_fast() uses atomic ops as well; not
sure if it's a full mb on all architectures today, but it is on x86
and others.

Now I'm tempted to turn this accidental behaviour into a documented one.
The basic requirement for the fast lockless check of the plist is:

    record_waiter(hb)   |      *uaddr = newval
    mb                  |      mb
    *uaddr == oldval ?  |      nr_waiters(hb) != 0?

So on the waker side we can conditionally (dependent on the arch
implementation) rely on the mb in get_futex_key_refs(). See below.

Now it does not matter much in terms of barrier related overhead
whether the record_waiter() is implemented via atomic_inc() or via the
early enqueue into the plist + smp_mb. In both cases we have a full
smp_mb(), whether implicit or explicit.

And in terms of memory/cache footprint it's probably not relevant either
whether we add the counter or not. Assuming we have no debug options
enabled, the resulting size of futex_hash_bucket will be:

 16 bytes on 32bit (12 bytes today)

 24 bytes on 64bit (24 bytes today)

In fact for 32bit, despite the slightly larger memory footprint, the
cache line efficiency is actually better than it is now, as we avoid hash
buckets crossing cache line boundaries.

No change on the 64 bit side.

 As a side note, it might be worthwhile to explore whether forcing the
hash buckets to be cache line aligned gives us an advantage over
blindly increasing the hash size:

 We could avoid access across cacheline boundaries and also avoid
 massive cache line bouncing if multiple cpus are hammering away at
 different hash buckets which happen to reside in the same cache line.
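
 For illustration, combining the waiters counter with forced cache line
 alignment would look something like this (sketch only, not part of this
 series):

	struct futex_hash_bucket {
		atomic_t waiters;	/* lockless "any waiters?" hint */
		spinlock_t lock;
		struct plist_head chain;
	} ____cacheline_aligned_in_smp;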

But back to the problem at hand.

I'm leaning towards your initial atomic_inc() approach for the
following reasons:

1) It avoids the queue/unqueue dance in the error and (*uaddr != uval)
   case. The latter is the one you are optimizing for.

   We dirty the cache line in any case on the waiter side.

   With the optimized check, we avoid dirtying the cache line on the
   waker side in the case that there is no waiter in flight or
   enqueued.

2) It's very simple to make it OPT-IN. That allows architectures to
   trade the mb/memory overhead with the spinlock contention overhead.

   A lot of [embedded] architectures do not care at all about the
   futex performance simply because they do not run futex sensitive
   workloads. And others might want to avoid the heavy smp_mb() for
   obvious reasons.

   Make that a CONFIG switch which can be selected by the user or by a
   select statement in the arch. That also allows archs to determine
   the costs of that optimization just by recompiling with a different
   .config.

   All it needs are conditional implementations for futex_get_mm(),
   hb_waiters_inc(x), hb_waiters_dec() and hb_waiters_pending()

   futex_get_mm(x)
   {
      atomic_inc(x); 
      #ifdef CONFIG_FUTEX_SOMESENSIBLE_OPTION
        /*
	 * Reduced to a simple barrier() where the atomic_inc
	 * has an implicit mb()
	 */
        smp_mb__after_atomic_inc();
      #endif
   }

   futex_get_mm() must be used for incrementing the refcount of
   &key->private.mm->mm_count in get_futex_key_refs().

   hb_waiters_inc(hb)
   {
      #ifdef CONFIG_FUTEX_SOMESENSIBLE_OPTION
      atomic_inc(&hb->waiters); 
        /*
	 * Reduced to a simple barrier() where the atomic_inc
	 * has an implicit mb()
	 */
        smp_mb__after_atomic_inc();
      #endif
   }

   hb_waiters_inc() must be used in futex_wait()

   hb_waiters_dec(hb)
   {
      #ifdef CONFIG_FUTEX_SOMESENSIBLE_OPTION
      atomic_dec(&hb->waiters); 
        /*
	 * Reduced to a simple barrier() where the atomic_dec
	 * has an implicit mb().
	 *
	 * For the other archs it's debatable whether this has
	 * a hard requirement to be guarded. The optimized
	 * hb_waiters_pending() check for pending wakers might
	 * fail in rare cases, but just for the cost of a
	 * spinlock/unlock. The consistency of hb->waiters itself
	 * is always guaranteed, i.e. it can't go below 0.
	 */
        smp_mb__after_atomic_dec();
      #endif
   }

   hb_waiters_dec() must be used for dequeueing in all cases which
   are counterparts to a queueing futex_wait().

   hb_waiters_pending(x)
   {
      #ifdef CONFIG_FUTEX_SOMESENSIBLE_OPTION
          return atomic_read(x);
      #else
	  return 1;
      #endif
   }

   So the compiler can optimize the quick check in futex_wait() out:

   	   if (!hb_waiters_pending(&hb->waiters))
	      goto out_put_keys;


Of course that wants to be documented very carefully in the code, so
we can avoid the brain melting exercise of the futex/memory ordering
combo next time.

Thanks,

      tglx, who is about to apply a full memory barrier to himself in
      	    order to cure all this FU[BAR]TEX induced brain damage.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC patch 0/5] futex: Allow lockless empty check of hashbucket plist in futex_wake()
  2013-11-26 23:56         ` Thomas Gleixner
@ 2013-11-28  7:44           ` Davidlohr Bueso
  2013-11-28 11:58             ` Thomas Gleixner
  2013-11-28 11:59             ` Peter Zijlstra
  0 siblings, 2 replies; 31+ messages in thread
From: Davidlohr Bueso @ 2013-11-28  7:44 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Peter Zijlstra, LKML, Jason Low, Ingo Molnar, Darren Hart,
	Mike Galbraith, Jeff Mahoney, Linus Torvalds, Scott Norton,
	Tom Vaden, Aswin Chandramouleeswaran, Waiman Long,
	Paul E. McKenney

On Wed, 2013-11-27 at 00:56 +0100, Thomas Gleixner wrote:
> On Tue, 26 Nov 2013, Davidlohr Bueso wrote:
> > On Tue, 2013-11-26 at 11:25 -0800, Davidlohr Bueso wrote:
> > > *sigh* I just realized I had some extra debugging options in the .config
> > > I ran for the patched kernel. This probably explains the huge
> > > overhead. I'll rerun and report shortly.
> 
> So you pulled a FUTEX, i.e. F*d Up That EXperiment :)

hehe

> 
> > I'm very sorry about the false alarm -- after midnight my brain starts
> > to melt. After re-running everything on my laptop (yes, with the
> > correct .config file), I can see that the differences are rather minimal
> > and variation also goes down, as expected. I've also included the
> > results for the original atomic ops approach, which mostly measures the
> > atomic_dec when we dequeue the woken task. Results are in the noise
> > range and virtually the same for both approaches (at least on a smaller
> > x86_64 system).
> >
> > +---------+-----------------------------+----------------------------+------------------------------+
> > | threads | baseline time (ms) [stddev] | barrier time (ms) [stddev] | atomicops time (ms) [stddev] |
> > +---------+-----------------------------+----------------------------+------------------------------+
> > |     512 | 2.8360 [0.5168]             | 4.4100 [1.1150]            | 3.8150 [1.3293]              |
> > |     256 | 2.5080 [0.6375]             | 2.3070 [0.5112]            | 2.5980 [0.9079]              |
> > |     128 | 1.0200 [0.4264]             | 1.3980 [0.3391]            | 1.5180 [0.4902]              |
> > |      64 | 0.7890 [0.2667]             | 0.6970 [0.3374]            | 0.4020 [0.2447]              |
> > |      32 | 0.1150 [0.0184]             | 0.1870 [0.1428]            | 0.1490 [0.1156]              |
> > +---------+-----------------------------+----------------------------+------------------------------+
> 
> That probably wants more than 10 repeated runs to converge into stable
> numbers. Thanks for providing the atomicops comparison! That's very
> helpful.
> 

Sorry about the delay, I've been airborne all day.

Here are the results for 1000 runs. The numbers stabilize nicely as you
add more samples. I think we can conclude that there really isn't much
of an impact in either case.

+---------+-----------------------------+----------------------------+------------------------------+
| threads | baseline time (ms) [stddev] | barrier time (ms) [stddev] | atomicops time (ms) [stddev] |
+---------+-----------------------------+----------------------------+------------------------------+
|     512 | 3.0959 [0.5293]             | 3.8402 [0.4282]            | 3.4274 [0.4418]              |
|     256 | 1.0973 [0.4023]             | 1.1162 [0.4167]            | 1.0768 [0.4130]              |
|     128 | 0.5062 [0.2110]             | 0.5221 [0.1867]            | 0.4249 [0.1922]              |
|      64 | 0.3146 [0.1312]             | 0.2580 [0.1302]            | 0.2555 [0.1266]              |
|      32 | 0.1448 [0.1022]             | 0.1568 [0.0838]            | 0.1478 [0.0788]              |
+---------+-----------------------------+----------------------------+------------------------------+

> It would be interesting to measure the overhead on the waiter side as
> well for both approaches (mb and atomic_inc), but I'm sure that at
> least for x86 it's going to be in the same ballpark.

Yeah, I don't expect much difference either, but will do the experiment.

> So I discovered earlier today that your atomic ops variant is working
> because the atomic_inc() in get_futex_key_refs() is accidentally
> providing the required memory barrier on the waker side (on x86 and
> all other architectures which have an implicit mb in atomic_inc()).
> 
> For !fshared ones it's even a full mb on all architectures today, see
> ihold().

Interesting.

> Aside of that get_user_pages_fast() is using atomic ops as well, not
> sure if it's a full mb on all architectures today, but it is on x86
> and others.
> 
> Now I'm tempted to turn this accidental behaviour into a documented one.
> The basic requirement for the fast lockless check of the plist is:
> 
>     record_waiter(hb)   |      *uaddr = newval
>     mb                  |      mb
>     *uaddr == oldval ?  |      nr_waiters(hb) != 0?
> 
> So on the waker side we can conditionally (dependent on the arch
> implementation) rely on the mb in get_futex_key_refs(). See below.
> 
> Now it does not matter much in terms of barrier related overhead
> whether the record_waiter() is implemented via atomic_inc() or via the
> early enqueue into the plist + smp_mb. In both cases we have a full
> smp_mb(), whether implicit or explicit.
> 
> And versus memory/cache footprint it's probably not relevant either
> whether we add the counter or not. Assumed we have no debug options
> enabled then the resulting size of futex_hash_bucket will be:
> 
>  16 bytes on 32bit (12 bytes today)
> 
>  24 bytes on 64bit (24 bytes today)
> 
> In fact for 32bit, despite the slightly larger memory footprint, the
> cache line efficiency is actually better than now as we avoid hash
> buckets crossing cache line boundaries.
> 
> No change on the 64 bit side.

Right, I should have mentioned this in the original changelog.

> 
> As a side note, it might be worthwhile to explore whether forcing the
> hash buckets to be cache line aligned gives us an advantage over
> blindly increasing the hash size:
> 
>  We could avoid access across cacheline boundaries and also avoid
>  massive cache line bouncing if multiple cpus are hammering away at
>  different hash buckets which happen to reside in the same cache line.

How about both enlarging the table _and_ aligning the buckets? As you
know, increasing the size of the table also benefits (particularly in
larger systems) in having more spinlocks. So we reduce the amount of
collisions and alleviate contention on the hb->lock. Btw, do you have
any particular concerns about the larger hash table patch? 

> But back to the problem at hand.
> 
> I'm leaning towards your initial atomic_inc() approach for the
> following reasons:
> 
> 1) It avoids the queue/unqueue dance in the error and (*uaddr != uval)
>    case. The latter is the one you are optimizing for.
> 
>    We dirty the cache line in any case on the waiter side.
> 
>    With the optimized check, we avoid dirtying the cache line on the
>    waker side in the case that there is no waiter in flight or
>    enqueued.
> 
> 2) It's very simple to make it OPT-IN. That allows architectures to
>    trade the mb/memory overhead with the spinlock contention overhead.
> 
>    A lot of [embedded] architectures do not care at all about the
>    futex performance simply because they do not run futex sensitive
>    workloads. And others might want to avoid the heavy smp_mb() for
>    obvious reasons.
> 
>    Make that a CONFIG switch which can be selected by the user or by a
>    select statement in the arch. That also allows archs to determine
>    the costs of that optimization just by recompiling with a different
>    .config.

Would it be useful to consider NR_CPUS? Sure, you get the problem of
when it's too little or too big, but it would work quite well for
smaller systems - perhaps it could at least be mentioned in the option
description/help. I don't think users should have a direct say in the
option, though.

> 
>    All it needs are conditional implementations for futex_get_mm(),
>    hb_waiters_inc(x), hb_waiters_dec() and hb_waiters_pending()

I like this abstraction.

> 
>    futex_get_mm(x)
>    {
>       atomic_inc(x); 
>       #ifdef CONFIG_FUTEX_SOMESENSIBLE_OPTION
>         /*
> 	 * Reduced to a simple barrier() where the atomic_inc
> 	 * has an implicit mb()
> 	 */
>         smp_mb__after_atomic_inc();
>       #endif
>    }
> 
>    futex_get_mm() must be used for incrementing the refcount of
>    &key->private.mm->mm_count in get_futex_key_refs().
> 
>    hb_waiters_inc(hb)
>    {
>       #ifdef CONFIG_FUTEX_SOMESENSIBLE_OPTION
>       atomic_inc(&hb->waiters); 
>         /*
> 	 * Reduced to a simple barrier() where the atomic_inc
> 	 * has an implicit mb()
> 	 */
>         smp_mb__after_atomic_inc();
>       #endif
>    }
> 
>    hb_waiters_inc() must be used in futex_wait()
> 
>    hb_waiters_dec(hb)
>    {
>       #ifdef CONFIG_FUTEX_SOMESENSIBLE_OPTION
>       atomic_dec(&hb->waiters); 
>         /*
> 	 * Reduced to a simple barrier() where the atomic_dec
> 	 * has an implicit mb().
> 	 *
> 	 * For the other archs it's debatable whether this has
> 	 * a hard requirement to be guarded. The optimized
> 	 * hb_waiters_pending() check for pending wakers might
> 	 * fail in rare cases, but just for the cost of a
> 	 * spinlock/unlock. The consistency of hb->waiters itself
> 	 * is always guaranteed, i.e. it can't go below 0.
> 	 */
>         smp_mb__after_atomic_dec();
>       #endif
>    }
> 
>    hb_waiters_dec() must be used for dequeueing in all cases which
>    are counterparts to a queueing futex_wait().
> 
>    hb_waiters_pending(x)
>    {
>       #ifdef CONFIG_FUTEX_SOMESENSIBLE_OPTION
>           return atomic_read(x);
>       #else
> 	  return 1;
>       #endif
>    }
> 
>    So the compiler can optimize the quick check in futex_wait() out:
> 

You mean futex_wake().

>    	   if (!hb_waiters_pending(&hb->waiters))
> 	      goto out_put_keys;
> 
> 
> Of course that wants to be documented very carefully in the code, so
> we can avoid the brain melting exercise of the futex/memory ordering
> combo next time.

Thanks for all the careful analysis and feedback!
Davidlohr


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC patch 0/5] futex: Allow lockless empty check of hashbucket plist in futex_wake()
  2013-11-28  7:44           ` Davidlohr Bueso
@ 2013-11-28 11:58             ` Thomas Gleixner
  2013-11-28 11:59             ` Peter Zijlstra
  1 sibling, 0 replies; 31+ messages in thread
From: Thomas Gleixner @ 2013-11-28 11:58 UTC (permalink / raw)
  To: Davidlohr Bueso
  Cc: Peter Zijlstra, LKML, Jason Low, Ingo Molnar, Darren Hart,
	Mike Galbraith, Jeff Mahoney, Linus Torvalds, Scott Norton,
	Tom Vaden, Aswin Chandramouleeswaran, Waiman Long,
	Paul E. McKenney

On Wed, 27 Nov 2013, Davidlohr Bueso wrote:
> On Wed, 2013-11-27 at 00:56 +0100, Thomas Gleixner wrote:
> > As a side note, it might be worthwhile to explore whether forcing the
> > hash buckets to be cache line aligned gives us an advantage over
> > blindly increasing the hash size:
> > 
> >  We could avoid access across cacheline boundaries and also avoid
> >  massive cache line bouncing if multiple cpus are hammering away at
> >  different hash buckets which happen to reside in the same cache line.
> 
> How about both enlarging the table _and_ aligning the buckets? As you
> know, increasing the size of the table also benefits (particularly in
> larger systems) in having more spinlocks. So we reduce the amount of
> collisions and alleviate contention on the hb->lock. Btw, do you have
> any particular concerns about the larger hash table patch? 

I'm not against increasing the hash table size, but I'm very
interested in investigating the alignment effect as a separate issue.

> > But back to the problem at hand.
> > 
> > I'm leaning towards your initial atomic_inc() approach for the
> > following reasons:
> > 
> > 1) It avoids the queue/unqueue dance in the error and (*uaddr != uval)
> >    case. The latter is the one you are optimizing for.
> > 
> >    We dirty the cache line in any case on the waiter side.
> > 
> >    With the optimized check, we avoid dirtying the cache line on the
> >    waker side in the case that there is no waiter in flight or
> >    enqueued.
> > 
> > 2) It's very simple to make it OPT-IN. That allows architectures to
> >    trade the mb/memory overhead with the spinlock contention overhead.
> > 
> >    A lot of [embedded] architectures do not care at all about the
> >    futex performance simply because they do not run futex sensitive
> >    workloads. And others might want to avoid the heavy smp_mb() for
> >    obvious reasons.
> > 
> >    Make that a CONFIG switch which can be selected by the user or by a
> >    select statement in the arch. That also allows archs to determine
> >    the costs of that optimization just by recompiling with a different
> >    .config.
> 
> Would it be useful to consider NR_CPUS? Sure, you get the problem of
> when it's too little or too big, but it would work quite well for
> smaller systems - perhaps it could at least be mentioned in the option
> description/help. I don't think users should have a direct say in the
> option, though.

Right, it does not need to be user selectable. And thinking more about
it, it shouldn't be an issue for any architecture. It's pointless for
UP though. 

What might be interesting in terms of NR_CPUS is the hash table
size. That might want to scale with the system size.

Thanks,

	tglx


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC patch 0/5] futex: Allow lockless empty check of hashbucket plist in futex_wake()
  2013-11-28  7:44           ` Davidlohr Bueso
  2013-11-28 11:58             ` Thomas Gleixner
@ 2013-11-28 11:59             ` Peter Zijlstra
  2013-11-28 14:23               ` Thomas Gleixner
                                 ` (2 more replies)
  1 sibling, 3 replies; 31+ messages in thread
From: Peter Zijlstra @ 2013-11-28 11:59 UTC (permalink / raw)
  To: Davidlohr Bueso
  Cc: Thomas Gleixner, LKML, Jason Low, Ingo Molnar, Darren Hart,
	Mike Galbraith, Jeff Mahoney, Linus Torvalds, Scott Norton,
	Tom Vaden, Aswin Chandramouleeswaran, Waiman Long,
	Paul E. McKenney

On Wed, Nov 27, 2013 at 11:44:38PM -0800, Davidlohr Bueso wrote:
> How about both enlarging the table _and_ aligning the buckets? As you
> know, increasing the size of the table also benefits (particularly in
> larger systems) in having more spinlocks. So we reduce the amount of
> collisions and alleviate contention on the hb->lock. Btw, do you have
> any particular concerns about the larger hash table patch? 

My only concern was the amount of #ifdef.

Wouldn't something like the below also work?


---
 kernel/futex.c | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/kernel/futex.c b/kernel/futex.c
index 80ba086f021d..2e1a9294f751 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -70,8 +70,6 @@
 
 int __read_mostly futex_cmpxchg_enabled;
 
-#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
-
 /*
  * Futex flags used to encode options to functions and preserve them across
  * restarts.
@@ -149,9 +147,11 @@ static const struct futex_q futex_q_init = {
 struct futex_hash_bucket {
 	spinlock_t lock;
 	struct plist_head chain;
-};
+} ____cacheline_aligned_in_smp;
 
-static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
+static unsigned long __read_mostly futex_hashsize;
+
+static struct futex_hash_bucket *futex_queues;
 
 /*
  * We hash on the keys returned from get_futex_key (see below).
@@ -161,7 +161,7 @@ static struct futex_hash_bucket *hash_futex(union futex_key *key)
 	u32 hash = jhash2((u32*)&key->both.word,
 			  (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
 			  key->both.offset);
-	return &futex_queues[hash & ((1 << FUTEX_HASHBITS)-1)];
+	return &futex_queues[hash & (futex_hashsize - 1)];
 }
 
 /*
@@ -2735,6 +2735,16 @@ static int __init futex_init(void)
 	u32 curval;
 	int i;
 
+#if CONFIG_BASE_SMALL
+	futex_hashsize = 16;
+#else
+	futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
+#endif
+
+	futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
+			futex_hashsize, 0, futex_hashsize < 256 ? HASH_SMALL : 0,
+			NULL, NULL, futex_hashsize, futex_hashsize);
+
 	/*
 	 * This will fail and we want it. Some arch implementations do
 	 * runtime detection of the futex_atomic_cmpxchg_inatomic()

^ permalink raw reply related	[flat|nested] 31+ messages in thread

* Re: [RFC patch 0/5] futex: Allow lockless empty check of hashbucket plist in futex_wake()
  2013-11-28 11:59             ` Peter Zijlstra
@ 2013-11-28 14:23               ` Thomas Gleixner
  2013-12-01  4:37               ` Davidlohr Bueso
  2013-12-01 12:10               ` Ingo Molnar
  2 siblings, 0 replies; 31+ messages in thread
From: Thomas Gleixner @ 2013-11-28 14:23 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Davidlohr Bueso, LKML, Jason Low, Ingo Molnar, Darren Hart,
	Mike Galbraith, Jeff Mahoney, Linus Torvalds, Scott Norton,
	Tom Vaden, Aswin Chandramouleeswaran, Waiman Long,
	Paul E. McKenney

On Thu, 28 Nov 2013, Peter Zijlstra wrote:

> On Wed, Nov 27, 2013 at 11:44:38PM -0800, Davidlohr Bueso wrote:
> > How about both enlarging the table _and_ aligning the buckets? As you
> > know, increasing the size of the table also benefits (particularly in
> > larger systems) in having more spinlocks. So we reduce the amount of
> > collisions and alleviate contention on the hb->lock. Btw, do you have
> > any particular concerns about the larger hash table patch? 
> 
> My only concern was the amount of #ifdef.
> 
> Wouldn't something like the below also work?

Definitely.


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC patch 0/5] futex: Allow lockless empty check of hashbucket plist in futex_wake()
  2013-11-28 11:59             ` Peter Zijlstra
  2013-11-28 14:23               ` Thomas Gleixner
@ 2013-12-01  4:37               ` Davidlohr Bueso
  2013-12-02 11:01                 ` Thomas Gleixner
  2013-12-01 12:10               ` Ingo Molnar
  2 siblings, 1 reply; 31+ messages in thread
From: Davidlohr Bueso @ 2013-12-01  4:37 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Thomas Gleixner, LKML, Jason Low, Ingo Molnar, Darren Hart,
	Mike Galbraith, Jeff Mahoney, Linus Torvalds, Scott Norton,
	Tom Vaden, Aswin Chandramouleeswaran, Waiman Long,
	Paul E. McKenney

On Thu, 2013-11-28 at 12:59 +0100, Peter Zijlstra wrote:
> On Wed, Nov 27, 2013 at 11:44:38PM -0800, Davidlohr Bueso wrote:
> > How about both enlarging the table _and_ aligning the buckets? As you
> > know, increasing the size of the table also benefits (particularly in
> > larger systems) in having more spinlocks. So we reduce the amount of
> > collisions and alleviate contention on the hb->lock. Btw, do you have
> > any particular concerns about the larger hash table patch? 
> 
> My only concern was the amount of #ifdef.
> 
> Wouldn't something like the below also work?

Below are the results for a workload that stresses the uaddr hashing for
large numbers of futexes (just make waits fail the uval check, so no
list handling overhead) on an 80 core, 1TB NUMA system.

+---------+--------------------+------------------------+-----------------------+-------------------------------+
| threads | baseline (ops/sec) | aligned-only (ops/sec) | large table (ops/sec) | large table+aligned (ops/sec) |
+---------+--------------------+------------------------+-----------------------+-------------------------------+
|     512 |              32426 | 50531  (+55.8%)        | 255274  (+687.2%)     | 292553  (+802.2%)             |
|     256 |              65360 | 99588  (+52.3%)        | 443563  (+578.6%)     | 508088  (+677.3%)             |
|     128 |             125635 | 200075 (+59.2%)        | 742613  (+491.1%)     | 835452  (+564.9%)             |
|      80 |             193559 | 323425 (+67.1%)        | 1028147 (+431.1%)     | 1130304 (+483.9%)             |
|      64 |             247667 | 443740 (+79.1%)        | 997300  (+302.6%)     | 1145494 (+362.5%)             |
|      32 |             628412 | 721401 (+14.7%)        | 965996  (+53.7%)      | 1122115 (+78.5%)              |
+---------+--------------------+------------------------+-----------------------+-------------------------------+

Baseline of course sucks compared to any other performance boost, and we
get the best throughput when applying both optimizations, no surprise.
We do particularly well for more than 32 threads, and the 'aligned-only'
column nicely exemplifies the benefits of SMP aligning the buckets
without considering the reduction in collisions.

Thanks,
Davidlohr


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC patch 0/5] futex: Allow lockless empty check of hashbucket plist in futex_wake()
  2013-11-28 11:59             ` Peter Zijlstra
  2013-11-28 14:23               ` Thomas Gleixner
  2013-12-01  4:37               ` Davidlohr Bueso
@ 2013-12-01 12:10               ` Ingo Molnar
  2013-12-01 12:56                 ` Peter Zijlstra
  2 siblings, 1 reply; 31+ messages in thread
From: Ingo Molnar @ 2013-12-01 12:10 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Davidlohr Bueso, Thomas Gleixner, LKML, Jason Low, Darren Hart,
	Mike Galbraith, Jeff Mahoney, Linus Torvalds, Scott Norton,
	Tom Vaden, Aswin Chandramouleeswaran, Waiman Long,
	Paul E. McKenney, Andrew Morton


* Peter Zijlstra <peterz@infradead.org> wrote:

> Wouldn't something like the below also work?

> -#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
> -
>  /*
>   * Futex flags used to encode options to functions and preserve them across
>   * restarts.
> @@ -149,9 +147,11 @@ static const struct futex_q futex_q_init = {
>  struct futex_hash_bucket {
>  	spinlock_t lock;
>  	struct plist_head chain;
> -};
> +} ____cacheline_aligned_in_smp;
>  
> -static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
> +static unsigned long __read_mostly futex_hashsize;
> +
> +static struct futex_hash_bucket *futex_queues;
>  
>  /*
>   * We hash on the keys returned from get_futex_key (see below).
> @@ -161,7 +161,7 @@ static struct futex_hash_bucket *hash_futex(union futex_key *key)
>  	u32 hash = jhash2((u32*)&key->both.word,
>  			  (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
>  			  key->both.offset);
> -	return &futex_queues[hash & ((1 << FUTEX_HASHBITS)-1)];
> +	return &futex_queues[hash & (futex_hashsize - 1)];
>  }
>  
>  /*
> @@ -2735,6 +2735,16 @@ static int __init futex_init(void)
>  	u32 curval;
>  	int i;
>  
> +#if CONFIG_BASE_SMALL
> +	futex_hashsize = 16;
> +#else
> +	futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
> +#endif
> +
> +	futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
> +			futex_hashsize, 0, futex_hashsize < 256 ? HASH_SMALL : 0,
> +			NULL, NULL, futex_hashsize, futex_hashsize);
> +

Two observations:

1)

Once we go dynamic hash here we might as well do a proper job: I think
we should scale up by RAM as well.

A 64 GB computing node with 64 cores should probably not scale the
same way as a 64 TB system with 64 cores, right?

Something like += log2(memory_per_node / 1GB) ? I.e. every doubling in 
the per node gigabytes adds one more bit to the hash, or so.
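
Something vaguely like this, as a sketch only (the helper and its name
are made up, just to illustrate the scaling idea):

	static unsigned long futex_hashsize_estimate(void)
	{
		/* Start from the CPU-count based estimate ... */
		unsigned long bits = ilog2(roundup_pow_of_two(256 * num_possible_cpus()));
		/* ... and add one bit per doubling of per-node RAM above 1GB */
		unsigned long node_pages = totalram_pages / max_t(unsigned int, 1, nr_online_nodes);
		unsigned long node_gb = node_pages >> (30 - PAGE_SHIFT);

		if (node_gb > 1)
			bits += ilog2(node_gb);

		return 1UL << bits;
	}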

2)

But more importantly, since these are all NUMA systems, would it make 
sense to create per node hashes on NUMA? Each futex would be enqueued 
into the hash belonging to its own page's node.

That kind of separation would both reduce the collision count, and it 
would also reduce the cost of a collision. (it would slightly increase 
hash calculation cost.)

(It also makes hash size calculation nicely node count independent, 
we'd only consider per node CPU count and per node memory.)

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC patch 0/5] futex: Allow lockless empty check of hashbucket plist in futex_wake()
  2013-12-01 12:10               ` Ingo Molnar
@ 2013-12-01 12:56                 ` Peter Zijlstra
  2013-12-01 16:55                   ` Ingo Molnar
  0 siblings, 1 reply; 31+ messages in thread
From: Peter Zijlstra @ 2013-12-01 12:56 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Davidlohr Bueso, Thomas Gleixner, LKML, Jason Low, Darren Hart,
	Mike Galbraith, Jeff Mahoney, Linus Torvalds, Scott Norton,
	Tom Vaden, Aswin Chandramouleeswaran, Waiman Long,
	Paul E. McKenney, Andrew Morton

On Sun, Dec 01, 2013 at 01:10:22PM +0100, Ingo Molnar wrote:
> But more importantly, since these are all NUMA systems, would it make 
> sense to create per node hashes on NUMA? Each futex would be enqueued 
> into the hash belonging to its own page's node.

Can't do that; we hash on vaddr, the actual page can move between nodes
while a futex is queued.

This would mean that the waiting futex is queued on another node than
the waker is looking.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC patch 0/5] futex: Allow lockless empty check of hashbucket plist in futex_wake()
  2013-12-01 12:56                 ` Peter Zijlstra
@ 2013-12-01 16:55                   ` Ingo Molnar
  2013-12-01 18:58                     ` Linus Torvalds
  0 siblings, 1 reply; 31+ messages in thread
From: Ingo Molnar @ 2013-12-01 16:55 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Davidlohr Bueso, Thomas Gleixner, LKML, Jason Low, Darren Hart,
	Mike Galbraith, Jeff Mahoney, Linus Torvalds, Scott Norton,
	Tom Vaden, Aswin Chandramouleeswaran, Waiman Long,
	Paul E. McKenney, Andrew Morton


* Peter Zijlstra <peterz@infradead.org> wrote:

> On Sun, Dec 01, 2013 at 01:10:22PM +0100, Ingo Molnar wrote:
>
> > But more importantly, since these are all NUMA systems, would it 
> > make sense to create per node hashes on NUMA? Each futex would be 
> > enqueued into the hash belonging to its own page's node.
> 
> Can't do that; we hash on vaddr, the actual page can move between 
> nodes while a futex is queued.

Hm, indeed. We used to hash on the physical address - the very first 
futex version from Rusty did:

+static inline struct list_head *hash_futex(struct page *page,
+                                          unsigned long offset)
+{
+       unsigned long h;
+
+       /* struct page is shared, so we can hash on its address */
+       h = (unsigned long)page + offset;
+       return &futex_queues[hash_long(h, FUTEX_HASHBITS)];
+}

But this was changed to uaddr keying in:

  69e9c9b518fc [PATCH] Unpinned futexes v2: indexing changes

(commit from the linux historic git tree.)

I think this design aspect could perhaps be revisited/corrected - in 
what situations can a page move from under a futex? Only via the 
memory migration system calls, or are there other channels as well? 
Swapping should not affect the address, as the pages are pinned, 
right?

Keeping the page invariant would bring significant performance 
advantages to hashing.

> This would mean that the waiting futex is queued on another node 
> than the waker is looking.

Yeah, that cannot work.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC patch 0/5] futex: Allow lockless empty check of hashbucket plist in futex_wake()
  2013-12-01 16:55                   ` Ingo Molnar
@ 2013-12-01 18:58                     ` Linus Torvalds
  2013-12-01 20:39                       ` Eric Dumazet
  2013-12-02 12:35                       ` Ingo Molnar
  0 siblings, 2 replies; 31+ messages in thread
From: Linus Torvalds @ 2013-12-01 18:58 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Peter Zijlstra, Davidlohr Bueso, Thomas Gleixner, LKML,
	Jason Low, Darren Hart, Mike Galbraith, Jeff Mahoney,
	Scott Norton, Tom Vaden, Aswin Chandramouleeswaran, Waiman Long,
	Paul E. McKenney, Andrew Morton

On Sun, Dec 1, 2013 at 8:55 AM, Ingo Molnar <mingo@kernel.org> wrote:
>
> Keeping the page invariant would bring significant performance
> advantages to hashing.

Or not. Rather, it would make things much worse. The virtual address
is much simpler and better to avoid needing any page table lookup etc
crap. The key is just the mm and the virtual address, and no silly
page table walks etc necessary.

Of course, I have no idea if people are properly using the private
futexes. glibc _should_ use them, but who the heck knows..

                Linus

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC patch 0/5] futex: Allow lockless empty check of hashbucket plist in futex_wake()
  2013-12-01 18:58                     ` Linus Torvalds
@ 2013-12-01 20:39                       ` Eric Dumazet
  2013-12-01 21:46                         ` Linus Torvalds
  2013-12-02 12:35                       ` Ingo Molnar
  1 sibling, 1 reply; 31+ messages in thread
From: Eric Dumazet @ 2013-12-01 20:39 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Ingo Molnar, Peter Zijlstra, Davidlohr Bueso, Thomas Gleixner,
	LKML, Jason Low, Darren Hart, Mike Galbraith, Jeff Mahoney,
	Scott Norton, Tom Vaden, Aswin Chandramouleeswaran, Waiman Long,
	Paul E. McKenney, Andrew Morton

On Sun, 2013-12-01 at 10:58 -0800, Linus Torvalds wrote:

> Of course, I have no idea if people are properly using the private
> futexes. glibc _should_ use them, but who the heck knows..

Last time I checked, private futexes were used when appropriate.

"strace -e futex" mainly show _PRIVATE uses.

Example on "strace -f -e futex host www.google.com"




^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC patch 0/5] futex: Allow lockless empty check of hashbucket plist in futex_wake()
  2013-12-01 20:39                       ` Eric Dumazet
@ 2013-12-01 21:46                         ` Linus Torvalds
  2013-12-03 17:59                           ` Darren Hart
  0 siblings, 1 reply; 31+ messages in thread
From: Linus Torvalds @ 2013-12-01 21:46 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Ingo Molnar, Peter Zijlstra, Davidlohr Bueso, Thomas Gleixner,
	LKML, Jason Low, Darren Hart, Mike Galbraith, Jeff Mahoney,
	Scott Norton, Tom Vaden, Aswin Chandramouleeswaran, Waiman Long,
	Paul E. McKenney, Andrew Morton

On Sun, Dec 1, 2013 at 12:39 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
> Last time I checked, private futexes were used when appropriate.
>
> "strace -e futex" mainly show _PRIVATE uses.

Yeah, pthread mutexes seem to do it. Sadly we don't do it for
mm_release(), so the case of clear_child_tid doesn't trigger it, and
that happened to be what I tested (I did "git diff" with the threaded
pre-population of the stat data).  And we can't change that because
it's ABI. Sad.

So we do have a couple of corner cases where we *could* use the
private ones but don't. But I guess that thread creation/exit is
heavy-weight enough that that particular one doesn't really matter.

               Linus

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC patch 0/5] futex: Allow lockless empty check of hashbucket plist in futex_wake()
  2013-12-01  4:37               ` Davidlohr Bueso
@ 2013-12-02 11:01                 ` Thomas Gleixner
  0 siblings, 0 replies; 31+ messages in thread
From: Thomas Gleixner @ 2013-12-02 11:01 UTC (permalink / raw)
  To: Davidlohr Bueso
  Cc: Peter Zijlstra, LKML, Jason Low, Ingo Molnar, Darren Hart,
	Mike Galbraith, Jeff Mahoney, Linus Torvalds, Scott Norton,
	Tom Vaden, Aswin Chandramouleeswaran, Waiman Long,
	Paul E. McKenney

On Sat, 30 Nov 2013, Davidlohr Bueso wrote:
> On Thu, 2013-11-28 at 12:59 +0100, Peter Zijlstra wrote:
> > On Wed, Nov 27, 2013 at 11:44:38PM -0800, Davidlohr Bueso wrote:
> > > How about both enlarging the table _and_ aligning the buckets? As you
> > > know, increasing the size of the table also benefits (particularly in
> > > larger systems) in having more spinlocks. So we reduce the amount of
> > > collisions and alleviate contention on the hb->lock. Btw, do you have
> > > any particular concerns about the larger hash table patch? 
> > 
> > My only concern was the amount of #ifdef.
> > 
> > Wouldn't something like the below also work?
> 
> Below are the results for a workload that stresses the uaddr hashing for
> large numbers of futexes (just make waits fail the uval check, so no
> list handling overhead) on an 80 core, 1TB NUMA system.
> 
> +---------+--------------------+------------------------+-----------------------+-------------------------------+
> | threads | baseline (ops/sec) | aligned-only (ops/sec) | large table (ops/sec) | large table+aligned (ops/sec) |
> +---------+--------------------+------------------------+-----------------------+-------------------------------+
> |     512 |              32426 | 50531  (+55.8%)        | 255274  (+687.2%)     | 292553  (+802.2%)             |
> |     256 |              65360 | 99588  (+52.3%)        | 443563  (+578.6%)     | 508088  (+677.3%)             |
> |     128 |             125635 | 200075 (+59.2%)        | 742613  (+491.1%)     | 835452  (+564.9%)             |
> |      80 |             193559 | 323425 (+67.1%)        | 1028147 (+431.1%)     | 1130304 (+483.9%)             |
> |      64 |             247667 | 443740 (+79.1%)        | 997300  (+302.6%)     | 1145494 (+362.5%)             |
> |      32 |             628412 | 721401 (+14.7%)        | 965996  (+53.7%)      | 1122115 (+78.5%)              |
> +---------+--------------------+------------------------+-----------------------+-------------------------------+
> 
> Baseline of course sucks compared to any other performance boost, and we
> get the best throughput when applying both optimizations, no surprise.
> We do particularly well for more than 32 threads, and the 'aligned-only'
> column nicely exemplifies the benefits of SMP aligning the buckets
> without considering the reduction in collisions.

Right. So yes, we want both. And I fully agree with Peters dynamic
allocation approach.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC patch 0/5] futex: Allow lockless empty check of hashbucket plist in futex_wake()
  2013-12-01 18:58                     ` Linus Torvalds
  2013-12-01 20:39                       ` Eric Dumazet
@ 2013-12-02 12:35                       ` Ingo Molnar
  1 sibling, 0 replies; 31+ messages in thread
From: Ingo Molnar @ 2013-12-02 12:35 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Peter Zijlstra, Davidlohr Bueso, Thomas Gleixner, LKML,
	Jason Low, Darren Hart, Mike Galbraith, Jeff Mahoney,
	Scott Norton, Tom Vaden, Aswin Chandramouleeswaran, Waiman Long,
	Paul E. McKenney, Andrew Morton


* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> On Sun, Dec 1, 2013 at 8:55 AM, Ingo Molnar <mingo@kernel.org> wrote:
> >
> > Keeping the page invariant would bring significant performance 
> > advantages to hashing.
> 
> Or not. Rather, it would make things much worse. The virtual address 
> is much simpler and better to avoid needing any page table lookup 
> etc crap. The key is just the mm and the virtual address, and no 
> silly page table walks etc necessary.

In theory the CPU could give us the physical page address, as the TLB
is already there: for most futex ops, by the time the kernel runs we
have just attempted an access of the virtual address on the user-space
side, so the hardware already did the hard work of looking up
everything in the page tables and most likely has it cached.

But yeah, you are right :-/

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [RFC patch 0/5] futex: Allow lockless empty check of hashbucket plist in futex_wake()
  2013-12-01 21:46                         ` Linus Torvalds
@ 2013-12-03 17:59                           ` Darren Hart
  0 siblings, 0 replies; 31+ messages in thread
From: Darren Hart @ 2013-12-03 17:59 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Eric Dumazet, Ingo Molnar, Peter Zijlstra, Davidlohr Bueso,
	Thomas Gleixner, LKML, Jason Low, Mike Galbraith, Jeff Mahoney,
	Scott Norton, Tom Vaden, Aswin Chandramouleeswaran, Waiman Long,
	Paul E. McKenney, Andrew Morton

On Sun, 2013-12-01 at 13:46 -0800, Linus Torvalds wrote:
> On Sun, Dec 1, 2013 at 12:39 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> >
> > Last time I checked, private futexes were used when appropriate.
> >
> > "strace -e futex" mainly show _PRIVATE uses.
> 
> Yeah, pthread mutexes seem to do it.

Yes, the default was changed to _PRIVATE for pthreads in glibc several
years back (2007 iirc). The performance benefits were significant.

-- 
Darren Hart
Intel Open Source Technology Center
Yocto Project - Linux Kernel



^ permalink raw reply	[flat|nested] 31+ messages in thread

end of thread, other threads:[~2013-12-03 18:00 UTC | newest]

Thread overview: 31+ messages
2013-11-25 20:58 [RFC patch 0/5] futex: Allow lockless empty check of hashbucket plist in futex_wake() Thomas Gleixner
2013-11-25 20:58 ` [RFC patch 1/5] futex: Misc cleanups Thomas Gleixner
2013-11-25 20:58 ` [RFC patch 2/5] futex: Document ordering guarantees Thomas Gleixner
2013-11-25 20:58 ` [RFC patch 3/5] futex: Split out unlock from queue_me() Thomas Gleixner
2013-11-25 20:58 ` [RFC patch 4/5] futex: Enqueue waiter before user space check Thomas Gleixner
2013-11-26  0:20   ` Darren Hart
2013-11-25 20:58 ` [RFC patch 5/5] futex: Allow lockless empty check of hash bucket plist Thomas Gleixner
2013-11-26  8:12 ` [RFC patch 0/5] futex: Allow lockless empty check of hashbucket plist in futex_wake() Davidlohr Bueso
2013-11-26  8:52   ` Peter Zijlstra
2013-11-26 11:21     ` Ingo Molnar
2013-11-26 11:56       ` Peter Zijlstra
2013-11-26 12:34         ` Thomas Gleixner
2013-11-26 15:38           ` Davidlohr Bueso
2013-11-26 14:49         ` Davidlohr Bueso
2013-11-26 19:25     ` Davidlohr Bueso
2013-11-26 20:51       ` Davidlohr Bueso
2013-11-26 23:56         ` Thomas Gleixner
2013-11-28  7:44           ` Davidlohr Bueso
2013-11-28 11:58             ` Thomas Gleixner
2013-11-28 11:59             ` Peter Zijlstra
2013-11-28 14:23               ` Thomas Gleixner
2013-12-01  4:37               ` Davidlohr Bueso
2013-12-02 11:01                 ` Thomas Gleixner
2013-12-01 12:10               ` Ingo Molnar
2013-12-01 12:56                 ` Peter Zijlstra
2013-12-01 16:55                   ` Ingo Molnar
2013-12-01 18:58                     ` Linus Torvalds
2013-12-01 20:39                       ` Eric Dumazet
2013-12-01 21:46                         ` Linus Torvalds
2013-12-03 17:59                           ` Darren Hart
2013-12-02 12:35                       ` Ingo Molnar
