[patch V2 2/7] futex: Hash private futexes per process

From: Thomas Gleixner <tglx@linutronix.de>
To: LKML <linux-kernel@vger.kernel.org>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>,
	Linus Torvalds <torvalds@linux-foundation.org>,
	Darren Hart <darren@dvhart.com>,
	Peter Zijlstra <peterz@infradead.org>,
	Ingo Molnar <mingo@kernel.org>,
	Michael Kerrisk <mtk.manpages@googlemail.com>,
	Davidlohr Bueso <dave@stgolabs.net>, Chris Mason <clm@fb.com>,
	"Carlos O'Donell" <carlos@redhat.com>,
	Torvald Riegel <triegel@redhat.com>,
	Eric Dumazet <edumazet@google.com>
Subject: [patch V2 2/7] futex: Hash private futexes per process
Date: Thu, 05 May 2016 20:44:04 -0000	[thread overview]
Message-ID: <20160505204353.973009518@linutronix.de> (raw)
In-Reply-To: 20160505204230.932454245@linutronix.de

[-- Attachment #1: futex-Hash-private-futexes-per-process.patch --]
[-- Type: text/plain, Size: 13765 bytes --]

From: Sebastian Siewior <bigeasy@linutronix.de>

The standard futex mechanism in the Linux kernel uses a global hash to store
transient state. Collisions on that hash can lead to performance degradation,
especially on NUMA systems, and on real-time enabled kernels even to priority
inversions.

To mitigate that problem we provide per process private hashing. On the first
futex operation in a process the kernel allocates a hash table. The hash table
is accessible via the process mm_struct. On NUMA systems the hash is allocated
node local.

If the allocation fails then the global hash table is used as fallback, so
there is no user space visible impact of this feature.

The hash size is a default value which can be tweaked by the sys admin. The
sysctl interface is implemented in a follow up patch to make the review
simpler. For applications which have special requirements for the private hash
and to allow preallocation of the hash for RT applications, we'll provide a
futex OP in a follow up patch.

Performance data acquired on a 4 socket (node) Intel machine with perf bench
futex-hash:

Threads  G 65536  P 4	  P 8      P 16       P 32     P 64     P 128    P 256

1        8175006  8645465  8617469  8628686   8625223  8664491  8590934  8631582
2	 8149869  8618385  8578185  8622267   8603253  8618787  8595073  8590591
4	 7479482  5867840  7882991  7604838   7894380  7882850  7884911  7886278
8	 7308822  2378057  5731051  5550479   7691198  7672814  7711939  7681549
16	 7295893   677414  2670682  3453552   7158906  7688978  7677603  7690290

So with a properly sized private hash, the private hash is ~5% faster than the
global hash.

With a full perf bench futex-hash run with one process (36 threads) per node
and 1024 futexes per thread the following results are achieved:

G 65536	 P 4     P 8     P 16     P 32     P 64     P 128    P 256    P 512    P 1024  P 2048     
2673390  368952  682626  1223908  1845922  3003524  3538313  4118533  4286925  4289589 4274020

Ratio:   0,14    0,26    0,46     0,69	   1,12     1,32     1,54     1,60     1,60    1,60

So with a private hash size of 256 buckets and above the performance is almost
steady in this pathological test case and a factor of 1.6 better than the
global hash. Even a 64 bucket hash is already 10% faster.

Signed-off-by: Sebastian Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/futex.h       |   38 ++++++++--
 include/linux/futex_types.h |   12 +++
 include/linux/mm_types.h    |    4 +
 init/Kconfig                |    4 +
 kernel/fork.c               |    3 
 kernel/futex.c              |  162 +++++++++++++++++++++++++++++++++++++++++++-
 6 files changed, 212 insertions(+), 11 deletions(-)
 create mode 100644 include/linux/futex_types.h

--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -1,6 +1,7 @@
 #ifndef _LINUX_FUTEX_H
 #define _LINUX_FUTEX_H
 
+#include <linux/futex_types.h>
 #include <uapi/linux/futex.h>
 
 struct inode;
@@ -21,16 +22,19 @@ handle_futex_death(u32 __user *uaddr, st
  *
  * offset is aligned to a multiple of sizeof(u32) (== 4) by definition.
  * We use the two low order bits of offset to tell what is the kind of key :
- *  00 : Private process futex (PTHREAD_PROCESS_PRIVATE)
- *       (no reference on an inode or mm)
+ *  00 : Private process futex (PTHREAD_PROCESS_PRIVATE) using process private
+ *	 hash (no reference on an inode or mm)
  *  01 : Shared futex (PTHREAD_PROCESS_SHARED)
  *	mapped on a file (reference on the underlying inode)
  *  10 : Shared futex (PTHREAD_PROCESS_SHARED)
  *       (but private mapping on an mm, and reference taken on it)
+ *  11 : Private process futex (PTHREAD_PROCESS_PRIVATE) using global hash
+ *	 (no reference on an inode or mm)
 */
 
-#define FUT_OFF_INODE    1 /* We set bit 0 if key has a reference on inode */
-#define FUT_OFF_MMSHARED 2 /* We set bit 1 if key has a reference on mm */
+#define FUT_OFF_INODE		0x01 /* Key has a reference on inode */
+#define FUT_OFF_MMSHARED	0x02 /* Key has a reference on mm */
+#define FUT_OFF_PRIVATE		0x03 /* Key has no ref on inode/mm */
 
 union futex_key {
 	struct {
@@ -60,12 +64,30 @@ extern void exit_pi_state_list(struct ta
 #else
 extern int futex_cmpxchg_enabled;
 #endif
+
 #else
-static inline void exit_robust_list(struct task_struct *curr)
-{
-}
-static inline void exit_pi_state_list(struct task_struct *curr)
+static inline void exit_robust_list(struct task_struct *curr) { }
+static inline void exit_pi_state_list(struct task_struct *curr) { }
+#endif
+
+#ifdef CONFIG_FUTEX_PRIVATE_HASH
+/* Process private hash data for futexes */
+
+extern unsigned int futex_default_hash_bits;
+extern unsigned int futex_max_hash_bits;
+
+extern void futex_mm_hash_exit(struct mm_struct *mm);
+
+static inline void futex_mm_hash_init(struct mm_struct *mm)
 {
+	raw_spin_lock_init(&mm->futex_hash.lock);
+	mm->futex_hash.hash = NULL;	/* no private hash installed yet */
 }
+
+#else
+
+static inline void futex_mm_hash_init(struct mm_struct *mm) { }
+static inline void futex_mm_hash_exit(struct mm_struct *mm) { }
 #endif
+
 #endif
--- /dev/null
+++ b/include/linux/futex_types.h
@@ -0,0 +1,12 @@
+#ifndef _LINUX_FUTEX_TYPES_H
+#define _LINUX_FUTEX_TYPES_H
+
+struct futex_hash_bucket;
+
+struct futex_hash {
+	struct raw_spinlock		lock;		/* serializes installing @hash */
+	unsigned int			hash_bits;	/* log2 of the bucket count */
+	struct futex_hash_bucket	*hash;		/* NULL until populated */
+};
+
+#endif
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -11,6 +11,7 @@
 #include <linux/completion.h>
 #include <linux/cpumask.h>
 #include <linux/uprobes.h>
+#include <linux/futex_types.h>
 #include <linux/page-flags-layout.h>
 #include <asm/page.h>
 #include <asm/mmu.h>
@@ -442,6 +443,9 @@ struct mm_struct {
 
 	struct linux_binfmt *binfmt;
 
+#ifdef CONFIG_FUTEX_PRIVATE_HASH
+	struct futex_hash futex_hash;
+#endif
 	cpumask_var_t cpu_vm_mask_var;
 
 	/* Architecture-specific MM context */
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1498,6 +1498,10 @@ config FUTEX
 	  support for "fast userspace mutexes".  The resulting kernel may not
 	  run glibc-based applications correctly.
 
+config FUTEX_PRIVATE_HASH
+	bool
+	default FUTEX && SMP
+
 config HAVE_FUTEX_CMPXCHG
 	bool
 	depends on FUTEX
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -617,6 +617,8 @@ static struct mm_struct *mm_init(struct
 	mm_init_owner(mm, p);
 	mmu_notifier_mm_init(mm);
 	clear_tlb_flush_pending(mm);
+	futex_mm_hash_init(mm);
+
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
 	mm->pmd_huge_pte = NULL;
 #endif
@@ -713,6 +715,7 @@ void mmput(struct mm_struct *mm)
 		khugepaged_exit(mm); /* must run before exit_mmap */
 		exit_mmap(mm);
 		set_mm_exe_file(mm, NULL);
+		futex_mm_hash_exit(mm);
 		if (!list_empty(&mm->mmlist)) {
 			spin_lock(&mmlist_lock);
 			list_del(&mm->mmlist);
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -23,6 +23,9 @@
  *  Copyright (C) IBM Corporation, 2009
  *  Thanks to Thomas Gleixner for conceptual design and careful reviews.
  *
+ *  Private hashed futex support by Sebastian Siewior and Thomas Gleixner
+ *  Copyright (C) Linutronix GmbH, 2016
+ *
  *  Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
  *  enough at me, Linus for the original (flawed) idea, Matthew
  *  Kirkwood for proof-of-concept implementation.
@@ -49,6 +52,7 @@
 #include <linux/fs.h>
 #include <linux/file.h>
 #include <linux/jhash.h>
+#include <linux/hash.h>
 #include <linux/init.h>
 #include <linux/futex.h>
 #include <linux/mount.h>
@@ -169,6 +173,34 @@
  * the code that actually moves the futex(es) between hash buckets (requeue_futex)
  * will do the additional required waiter count housekeeping. This is done for
  * double_lock_hb() and double_unlock_hb(), respectively.
+ *
+ * For private futexes we (pre)allocate a per process hash. We check lockless
+ * whether the hash is already allocated. To access the hash later we need
+ * information about the hash properties as well. This requires barriers as
+ * follows:
+ *
+ * CPU 0					CPU 1
+ * check_hash_allocation()
+ *	if (mm->futex_hash.hash)
+ *		return;
+ *	hash = alloc_hash()
+ *	lock(&mm->futex_hash.lock);
+ *	if (!mm->futex_hash.hash) {
+ *	  mm->futex_hash.par = params;
+ *
+ *	  smp_wmb(); (A0) <-paired with-|
+ *					|
+ *	  mm->futex_hash.hash = hash;	|
+ *					|	check_hash_allocation()
+ *					|	   if (mm->futex_hash.hash)
+ *					|		return;
+ *	  unlock(&mm->futex_hash.lock);	|	get_futex_key_refs()
+ *					|
+ *					|--------- smp_mb() (B)
+ *						s = hash(f, mm->futex_hash.par);
+ *						hb = &mm->futex_hash.hash[s];
+ *
+ * So we utilize the existing smp_mb() in get_futex_key_refs().
  */
 
 #ifndef CONFIG_HAVE_FUTEX_CMPXCHG
@@ -255,6 +287,22 @@ struct futex_hash_bucket {
 	struct plist_head chain;
 } ____cacheline_aligned_in_smp;
 
+#ifdef CONFIG_FUTEX_PRIVATE_HASH
+/*
+ * Process private hash for non-shared futexes
+ */
+#define FUTEX_USE_GLOBAL_HASH		((void *) 0x03)
+
+#define FUTEX_MIN_HASH_BITS		order_base_2(4UL)
+#define FUTEX_DEF_HASH_BITS		order_base_2(8UL)
+#define FUTEX_MAX_HASH_BITS		order_base_2(256UL)
+
+unsigned int futex_default_hash_bits	= FUTEX_DEF_HASH_BITS;
+unsigned int futex_max_hash_bits	= FUTEX_MAX_HASH_BITS;
+#else
+static const unsigned int futex_default_hash_bits = 0;
+#endif
+
 /*
  * The base of the bucket array and its size are always used together
  * (after initialization only in hash_futex()), so ensure that they
@@ -374,13 +422,13 @@ static inline int hb_waiters_pending(str
 }
 
 /**
- * hash_futex - Return the hash bucket in the global hash
+ * hash_global_futex - Return the hash bucket in the global hash
  * @key:	Pointer to the futex key for which the hash is calculated
  *
  * We hash on the keys returned from get_futex_key (see below) and return the
  * corresponding hash bucket in the global hash.
  */
-static struct futex_hash_bucket *hash_futex(union futex_key *key)
+static struct futex_hash_bucket *hash_global_futex(union futex_key *key)
 {
 	u32 hash = jhash2((u32*)&key->both.word,
 			  (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
@@ -388,9 +436,33 @@ static struct futex_hash_bucket *hash_fu
 	return &futex_queues[hash & (futex_hashsize - 1)];
 }
 
+/**
+ * hash_futex - Get the hash bucket for a futex
+ * @key:	Pointer to the futex key for which the hash is calculated
+ *
+ * Returns the process private or the global hash bucket which fits the key.
+ */
+static struct futex_hash_bucket *hash_futex(union futex_key *key)
+{
+#ifdef CONFIG_FUTEX_PRIVATE_HASH
+	struct mm_struct *mm = current->mm;
+	unsigned int slot;
+
+	/*
+	 * Keys of the per process hash have both low offset bits cleared
+	 */
+	if (key->both.offset & (FUT_OFF_INODE | FUT_OFF_MMSHARED))
+		return hash_global_futex(key);
+
+	slot = hash_long(key->private.address, mm->futex_hash.hash_bits);
+	return &mm->futex_hash.hash[slot];
+#else
+	return hash_global_futex(key);
+#endif
+}
 
 /**
- * match_futex - Check whether to futex keys are equal
+ * match_futex - Check whether two futex keys are equal
  * @key1:	Pointer to key1
  * @key2:	Pointer to key2
  *
@@ -505,7 +577,20 @@ get_futex_key(u32 __user *uaddr, int fsh
 	 */
 	if (!fshared) {
 		key->private.mm = mm;
+		/*
+		 * If we have a process private hash, then we store uaddr
+		 * instead of the page base address.
+		 */
+#ifdef CONFIG_FUTEX_PRIVATE_HASH
+		if (mm->futex_hash.hash != FUTEX_USE_GLOBAL_HASH) {
+			key->private.address = (unsigned long) uaddr;
+		} else {
+			key->private.address = address;
+			key->both.offset |= FUT_OFF_PRIVATE;
+		}
+#else
 		key->private.address = address;
+#endif
 		get_futex_key_refs(key);  /* implies smp_mb(); (B) */
 		return 0;
 	}
@@ -3153,6 +3238,75 @@ void exit_robust_list(struct task_struct
 				   curr, pip);
 }
 
+#ifdef CONFIG_FUTEX_PRIVATE_HASH
+
+void futex_mm_hash_exit(struct mm_struct *mm)
+{
+	if (mm->futex_hash.hash && mm->futex_hash.hash != FUTEX_USE_GLOBAL_HASH)
+		kfree(mm->futex_hash.hash);	/* marker value is not memory */
+	mm->futex_hash.hash = NULL;
+}
+
+/* Allocate a node local array of 2^hash_bits buckets; NULL on failure */
+static struct futex_hash_bucket *futex_alloc_hash(unsigned int hash_bits)
+{
+	struct futex_hash_bucket *hb;
+	size_t hash_size, size, i;
+
+	hash_size = (size_t)1 << hash_bits;
+	size = hash_size * sizeof(*hb);
+	hb = kzalloc_node(size, GFP_KERNEL, numa_node_id());
+	if (!hb)
+		return NULL;
+
+	for (i = 0; i < hash_size; i++) {
+		atomic_set(&hb[i].waiters, 0);
+		plist_head_init(&hb[i].chain);
+		spin_lock_init(&hb[i].lock);
+	}
+	return hb;
+}
+
+/* Install the process private futex hash for current->mm on first use */
+static void futex_populate_hash(unsigned int hash_bits)
+{
+	struct mm_struct *mm = current->mm;
+	struct futex_hash_bucket *hb;
+
+	/*
+	 * We don't need an explicit smp_mb() when the hash is populated
+	 * because before we dereference mm->futex_hash.hash_bits in the hash
+	 * function we have an smp_mb() in get_futex_key_refs() already.
+	 */
+	if (mm->futex_hash.hash)
+		return;
+
+	/*
+	 * A failed allocation leaves hb NULL. The global hash marker is then
+	 * installed below, so hb itself always stays safe for kfree().
+	 */
+	hb = futex_alloc_hash(hash_bits);
+
+	raw_spin_lock(&mm->futex_hash.lock);
+	/* We might have raced with another task allocating the hash. */
+	if (!mm->futex_hash.hash) {
+		mm->futex_hash.hash_bits = hash_bits;
+		/*
+		 * Ensure that the above is visible before we store
+		 * the pointer.
+		 */
+		smp_wmb(); /* (A0) Pairs with (B) */
+		mm->futex_hash.hash = hb ? hb : FUTEX_USE_GLOBAL_HASH;
+		hb = NULL;
+	}
+	raw_spin_unlock(&mm->futex_hash.lock);
+	/* Lost the race: drop the unused allocation (no-op for NULL) */
+	kfree(hb);
+}
+#else /* CONFIG_FUTEX_PRIVATE_HASH */
+static inline void futex_populate_hash(unsigned int hash_bits) { }
+#endif
+
 long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 		u32 __user *uaddr2, u32 val2, u32 val3)
 {
@@ -3161,6 +3315,8 @@ long do_futex(u32 __user *uaddr, int op,
 
 	if (!(op & FUTEX_PRIVATE_FLAG))
 		flags |= FLAGS_SHARED;
+	else
+		futex_populate_hash(futex_default_hash_bits);
 
 	if (op & FUTEX_CLOCK_REALTIME) {
 		flags |= FLAGS_CLOCKRT;