All of lore.kernel.org
 help / color / mirror / Atom feed
From: Peter Zijlstra <peterz@infradead.org>
To: tglx@linutronix.de, axboe@kernel.dk
Cc: linux-kernel@vger.kernel.org, peterz@infradead.org,
	mingo@redhat.com, dvhart@infradead.org, dave@stgolabs.net,
	andrealmeid@igalia.com, Andrew Morton <akpm@linux-foundation.org>,
	urezki@gmail.com, hch@infradead.org, lstoakes@gmail.com,
	Arnd Bergmann <arnd@arndb.de>,
	linux-api@vger.kernel.org, linux-mm@kvack.org,
	linux-arch@vger.kernel.org, malteskarupke@web.de
Subject: [PATCH  v2 11/14] futex: Implement FUTEX2_NUMA
Date: Mon, 07 Aug 2023 14:18:54 +0200	[thread overview]
Message-ID: <20230807123323.504975124@infradead.org> (raw)
In-Reply-To: 20230807121843.710612856@infradead.org

Extend the futex2 interface to be numa aware.

When FUTEX2_NUMA is specified for a futex, the user value is extended
to two words (of the same size). The first is the user value we all
know, the second one will be the node to place this futex on.

  struct futex_numa_32 {
	u32 val;
	u32 node;
  };

When node is set to ~0, WAIT will set it to the current node_id such
that WAKE knows where to find it. If userspace corrupts the node value
between WAIT and WAKE, the futex will not be found and no wakeup will
happen.

When FUTEX2_NUMA is not set, the node is simply an extention of the
hash, such that traditional futexes are still interleaved over the
nodes.

This is done to avoid having to have a separate !numa hash-table.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/futex.h   |    3 +
 kernel/futex/core.c     |  129 +++++++++++++++++++++++++++++++++++++++---------
 kernel/futex/futex.h    |   25 +++++++--
 kernel/futex/syscalls.c |    2 
 4 files changed, 128 insertions(+), 31 deletions(-)

--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -34,6 +34,7 @@ union futex_key {
 		u64 i_seq;
 		unsigned long pgoff;
 		unsigned int offset;
+		/* unsigned int node; */
 	} shared;
 	struct {
 		union {
@@ -42,11 +43,13 @@ union futex_key {
 		};
 		unsigned long address;
 		unsigned int offset;
+		/* unsigned int node; */
 	} private;
 	struct {
 		u64 ptr;
 		unsigned long word;
 		unsigned int offset;
+		unsigned int node;	/* NOT hashed! */
 	} both;
 };
 
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -34,7 +34,8 @@
 #include <linux/compat.h>
 #include <linux/jhash.h>
 #include <linux/pagemap.h>
-#include <linux/memblock.h>
+#include <linux/gfp.h>
+#include <linux/vmalloc.h>
 #include <linux/fault-inject.h>
 #include <linux/slab.h>
 
@@ -47,12 +48,14 @@
  * reside in the same cacheline.
  */
 static struct {
-	struct futex_hash_bucket *queues;
 	unsigned long            hashsize;
+	unsigned int		 hashshift;
+	struct futex_hash_bucket *queues[MAX_NUMNODES];
 } __futex_data __read_mostly __aligned(2*sizeof(long));
-#define futex_queues   (__futex_data.queues)
-#define futex_hashsize (__futex_data.hashsize)
 
+#define futex_hashsize	(__futex_data.hashsize)
+#define futex_hashshift	(__futex_data.hashshift)
+#define futex_queues	(__futex_data.queues)
 
 /*
  * Fault injections for futexes.
@@ -105,6 +108,26 @@ late_initcall(fail_futex_debugfs);
 
 #endif /* CONFIG_FAIL_FUTEX */
 
+static int futex_get_value(u32 *val, u32 __user *from, unsigned int flags)
+{
+	switch (futex_size(flags)) {
+	case 1: return __get_user(*val, (u8 __user *)from);
+	case 2: return __get_user(*val, (u16 __user *)from);
+	case 4: return __get_user(*val, (u32 __user *)from);
+	default: BUG();
+	}
+}
+
+static int futex_put_value(u32 val, u32 __user *to, unsigned int flags)
+{
+	switch (futex_size(flags)) {
+	case 1: return __put_user(val, (u8 __user *)to);
+	case 2: return __put_user(val, (u16 __user *)to);
+	case 4: return __put_user(val, (u32 __user *)to);
+	default: BUG();
+	}
+}
+
 /**
  * futex_hash - Return the hash bucket in the global hash
  * @key:	Pointer to the futex key for which the hash is calculated
@@ -114,10 +137,29 @@ late_initcall(fail_futex_debugfs);
  */
 struct futex_hash_bucket *futex_hash(union futex_key *key)
 {
-	u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4,
+	u32 hash = jhash2((u32 *)key,
+			  offsetof(typeof(*key), both.offset) / sizeof(u32),
 			  key->both.offset);
+	int node = key->both.node;
+
+	if (node == -1) {
+		/*
+		 * In case of !FLAGS_NUMA, use some unused hash bits to pick a
+		 * node -- this ensures regular futexes are interleaved across
+		 * the nodes and avoids having to allocate multiple
+		 * hash-tables.
+		 *
+		 * NOTE: this isn't perfectly uniform, but it is fast and
+		 * handles sparse node masks.
+		 */
+		node = (hash >> futex_hashshift) % nr_node_ids;
+		if (!node_possible(node)) {
+			node = find_next_bit_wrap(node_possible_map.bits,
+						  nr_node_ids, node);
+		}
+	}
 
-	return &futex_queues[hash & (futex_hashsize - 1)];
+	return &futex_queues[node][hash & (futex_hashsize - 1)];
 }
 
 
@@ -217,32 +259,56 @@ static u64 get_inode_sequence_number(str
  *
  * lock_page() might sleep, the caller should not hold a spinlock.
  */
-int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key,
+int get_futex_key(void __user *uaddr, unsigned int flags, union futex_key *key,
 		  enum futex_access rw)
 {
 	unsigned long address = (unsigned long)uaddr;
 	struct mm_struct *mm = current->mm;
 	struct page *page, *tail;
 	struct address_space *mapping;
-	int err, ro = 0;
+	int node, err, size, ro = 0;
 	bool fshared;
 
 	fshared = flags & FLAGS_SHARED;
+	size = futex_size(flags);
+	if (flags & FLAGS_NUMA)
+		size *= 2;
 
 	/*
 	 * The futex address must be "naturally" aligned.
 	 */
 	key->both.offset = address % PAGE_SIZE;
-	if (unlikely((address % sizeof(u32)) != 0))
+	if (unlikely((address % size) != 0))
 		return -EINVAL;
 	address -= key->both.offset;
 
-	if (unlikely(!access_ok(uaddr, sizeof(u32))))
+	if (unlikely(!access_ok(uaddr, size)))
 		return -EFAULT;
 
 	if (unlikely(should_fail_futex(fshared)))
 		return -EFAULT;
 
+	if (flags & FLAGS_NUMA) {
+		void __user *naddr = uaddr + size / 2;
+
+		if (futex_get_value(&node, naddr, flags))
+			return -EFAULT;
+
+		if (node == -1) {
+			node = numa_node_id();
+			if (futex_put_value(node, naddr, flags))
+				return -EFAULT;
+
+		} else if (node >= MAX_NUMNODES || !node_possible(node)) {
+			return -EINVAL;
+		}
+
+		key->both.node = node;
+
+	} else {
+		key->both.node = -1;
+	}
+
 	/*
 	 * PROCESS_PRIVATE futexes are fast.
 	 * As the mm cannot disappear under us and the 'key' only needs
@@ -1125,27 +1191,42 @@ void futex_exit_release(struct task_stru
 
 static int __init futex_init(void)
 {
-	unsigned int futex_shift;
-	unsigned long i;
+	unsigned int order, n;
+	unsigned long size, i;
 
 #if CONFIG_BASE_SMALL
 	futex_hashsize = 16;
 #else
-	futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
+	futex_hashsize = 256 * num_possible_cpus();
+	futex_hashsize /= num_possible_nodes();
+	futex_hashsize = roundup_pow_of_two(futex_hashsize);
 #endif
+	futex_hashshift = ilog2(futex_hashsize);
+	size = sizeof(struct futex_hash_bucket) * futex_hashsize;
+	order = get_order(size);
+
+	for_each_node(n) {
+		struct futex_hash_bucket *table;
+
+		if (order > MAX_ORDER)
+			table = vmalloc_huge_node(size, GFP_KERNEL, n);
+		else
+			table = alloc_pages_exact_nid(n, size, GFP_KERNEL);
+
+		BUG_ON(!table);
+
+		for (i = 0; i < futex_hashsize; i++) {
+			atomic_set(&table[i].waiters, 0);
+			spin_lock_init(&table[i].lock);
+			plist_head_init(&table[i].chain);
+		}
 
-	futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
-					       futex_hashsize, 0,
-					       futex_hashsize < 256 ? HASH_SMALL : 0,
-					       &futex_shift, NULL,
-					       futex_hashsize, futex_hashsize);
-	futex_hashsize = 1UL << futex_shift;
-
-	for (i = 0; i < futex_hashsize; i++) {
-		atomic_set(&futex_queues[i].waiters, 0);
-		plist_head_init(&futex_queues[i].chain);
-		spin_lock_init(&futex_queues[i].lock);
+		futex_queues[n] = table;
 	}
+	pr_info("futex hash table, %d nodes, %ld entries (order: %d, %lu bytes)\n",
+		num_possible_nodes(),
+		futex_hashsize, order,
+		sizeof(struct futex_hash_bucket) * futex_hashsize);
 
 	return 0;
 }
--- a/kernel/futex/futex.h
+++ b/kernel/futex/futex.h
@@ -65,6 +65,11 @@ static inline unsigned int futex2_to_fla
 	return flags;
 }
 
+static inline unsigned int futex_size(unsigned int flags)
+{
+	return 1 << (flags & FLAGS_SIZE_MASK);
+}
+
 static inline bool futex_flags_valid(unsigned int flags)
 {
 	/* Only 64bit futexes for 64bit code */
@@ -77,12 +82,20 @@ static inline bool futex_flags_valid(uns
 	if ((flags & FLAGS_SIZE_MASK) != FLAGS_SIZE_32)
 		return false;
 
-	return true;
-}
+	/*
+	 * Must be able to represent both NUMA_NO_NODE and every valid nodeid
+	 * in a futex word.
+	 */
+	if (flags & FLAGS_NUMA) {
+		int bits = 8 * futex_size(flags);
+		u64 max = ~0ULL;
 
-static inline unsigned int futex_size(unsigned int flags)
-{
-	return 1 << (flags & FLAGS_SIZE_MASK);
+		max >>= 64 - bits;
+		if (nr_node_ids >= max)
+			return false;
+	}
+
+	return true;
 }
 
 static inline bool futex_validate_input(unsigned int flags, u64 val)
@@ -183,7 +196,7 @@ enum futex_access {
 	FUTEX_WRITE
 };
 
-extern int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key,
+extern int get_futex_key(void __user *uaddr, unsigned int flags, union futex_key *key,
 			 enum futex_access rw);
 
 extern struct hrtimer_sleeper *
--- a/kernel/futex/syscalls.c
+++ b/kernel/futex/syscalls.c
@@ -179,7 +179,7 @@ SYSCALL_DEFINE6(futex, u32 __user *, uad
 	return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3);
 }
 
-#define FUTEX2_VALID_MASK (FUTEX2_SIZE_MASK | FUTEX2_PRIVATE)
+#define FUTEX2_VALID_MASK (FUTEX2_SIZE_MASK | FUTEX2_NUMA | FUTEX2_PRIVATE)
 
 /**
  * futex_parse_waitv - Parse a waitv array from userspace



  parent reply	other threads:[~2023-08-07 12:37 UTC|newest]

Thread overview: 32+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-08-07 12:18 [PATCH v2 00/14] futex: More futex2 bits Peter Zijlstra
2023-08-07 12:18 ` [PATCH v2 01/14] futex: Clarify FUTEX2 flags Peter Zijlstra
2023-08-07 23:51   ` André Almeida
2023-08-07 12:18 ` [PATCH v2 02/14] futex: Extend the " Peter Zijlstra
2023-08-07 18:56   ` Thomas Gleixner
2023-08-08  0:02   ` André Almeida
2023-08-07 12:18 ` [PATCH v2 03/14] futex: Flag conversion Peter Zijlstra
2023-08-07 18:58   ` Thomas Gleixner
2023-08-08  0:52   ` André Almeida
2023-08-07 12:18 ` [PATCH v2 04/14] futex: Validate futex value against futex size Peter Zijlstra
2023-08-07 12:18 ` [PATCH v2 05/14] futex: Add sys_futex_wake() Peter Zijlstra
2023-08-07 19:00   ` Thomas Gleixner
2023-08-09 22:25   ` André Almeida
2023-08-10 12:13     ` Peter Zijlstra
2023-08-10 17:01       ` André Almeida
2023-08-07 12:18 ` [PATCH v2 06/14] futex: Add sys_futex_wait() Peter Zijlstra
2023-08-07 12:18 ` [PATCH v2 07/14] futex: Propagate flags into get_futex_key() Peter Zijlstra
2023-08-07 12:18 ` [PATCH v2 08/14] futex: Add flags2 argument to futex_requeue() Peter Zijlstra
2023-08-07 19:01   ` Thomas Gleixner
2023-08-07 12:18 ` [PATCH v2 09/14] futex: Add sys_futex_requeue() Peter Zijlstra
2023-08-07 19:05   ` Thomas Gleixner
2023-08-07 12:18 ` [PATCH v2 10/14] mm: Add vmalloc_huge_node() Peter Zijlstra
2023-08-07 12:18 ` Peter Zijlstra [this message]
2023-08-07 21:11   ` [PATCH v2 11/14] futex: Implement FUTEX2_NUMA Thomas Gleixner
2023-08-08  8:54     ` Peter Zijlstra
2023-08-08  9:12       ` Thomas Gleixner
2023-08-07 12:18 ` [PATCH v2 12/14] futex: Propagate flags into futex_get_value_locked() Peter Zijlstra
2023-08-07 21:12   ` Thomas Gleixner
2023-08-07 12:18 ` [PATCH v2 13/14] futex: Enable FUTEX2_{8,16} Peter Zijlstra
2023-08-07 21:15   ` Thomas Gleixner
2023-08-07 12:18 ` [PATCH v2 14/14] futex,selftests: Extend the futex selftests Peter Zijlstra
2023-08-08 20:32 ` [PATCH v2 00/14] futex: More futex2 bits Jens Axboe

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230807123323.504975124@infradead.org \
    --to=peterz@infradead.org \
    --cc=akpm@linux-foundation.org \
    --cc=andrealmeid@igalia.com \
    --cc=arnd@arndb.de \
    --cc=axboe@kernel.dk \
    --cc=dave@stgolabs.net \
    --cc=dvhart@infradead.org \
    --cc=hch@infradead.org \
    --cc=linux-api@vger.kernel.org \
    --cc=linux-arch@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=lstoakes@gmail.com \
    --cc=malteskarupke@web.de \
    --cc=mingo@redhat.com \
    --cc=tglx@linutronix.de \
    --cc=urezki@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.