All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] futex: Support smaller futexes of one byte or two byte size.
@ 2019-12-04 23:52 Malte Skarupke
  2019-12-06 15:31 ` Peter Zijlstra
  2019-12-20 19:08 ` Malte Skarupke
  0 siblings, 2 replies; 11+ messages in thread
From: Malte Skarupke @ 2019-12-04 23:52 UTC (permalink / raw)
  To: tglx, mingo; +Cc: peterz, dvhart, linux-kernel, malteskarupke, Malte Skarupke

Two reasons for adding this:
1. People often want small mutexes. Having mutexes that are only one byte
in size was the first reason WebKit mentioned for re-implementing futexes
in a library:
https://webkit.org/blog/6161/locking-in-webkit/

2. The C++ standard added futexes to the standard library in C++20 under
the name atomic_wait and atomic_notify:
http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2019/p1135r5.html
The C++20 version supports this for atomic variables of any size. The more
sizes we can support, the better the implementation can be in the standard
library.

I had to change where we store two flags that were previously stored in
the low bits of the address, since the address can no longer be assumed
to be aligned on four bytes. Luckily the high bits were free as long as
PAGE_SIZE is never more than 1 gigabyte.

The reason for only supporting u8 and u16 is that those were easy to
support with the existing interface. 64 bit futexes would require bigger
changes. There is also the option of just supporting arbitrarily sized
futexes by repurposing one of the other parameters to indicate the size.
However I believe that even in that case 8 bit and 16 bit futexes
would remain more common, since people usually want to save memory. So
the flags I added wouldn't get in the way of that feature.

In this change I only added support in wait/wake and requeue/cmp_requeue,
as those are the only operations I need right now. Also the other
operations are more complicated. They will have to wait.

Signed-off-by: Malte Skarupke <malteskarupke@web.de>
---
 include/linux/futex.h      |  10 ++-
 include/uapi/linux/futex.h |   7 +-
 kernel/futex.c             | 163 +++++++++++++++++++++++++++++++------
 3 files changed, 151 insertions(+), 29 deletions(-)

diff --git a/include/linux/futex.h b/include/linux/futex.h
index ccaef0097785..86fe0e2780e2 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -14,8 +14,8 @@ struct task_struct;
  * The key type depends on whether it's a shared or private mapping.
  * Don't rearrange members without looking at hash_futex().
  *
- * offset is aligned to a multiple of sizeof(u32) (== 4) by definition.
- * We use the two low order bits of offset to tell what is the kind of key :
+ * offset is the position within a page and is in the range [0, PAGE_SIZE).
+ * We use the next two bits of offset to tell what kind of key this is :
  *  00 : Private process futex (PTHREAD_PROCESS_PRIVATE)
  *       (no reference on an inode or mm)
  *  01 : Shared futex (PTHREAD_PROCESS_SHARED)
@@ -24,8 +24,10 @@ struct task_struct;
  *       (but private mapping on an mm, and reference taken on it)
 */

-#define FUT_OFF_INODE    1 /* We set bit 0 if key has a reference on inode */
-#define FUT_OFF_MMSHARED 2 /* We set bit 1 if key has a reference on mm */
+/* We set the next highest bit if key has a reference on inode */
+#define FUT_OFF_INODE    PAGE_SIZE
+/* We set the bit after that if key has a reference on mm */
+#define FUT_OFF_MMSHARED (2 * FUT_OFF_INODE)

 union futex_key {
 	struct {
diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h
index a89eb0accd5e..1121603c5b64 100644
--- a/include/uapi/linux/futex.h
+++ b/include/uapi/linux/futex.h
@@ -24,7 +24,12 @@

 #define FUTEX_PRIVATE_FLAG	128
 #define FUTEX_CLOCK_REALTIME	256
-#define FUTEX_CMD_MASK		~(FUTEX_PRIVATE_FLAG | FUTEX_CLOCK_REALTIME)
+#define FUTEX_SIZE_8_BITS	512
+#define FUTEX_SIZE_16_BITS	1024
+#define FUTEX_SIZE_32_BITS	0
+#define FUTEX_ALL_SIZE_BITS	(FUTEX_SIZE_8_BITS | FUTEX_SIZE_16_BITS)
+#define FUTEX_CMD_MASK		~(FUTEX_PRIVATE_FLAG | FUTEX_CLOCK_REALTIME | \
+				  FUTEX_ALL_SIZE_BITS)

 #define FUTEX_WAIT_PRIVATE	(FUTEX_WAIT | FUTEX_PRIVATE_FLAG)
 #define FUTEX_WAKE_PRIVATE	(FUTEX_WAKE | FUTEX_PRIVATE_FLAG)
diff --git a/kernel/futex.c b/kernel/futex.c
index 6d50728ef2e7..803d93ff4f4a 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -183,6 +183,8 @@ static int  __read_mostly futex_cmpxchg_enabled;
 #endif
 #define FLAGS_CLOCKRT		0x02
 #define FLAGS_HAS_TIMEOUT	0x04
+#define FLAGS_8_BITS		0x08
+#define FLAGS_16_BITS		0x10

 /*
  * Priority Inheritance state:
@@ -467,7 +469,14 @@ static void drop_futex_key_refs(union futex_key *key)

 enum futex_access {
 	FUTEX_READ,
-	FUTEX_WRITE
+	FUTEX_WRITE,
+	/*
+	 * for operations that only need the address and don't touch the
+	 * memory, like FUTEX_WAKE or FUTEX_REQUEUE. (not FUTEX_CMP_REQUEUE)
+	 * this will skip the size checks of the futex, allowing those
+	 * operations to be used with futexes of any size.
+	 */
+	FUTEX_NO_READ_WRITE
 };

 /**
@@ -504,7 +513,7 @@ futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
 /**
  * get_futex_key() - Get parameters which are the keys for a futex
  * @uaddr:	virtual address of the futex
- * @fshared:	0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
+ * @flags:	futex flags (FLAGS_SHARED, FLAGS_8_BITS etc.)
  * @key:	address where result is stored.
  * @rw:		mapping needs to be read/write (values: FUTEX_READ,
  *              FUTEX_WRITE)
@@ -520,25 +529,37 @@ futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
  * lock_page() might sleep, the caller should not hold a spinlock.
  */
 static int
-get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, enum futex_access rw)
+get_futex_key(u32 __user *uaddr, int flags, union futex_key *key, enum futex_access rw)
 {
 	unsigned long address = (unsigned long)uaddr;
 	struct mm_struct *mm = current->mm;
 	struct page *page, *tail;
 	struct address_space *mapping;
-	int err, ro = 0;
+	int err, fshared, ro = 0;
+
+	fshared = flags & FLAGS_SHARED;

 	/*
 	 * The futex address must be "naturally" aligned.
 	 */
+	if (flags & FLAGS_8_BITS || rw == FUTEX_NO_READ_WRITE) {
+		if (unlikely(!access_ok(uaddr, sizeof(u8))))
+			return -EFAULT;
+	} else if (flags & FLAGS_16_BITS) {
+		if (unlikely((address % sizeof(u16)) != 0))
+			return -EINVAL;
+		if (unlikely(!access_ok(uaddr, sizeof(u16))))
+			return -EFAULT;
+	} else {
+		if (unlikely((address % sizeof(u32)) != 0))
+			return -EINVAL;
+		if (unlikely(!access_ok(uaddr, sizeof(u32))))
+			return -EFAULT;
+	}
+
 	key->both.offset = address % PAGE_SIZE;
-	if (unlikely((address % sizeof(u32)) != 0))
-		return -EINVAL;
 	address -= key->both.offset;

-	if (unlikely(!access_ok(uaddr, sizeof(u32))))
-		return -EFAULT;
-
 	if (unlikely(should_fail_futex(fshared)))
 		return -EFAULT;

@@ -566,7 +587,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, enum futex_a
 	 * If write access is not required (eg. FUTEX_WAIT), try
 	 * and get read-only access.
 	 */
-	if (err == -EFAULT && rw == FUTEX_READ) {
+	if (err == -EFAULT && rw != FUTEX_WRITE) {
 		err = get_user_pages_fast(address, 1, 0, &page);
 		ro = 1;
 	}
@@ -799,6 +820,48 @@ static int get_futex_value_locked(u32 *dest, u32 __user *from)
 	return ret ? -EFAULT : 0;
 }

+static int
+get_futex_value_locked_any_size(u32 *dest, u32 __user *from, int flags)
+{
+	int ret;
+	u8 dest_8_bits;
+	u16 dest_16_bits;
+
+	pagefault_disable();
+	if (flags & FLAGS_8_BITS) {
+		ret = __get_user(dest_8_bits, (u8 __user *)from);
+		*dest = dest_8_bits;
+	} else if (flags & FLAGS_16_BITS) {
+		ret = __get_user(dest_16_bits, (u16 __user *)from);
+		*dest = dest_16_bits;
+	} else {
+		ret = __get_user(*dest, from);
+	}
+	pagefault_enable();
+
+	return ret ? -EFAULT : 0;
+}
+
+static int get_futex_value_any_size(u32 *dest, u32 __user *from, int flags)
+{
+	int ret;
+	u8 uval_8_bits;
+	u16 uval_16_bits;
+
+	if (flags & FLAGS_8_BITS) {
+		ret = get_user(uval_8_bits, (u8 __user *)from);
+		if (ret == 0)
+			*dest = uval_8_bits;
+	} else if (flags & FLAGS_16_BITS) {
+		ret = get_user(uval_16_bits, (u16 __user *)from);
+		if (ret == 0)
+			*dest = uval_16_bits;
+	} else {
+		ret = get_user(*dest, from);
+	}
+	return ret;
+}
+

 /*
  * PI code:
@@ -1605,7 +1668,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
 	if (!bitset)
 		return -EINVAL;

-	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ);
+	ret = get_futex_key(uaddr, flags, &key, FUTEX_NO_READ_WRITE);
 	if (unlikely(ret != 0))
 		goto out;

@@ -1704,10 +1767,10 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
 	DEFINE_WAKE_Q(wake_q);

 retry:
-	ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
+	ret = get_futex_key(uaddr1, flags, &key1, FUTEX_READ);
 	if (unlikely(ret != 0))
 		goto out;
-	ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
+	ret = get_futex_key(uaddr2, flags, &key2, FUTEX_WRITE);
 	if (unlikely(ret != 0))
 		goto out_put_key1;

@@ -1983,11 +2046,12 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
 	}

 retry:
-	ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
+	ret = get_futex_key(uaddr1, flags, &key1,
+			    cmpval ? FUTEX_READ : FUTEX_NO_READ_WRITE);
 	if (unlikely(ret != 0))
 		goto out;
-	ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2,
-			    requeue_pi ? FUTEX_WRITE : FUTEX_READ);
+	ret = get_futex_key(uaddr2, flags, &key2,
+			    requeue_pi ? FUTEX_WRITE : FUTEX_NO_READ_WRITE);
 	if (unlikely(ret != 0))
 		goto out_put_key1;

@@ -2010,13 +2074,13 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
 	if (likely(cmpval != NULL)) {
 		u32 curval;

-		ret = get_futex_value_locked(&curval, uaddr1);
+		ret = get_futex_value_locked_any_size(&curval, uaddr1, flags);

 		if (unlikely(ret)) {
 			double_unlock_hb(hb1, hb2);
 			hb_waiters_dec(hb2);

-			ret = get_user(curval, uaddr1);
+			ret = get_futex_value_any_size(&curval, uaddr1, flags);
 			if (ret)
 				goto out_put_keys;

@@ -2673,19 +2737,19 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
 	 * while the syscall executes.
 	 */
 retry:
-	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, FUTEX_READ);
+	ret = get_futex_key(uaddr, flags, &q->key, FUTEX_READ);
 	if (unlikely(ret != 0))
 		return ret;

 retry_private:
 	*hb = queue_lock(q);

-	ret = get_futex_value_locked(&uval, uaddr);
+	ret = get_futex_value_locked_any_size(&uval, uaddr, flags);

 	if (ret) {
 		queue_unlock(*hb);

-		ret = get_user(uval, uaddr);
+		ret = get_futex_value_any_size(&uval, uaddr, flags);
 		if (ret)
 			goto out;

@@ -2817,7 +2881,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
 	to = futex_setup_timer(time, &timeout, FLAGS_CLOCKRT, 0);

 retry:
-	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE);
+	ret = get_futex_key(uaddr, flags, &q.key, FUTEX_WRITE);
 	if (unlikely(ret != 0))
 		goto out;

@@ -3000,7 +3064,7 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
 	if ((uval & FUTEX_TID_MASK) != vpid)
 		return -EPERM;

-	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE);
+	ret = get_futex_key(uaddr, flags, &key, FUTEX_WRITE);
 	if (ret)
 		return ret;

@@ -3237,7 +3301,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 	 */
 	rt_mutex_init_waiter(&rt_waiter);

-	ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
+	ret = get_futex_key(uaddr2, flags, &key2, FUTEX_WRITE);
 	if (unlikely(ret != 0))
 		goto out;

@@ -3627,6 +3691,57 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 		    cmd != FUTEX_WAIT_REQUEUE_PI)
 			return -ENOSYS;
 	}
+	if (op & FUTEX_ALL_SIZE_BITS) {
+		switch (cmd) {
+		case FUTEX_CMP_REQUEUE:
+			/*
+			 * for cmp_requeue, only uaddr has to be of the size
+			 * passed in the flags. uaddr2 can be of any size
+			 */
+		case FUTEX_WAIT:
+		case FUTEX_WAIT_BITSET:
+			switch (op & FUTEX_ALL_SIZE_BITS) {
+			case FUTEX_SIZE_8_BITS:
+				flags |= FLAGS_8_BITS;
+				break;
+			case FUTEX_SIZE_16_BITS:
+				flags |= FLAGS_16_BITS;
+				break;
+			case FUTEX_SIZE_32_BITS:
+				break;
+			default:
+				/*
+				 * if both bits are set we could silently treat
+				 * that as a 32 bit futex, but we may want to
+				 * use this pattern in the future to indicate a
+				 * 64 bit futex or an arbitrarily sized futex.
+				 * so we don't want code relying on this yet.
+				 */
+				return -EINVAL;
+			}
+			break;
+		case FUTEX_WAKE:
+		case FUTEX_REQUEUE:
+			/*
+			 * these instructions work with sized mutexes, but you
+			 * don't need to pass the size. we could silently
+			 * ignore the size argument, but the code won't verify
+			 * that the correct size is used, so it's preferable
+			 * to make that clear to the caller.
+			 *
+			 * for requeue the meaning would also be ambiguous: do
+			 * both of them have to be the same size or not? they
+			 * don't, and that's clearer when you just don't pass
+			 * a size argument.
+			 */
+			return -EINVAL;
+		default:
+			/*
+			 * all other cases are not supported yet
+			 */
+			return -EINVAL;
+		}
+	}

 	switch (cmd) {
 	case FUTEX_LOCK_PI:
--
2.17.1


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH] futex: Support smaller futexes of one byte or two byte size.
  2019-12-04 23:52 [PATCH] futex: Support smaller futexes of one byte or two byte size Malte Skarupke
@ 2019-12-06 15:31 ` Peter Zijlstra
  2019-12-06 17:37   ` Peter Zijlstra
  2019-12-20 19:08 ` Malte Skarupke
  1 sibling, 1 reply; 11+ messages in thread
From: Peter Zijlstra @ 2019-12-06 15:31 UTC (permalink / raw)
  To: Malte Skarupke; +Cc: tglx, mingo, dvhart, linux-kernel, malteskarupke

On Wed, Dec 04, 2019 at 06:52:38PM -0500, Malte Skarupke wrote:
> Two reasons for adding this:
> 1. People often want small mutexes. Having mutexes that are only one byte
> in size was the first reason WebKit mentioned for re-implementing futexes
> in a library:
> https://webkit.org/blog/6161/locking-in-webkit/

They have the best locking namespace _ever_!! Most impressed with them
for getting away with that.

> I had to change where we store two flags that were previously stored in
> the low bits of the address, since the address can no longer be assumed
> to be aligned on four bytes. Luckily the high bits were free as long as
> PAGE_SIZE is never more than 1 gigabyte.

For now it seems unlikely to run with 1G base pages :-)

> The reason for only supporting u8 and u16 is that those were easy to
> support with the existing interface. 64 bit futexes would require bigger
> changes. 

Might make sense to enumerate the issues you've found that stand in the
way of 64bit futexes. But yes, I can think of a few.


> @@ -467,7 +469,14 @@ static void drop_futex_key_refs(union futex_key *key)
> 
>  enum futex_access {

I prefer FUTEX_NONE, to match PROT_NONE.

>  	FUTEX_READ,
> -	FUTEX_WRITE
> +	FUTEX_WRITE,
> +	/*
> +	 * for operations that only need the address and don't touch the
> +	 * memory, like FUTEX_WAKE or FUTEX_REQUEUE. (not FUTEX_CMP_REQUEUE)
> +	 * this will skip the size checks of the futex, allowing those
> +	 * operations to be used with futexes of any size.
> +	 */
> +	FUTEX_NO_READ_WRITE
>  };
> 
>  /**

> @@ -520,25 +529,37 @@ futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
>   * lock_page() might sleep, the caller should not hold a spinlock.
>   */
>  static int
> -get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, enum futex_access rw)
> +get_futex_key(u32 __user *uaddr, int flags, union futex_key *key, enum futex_access rw)
>  {
>  	unsigned long address = (unsigned long)uaddr;
>  	struct mm_struct *mm = current->mm;
>  	struct page *page, *tail;
>  	struct address_space *mapping;
> -	int err, ro = 0;
> +	int err, fshared, ro = 0;
> +
> +	fshared = flags & FLAGS_SHARED;
> 
>  	/*
>  	 * The futex address must be "naturally" aligned.
>  	 */
> +	if (flags & FLAGS_8_BITS || rw == FUTEX_NO_READ_WRITE) {
> +		if (unlikely(!access_ok(uaddr, sizeof(u8))))
> +			return -EFAULT;
> +	} else if (flags & FLAGS_16_BITS) {
> +		if (unlikely((address % sizeof(u16)) != 0))
> +			return -EINVAL;
> +		if (unlikely(!access_ok(uaddr, sizeof(u16))))
> +			return -EFAULT;
> +	} else {
> +		if (unlikely((address % sizeof(u32)) != 0))
> +			return -EINVAL;
> +		if (unlikely(!access_ok(uaddr, sizeof(u32))))
> +			return -EFAULT;
> +	}

That might be better written like:

	if (rw != FUTEX_NONE)
		size = futex_size(flags);
	else
		size = 1;

	if (unlikely((address % size) != 0))
		return -EINVAL;
	if (unlikely(!access_ok(uaddr, size)))
		return -EFAULT;


> @@ -799,6 +820,48 @@ static int get_futex_value_locked(u32 *dest, u32 __user *from)
>  	return ret ? -EFAULT : 0;
>  }
> 
> +static int
> +get_futex_value_locked_any_size(u32 *dest, u32 __user *from, int flags)
> +{
> +	int ret;
> +	u8 dest_8_bits;
> +	u16 dest_16_bits;
> +
> +	pagefault_disable();
> +	if (flags & FLAGS_8_BITS) {
> +		ret = __get_user(dest_8_bits, (u8 __user *)from);
> +		*dest = dest_8_bits;
> +	} else if (flags & FLAGS_16_BITS) {
> +		ret = __get_user(dest_16_bits, (u16 __user *)from);
> +		*dest = dest_16_bits;
> +	} else {
> +		ret = __get_user(*dest, from);
> +	}
> +	pagefault_enable();
> +
> +	return ret ? -EFAULT : 0;
> +}
> +
> +static int get_futex_value_any_size(u32 *dest, u32 __user *from, int flags)
> +{
> +	int ret;
> +	u8 uval_8_bits;
> +	u16 uval_16_bits;
> +
> +	if (flags & FLAGS_8_BITS) {
> +		ret = get_user(uval_8_bits, (u8 __user *)from);
> +		if (ret == 0)
> +			*dest = uval_8_bits;
> +	} else if (flags & FLAGS_16_BITS) {
> +		ret = get_user(uval_16_bits, (u16 __user *)from);
> +		if (ret == 0)
> +			*dest = uval_16_bits;
> +	} else {
> +		ret = get_user(*dest, from);
> +	}
> +	return ret;
> +}

Perhaps write that like:

static inline int __get_futex_value(u32 *dest, u32 __user *from, int flags)
{
	u32 val = 0;
	int ret;

	switch (futex_size(flags)) {
	case 1:
		/*
		 * afaict __get_user() doesn't care about the type of
		 * the first argument
		 */
		ret = __get_user(val, (u8 __user *)from);
		break;

	case 2:
		....

	case 4:

	}

	if (!ret)
		*dest = val;

	return ret;
}

static inline int get_futex_value(...)
{
	do uaccess check
	return __get_futex_value();
}

static inline int get_futex_value_locked(...)
{
	int ret;

	pagefault_disable();
	ret = __get_futex_value(...);
	pagefault_enable();

	return ret;
}

>  /*
>   * PI code:

> +	if (op & FUTEX_ALL_SIZE_BITS) {
> +		switch (cmd) {
> +		case FUTEX_CMP_REQUEUE:
> +			/*
> +			 * for cmp_requeue, only uaddr has to be of the size
> +			 * passed in the flags. uaddr2 can be of any size
> +			 */
> +		case FUTEX_WAIT:
> +		case FUTEX_WAIT_BITSET:
> +			switch (op & FUTEX_ALL_SIZE_BITS) {
> +			case FUTEX_SIZE_8_BITS:
> +				flags |= FLAGS_8_BITS;
> +				break;
> +			case FUTEX_SIZE_16_BITS:
> +				flags |= FLAGS_16_BITS;
> +				break;
> +			case FUTEX_SIZE_32_BITS:
> +				break;
> +			default:
> +				/*
> +				 * if both bits are set we could silently treat
> +				 * that as a 32 bit futex, but we may want to
> +				 * use this pattern in the future to indicate a
> +				 * 64 bit futex or an arbitrarily sized futex.
> +				 * so we don't want code relying on this yet.
> +				 */
> +				return -EINVAL;
> +			}
> +			break;
> +		case FUTEX_WAKE:
> +		case FUTEX_REQUEUE:
> +			/*
> +			 * these instructions work with sized mutexes, but you
> +			 * don't need to pass the size. we could silently
> +			 * ignore the size argument, but the code won't verify
> +			 * that the correct size is used, so it's preferable
> +			 * to make that clear to the caller.
> +			 *
> +			 * for requeue the meaning would also be ambiguous: do
> +			 * both of them have to be the same size or not? they
> +			 * don't, and that's clearer when you just don't pass
> +			 * a size argument.
> +			 */
> +			return -EINVAL;

Took me a while to figure out this relies on FUTEX_NONE to avoid the
alignment tests.

> +		default:
> +			/*
> +			 * all other cases are not supported yet
> +			 */
> +			return -EINVAL;
> +		}
> +	}
> 
>  	switch (cmd) {
>  	case FUTEX_LOCK_PI:
> --
> 2.17.1
> 

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] futex: Support smaller futexes of one byte or two byte size.
  2019-12-06 15:31 ` Peter Zijlstra
@ 2019-12-06 17:37   ` Peter Zijlstra
  2019-12-08 22:18     ` Malte Skarupke
  0 siblings, 1 reply; 11+ messages in thread
From: Peter Zijlstra @ 2019-12-06 17:37 UTC (permalink / raw)
  To: Malte Skarupke; +Cc: tglx, mingo, dvhart, linux-kernel, malteskarupke

On Fri, Dec 06, 2019 at 04:31:29PM +0100, Peter Zijlstra wrote:
> > +		case FUTEX_WAKE:
> > +		case FUTEX_REQUEUE:
> > +			/*
> > +			 * these instructions work with sized mutexes, but you
> > +			 * don't need to pass the size. we could silently
> > +			 * ignore the size argument, but the code won't verify
> > +			 * that the correct size is used, so it's preferable
> > +			 * to make that clear to the caller.
> > +			 *
> > +			 * for requeue the meaning would also be ambiguous: do
> > +			 * both of them have to be the same size or not? they
> > +			 * don't, and that's clearer when you just don't pass
> > +			 * a size argument.
> > +			 */
> > +			return -EINVAL;
> 
> Took me a while to figure out this relies on FUTEX_NONE to avoid the
> alignment tests.

And thikning more on that, I really _realy_ hate this.

You're basically saying WAKE is size-less, but that means we must
consider what it means to have a u32 WAIT on @ptr, and a u8 WAKE on
@ptr+1. If the wake really is size-less that should match.

I'd be much happier with requiring strict sizing. Because conversely,
what happens if you have a u32-WAIT at @ptr paired with a u8-WAKE at
@ptr? If we demand strict size we can say that should not match. This
does however mean we should include the size in the hash-match function.

Your Changelog did not consider these implications at all.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] futex: Support smaller futexes of one byte or two byte size.
  2019-12-06 17:37   ` Peter Zijlstra
@ 2019-12-08 22:18     ` Malte Skarupke
  2019-12-11 13:44       ` Peter Zijlstra
  0 siblings, 1 reply; 11+ messages in thread
From: Malte Skarupke @ 2019-12-08 22:18 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: tglx, mingo, dvhart, linux-kernel, malteskarupke

In my first version WAKE also took a size parameter, however I chose to not send that version because there are two problems with that. You already noticed the first problem: we would have to store the size and verify that it matches on WAKE.

The second problem is with REQUEUE and CMP_REQUEUE: The usual case there will be that you have a mutex that's 8 bits in size, but the condition variable is harder to fit into 8 bits and will be larger. So you would need to pass in two different size flags. But REQUEUE doesn't actually care what the size is at all. It just moves waiters around. CMP_REQUEUE does care, but only for one of the arguments. So it works out perfectly that there is one flag for the size.

Those two cases really helped me clarify what the size argument actually is needed for. It's only needed for the "compare" part of CMP_REQUEUE. Similarly WAIT could have also been named CMP_WAIT and if it was named that, it would also be obvious that the size argument is only needed for the "compare" part of WAIT. All the other work that WAIT does doesn't actually care about the memory behind the pointer at all. It only cares about the address.

That also illustrates what should happen if we splice a futex and call WAIT on @ptr and WAKE on @ptr+1: If I understand you correctly, (and correct me if I'm wrong here) your point is that there would be an inconsistency in the API there. You say it would be inconsistent for a size-less WAKE to not wake a futex that it's pointing in the middle of.

But with the above thoughts, we realize that since those are two different addresses, they refer to two different futexes. It doesn't matter that the WAIT was for a four byte futex. That information was only relevant for the "compare" part of WAIT. It's not relevant for anything else, and therefore it's not relevant for the identity of the futex. So the fact that splicing a futex doesn't work (and can't easily be made to work) does not point at an inconsistency in the API.

Taking all that into account, I believe there are two possible consistent implementations for sized futexes. One of them is the one you're asking for, where the size is always passed and always verified to be correct. The other one is the one I'm proposing, where the size argument only applies to the "compare" part of WAIT and CMP_REQUEUE, and all the other work of futexes is size-less and only works on the address. (and I think similar reasoning will work for the operations that are not supported yet)

I believe that between those two consistent implementations, the one with size-less WAKE and REQUEUE is preferable. REQUEUE in particular makes clear how we really don't care about the size in these operations. There is no difference in behavior when moving between futexes of different sizes or the same size. It just doesn't matter. But if REQUEUE is size-less, it would be inconsistent for WAKE to require a size since REQUEUE is just a WAKE with extra features.

The other downside of the version that checks the size is that we, well, have to check the size. That's extra work we have to do and extra data we have to store, and I can't come up with any case where a user would actually benefit from us doing that extra work.

All that being said I agree with your other comments (renaming FUTEX_NO_READ_WRITE to FUTEX_NONE, and introducing a futex_size() function to simplify some of the code). I'll change the code and send a new patch.

Meanwhile let me know what you would like to do about passing a size to WAKE or not.

--
  Malte Skarupke
  malteskarupke@web.de

Am Fr, 6. Dez 2019, um 12:37, schrieb Peter Zijlstra:
> On Fri, Dec 06, 2019 at 04:31:29PM +0100, Peter Zijlstra wrote:
> > > +		case FUTEX_WAKE:
> > > +		case FUTEX_REQUEUE:
> > > +			/*
> > > +			 * these instructions work with sized mutexes, but you
> > > +			 * don't need to pass the size. we could silently
> > > +			 * ignore the size argument, but the code won't verify
> > > +			 * that the correct size is used, so it's preferable
> > > +			 * to make that clear to the caller.
> > > +			 *
> > > +			 * for requeue the meaning would also be ambiguous: do
> > > +			 * both of them have to be the same size or not? they
> > > +			 * don't, and that's clearer when you just don't pass
> > > +			 * a size argument.
> > > +			 */
> > > +			return -EINVAL;
> >
> > Took me a while to figure out this relies on FUTEX_NONE to avoid the
> > alignment tests.
>
> And thikning more on that, I really _realy_ hate this.
>
> You're basically saying WAKE is size-less, but that means we must
> consider what it means to have a u32 WAIT on @ptr, and a u8 WAKE on
> @ptr+1. If the wake really is size-less that should match.
>
> I'd be much happier with requiring strict sizing. Because conversely,
> what happens if you have a u32-WAIT at @ptr paired with a u8-WAKE at
> @ptr? If we demand strict size we can say that should not match. This
> does however mean we should include the size in the hash-match function.
>
> Your Changelog did not consider these implications at all.
>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] futex: Support smaller futexes of one byte or two byte size.
  2019-12-08 22:18     ` Malte Skarupke
@ 2019-12-11 13:44       ` Peter Zijlstra
  2019-12-11 19:48         ` Malte Skarupke
  0 siblings, 1 reply; 11+ messages in thread
From: Peter Zijlstra @ 2019-12-11 13:44 UTC (permalink / raw)
  To: Malte Skarupke; +Cc: tglx, mingo, dvhart, linux-kernel, malteskarupke


Please read:

  https://people.kernel.org/tglx/notes-about-netiquette-qw89

On Sun, Dec 08, 2019 at 05:18:12PM -0500, Malte Skarupke wrote:
> In my first version WAKE also took a size parameter, however I chose
> to not send that version because there are two problems with that. You
> already noticed the first problem: we would have to store the size and
> verify that it matches on WAKE.
> 
> The second problem is with REQUEUE and CMP_REQUEUE: The usual case
> there will be that you have a mutex that's 8 bits in size, but the
> condition variable is harder to fit into 8 bits and will be larger. So
> you would need to pass in two different size flags. But REQUEUE
> doesn't actually care what the size is at all. It just moves waiters
> around. CMP_REQUEUE does care, but only for one of the arguments. So
> it works out perfectly that there is one flag for the size.
> 
> Those two cases really helped me clarify what the size argument
> actually is needed for. It's only needed for the "compare" part of
> CMP_REQUEUE. Similarly WAIT could have also been named CMP_WAIT and if
> it was named that, it would also be obvious that the size argument is
> only needed for the "compare" part of WAIT. All the other work that
> WAIT does doesn't actually care about the memory behind the pointer at
> all. It only cares about the address.
> 
> That also illustrates what should happen if we splice a futex and call
> WAIT on @ptr and WAKE on @ptr+1: If I understand you correctly, (and
> correct me if I'm wrong here) your point is that there would be an
> inconsistency in the API there. You say it would be inconsistent for a
> size-less WAKE to not wake a futex that it's pointing in the middle
> of.
> 
> But with the above thoughts, we realize that since those are two
> different addresses, they refer to two different futexes. It doesn't
> matter that the WAIT was for a four byte futex. That information was
> only relevant for the "compare" part of WAIT. It's not relevant for
> anything else, and therefore it's not relevant for the identity of the
> futex. So the fact that splicing a futex doesn't work (and can't
> easily be made to work) does not point at an inconsistency in the API.

See, I disagree. The WAIT/WAKE pair is an implementation of the MONITOR
pattern. Similar to x86's MONITOR/MWAIT or ARMv8's LDXR/WFE. It allows a
(task vs cpu) blocking implementation of:


  WAIT:
	while (READ_ONCE(*ptr) != val)
		cpu_relax;

  WAKE:
	WRITE_ONCE(*ptr, val);


With futex that turns into:


  WAIT:
	for (;;) {
		u32 t = READ_ONCE(*ptr);
		if (t == val)
			break;
		futex(WAIT, ptr, t);
	}

  WAKE:
	WRITE_ONCE(*ptr, val);
	futex(WAKE, ptr);


IOW, each WAKE corresponds to a STORE.

Extending the above to variable width, you'll see that:

	u32 var = 0xff;
	u32 *ptr = &var, val = 0xffff;

  WAIT:
	while (READ_ONCE(*ptr) != val)
		cpu_relax();

  WAKE:
	WRITE_ONCE(*(u8*)ptr+1, 0xff); // let's assume LE

will in fact release the WAIT. IOW, all WAIT cares about is that the
value it went to sleep on has changed. It doesn't care how many bits of
the variable changed or what size store made it happen.

It also works the other way:

	u8 var = 0, val = 1;
	u8 *ptr = &var;

  WAIT:
	while (READ_ONCE(*ptr) != val)
		cpu_relax();

  WAKE:
	WRITE_ONCE(*(u32 *)((unsigned long)ptr & ~0x3), 0x01010101);

That u32 store fills our @var with 1 and the WAIT terminates.

In neither example does @ptr (necessarily) match.

> Taking all that into account, I believe there are two possible
> consistent implementations for sized futexes. One of them is the one
> you're asking for, where the size is always passed and always verified
> to be correct. The other one is the one I'm proposing, where the size
> argument only applies to the "compare" part of WAIT and CMP_REQUEUE,
> and all the other work of futexes is size-less and only works on the
> address. (and I think similar reasoning will work for the operations
> that are not supported yet)
> 
> I believe that between those two consistent implementations, the one
> with size-less WAKE and REQUEUE is preferable. REQUEUE in particular
> makes clear how we really don't care about the size in these
> operations. There is no difference in behavior when moving between
> futexes of different sizes or the same size. It just doesn't matter.
> But if REQUEUE is size-less, it would be inconsistent for WAKE to
> require a size since REQUEUE is just a WAKE with extra features.
> 
> The other downside of the version that checks the size is that we,
> well, have to check the size. That's extra work we have to do and
> extra data we have to store, and I can't come up with any case where a
> user would actually benefit from us doing that extra work.

So I'll argue that, while the pure address mode is perhaps convenient,
it is not consistent.  In particular it allows some but disallows other
mixed size overlapping operations. Consistency would either allow or
disallow all.

Yes, being strict is perhaps a little more expensive, but I so prefer
strict and narrow semantics over weird and fuzzy semantics. Also, I
don't think it is actually that much more expensive.

Specifically, as you found, we have available space in
futex_key::offset, 2 more FUT_OFF_ bits would allow encoding 4 different
sizes (1,2,4,8 bytes).

Re REQUEUE, using 4 instead of 2 extra bits in the command word would
allow encoding 2 sizes, one for uaddr and one for uaddr2.

Thomas?

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] futex: Support smaller futexes of one byte or two byte size.
  2019-12-11 13:44       ` Peter Zijlstra
@ 2019-12-11 19:48         ` Malte Skarupke
  0 siblings, 0 replies; 11+ messages in thread
From: Malte Skarupke @ 2019-12-11 19:48 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Thomas Gleixner, mingo, dvhart, linux-kernel, malteskarupke


Am Mi, 11. Dez 2019, um 08:44, schrieb Peter Zijlstra:
> See, I disagree. The WAIT/WAKE pair is an implementation of the MONITOR
> pattern. Similar to x86's MONITOR/MWAIT or ARMv8's LDXR/WFE.

That is a good way of putting it. It's not the mental model I had of futexes,
but I can see how under that mental model a WAKE that doesn't wake an
overlapping futex is inconsistent. (my mental model was a memory comparison
followed by a hashtable store. The WAKE has to pass the same hash key to
find anything)

The question of whether to pass a size to WAKE or not is not something I care
about (as I said, my first version did pass a size to WAKE) so I will change
my code to require a size in WAKE. I have my opinions about which one I prefer
but if I can't convince you then that shouldn't hold back the change. But
a sized WAKE does open up a few questions:

1. It would be easy for me to verify that sizes match when the address that's
   passed in is the same address. Meaning a WAIT with 8 bits but a WAKE with
   16 bits on the same address would return -EINVAL. But it's harder to
   validate a similar mismatch on overlapping futexes. Meaning a 32 bit WAIT
   on @ptr and a 8 bit WAKE on @ptr+1 would just return 0.

   This might theoretically lead to problems if we ever want to have the
   semantics of MONITOR/MWAIT in the future, because I'm saying that that
   last mismatch does not wake anything and is not an error. This will make
   it hard to change the semantics in the future. Are we OK with that?

2. If we are OK with that, there are questions about what to do about weird
   uses like a 8 bit WAIT and a 32 bit WAIT on the same address. An 8 bit
   WAKE on the same address should now probably just wake the 8 bit WAIT, but
   a second 8 bit WAKE would then return an error because there is a
   mismatching size. This is confusing so my first thought is to just disallow
   all mixing of sizes, even among waiters. It seems unlikely that this will
   ever be needed.

3. Implementing the full semantics of MONITOR/MWAIT would give obvious answers
   to both of these questions, and I think I could even do it without slowing
   down futexes too much. (just use the address/4 as the hash key so that all
   potentially overlapping futexes hash to the same bucket) But it would also
   open up new questions. Like what happens if you call a 32 bit WAKE with
   val=1, meaning wake up one sleeper, but it overlaps with two 16 bit futexes?
   Does it wake one sleeper in both of them? That would be consistent with
   MONITOR/MWAIT behavior, but it would contradict the current documentation.
   (which might be OK because it would only do so for new behavior)


My current thinking for these is as follows, in reverse order:

3. I would not implement the full MONITOR/MWAIT behavior. It's not required
for my use case, it would definitely add a small amount of overhead, and it
bothers me that adjacent 8 bit futexes would hash to the same bucket. (at
least they would if I can't come up with another way of implementing it) All
I want is smaller mutexes and the behavior required by the C++ standard, so I
don't need new semantics for that.

2. I will disallow all operations that pass a different size for the same
address. That includes disallowing an 8 bit WAIT followed by a 32 bit WAIT on
the same address.

1. I will only verify that the size matches if the same address was passed.
Overlapping futexes are not detected and will not raise an error.

This will lead to an inconsistent implementation in the MONITOR/MWAIT mental
model, but it leads to a consistent implementation in the comparison+hashtable
mental model. It will have the same behavior as my initial patch except that
all operations will error out if a size mismatch is detected.

Please let me know if this is OK with you. I don't think that this gains us
anything over having an unsized WAKE, but if this is what others want, I can
write the code. If we instead want the full MONITOR/MWAIT semantics, I can
probably also make that happen with a very small slowdown compared to the
current code. I just need to know if that's what you want.

Thanks,

Malte

--
  Malte Skarupke
  malteskarupke@web.de

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH] futex: Support smaller futexes of one byte or two byte size.
  2019-12-04 23:52 [PATCH] futex: Support smaller futexes of one byte or two byte size Malte Skarupke
  2019-12-06 15:31 ` Peter Zijlstra
@ 2019-12-20 19:08 ` Malte Skarupke
  2019-12-20 19:09   ` Malte Skarupke
  1 sibling, 1 reply; 11+ messages in thread
From: Malte Skarupke @ 2019-12-20 19:08 UTC (permalink / raw)
  To: tglx, mingo; +Cc: peterz, dvhart, linux-kernel, malteskarupke, malteskarupke

Hello again,

here are two patches for sized futexes. Option 1 is the same behavior as my
first patch, just with the fixes suggested above. Option 2 requires a size
flag for WAKE and REQUEUE.

The reason for sending two changelists is that after thinking about it for a
while, and after implementing both options, I realized that I do care which
option we pick. I still prefer option 1. I would also be OK with going with
option 2, but let me try one final time to convince you with three reasons:

1. The consistent API conversation. There were two reasons voiced for this:
a) In a private email Thomas Gleixner voiced that just having two different
   calls, WAIT and WAKE, where one takes the size and the other does not,
   would be inconsistent. I would like to think about this in terms of another
   popular API where one takes a size and the other does not: malloc() and
   free(). Over the years many allocator implementers would have hoped that
   free() also takes a size because it can save a few instructions, and the
   user usually knows what they are freeing and could pass in the size. But
   what if there was a version of free() that doesn't need the size, but it
   still takes a size argument and only uses it to verify that the correct
   size was passed in: If you pass an incorrect size it errors out and doesn't
   actually free the memory. Would that be a better interface?

   Because that's essentially what I'm implementing here: WAKE doesn't need
   the size, and the only thing it uses the size for is to verify that it is
   correct, and if it's not, WAKE errors out and doesn't actually wake. I
   don't think that that makes it a better API.

b) On this mailing list Peter Zijlstra voiced that my implementation does not
   implement the MONITOR/MWAIT pattern. This wasn't a problem when there was
   only one size of futex, but it is once you can have overlapping futexes of
   different sizes. I can see that it might be nice to implement this pattern,
   but I chose to not do it mainly because mutexes and condition variables
   don't actually need it. They just need the same semantics we had before,
   except for smaller types. There is no need to handle overlapping futexes,
   so I propose a different mental model: Comparisons+hashtable lookup. Under
   that mental model only an exact match on the address between WAIT and WAKE
   will do anything, because otherwise you won't find anything in the
   hashtable. And under that mental model the size only applies to the
   comparison. That's what I implemented and what I prefer. Option 1 is a more
   internally consistent implementation in that mental model.

2. Precedent. When Windows added support for futexes, they supported different
   sizes of futexes from the beginning. In their API WaitOnAddress() is the
   equivalent to our FUTEX_WAIT, and WakeByAddressSingle() and
   WakeByAddressAll() are the equivalents to our FUTEX_WAKE. In the Windows
   API WaitOnAddress() takes a size, but the WakeByAddress calls do not take a
   size. Obviously there may be reasons to do things differently from how
   Windows does it, but it does show that at least some other programmers came
   to the same conclusions that I did.

3. Odd behavior. I made one change compared to what I wrote earlier in the
   discussion. Earlier I said that calling WAIT with two different sizes on
   the same address should probably not be allowed. The reason for that is
   that it can lead to unexpected behavior. If you do a 8 bit WAIT followed by
   a 16 bit WAIT on the same address, then call 8 bit WAKE on the same address
   twice, the first WAKE call will work and return 1, but the second WAKE call
   will return -EINVAL because of a size mismatch.

   I thought that that's just a bit too weird so I said that I wouldn't allow
   WAITs with different sizes. The reason why I changed my mind on that is
   that I would have to change what WAIT does. Instead of just inserting
   itself into a linked list, it would have to iterate through that linked
   list to check if anybody else already went to sleep with a different size.
   So a O(1) operation would turn into a O(n) operation. Since there can be
   condition variables where many threads sleep on the same futex, this would
   actually be fairly common. So I thought that that would be an unacceptable
   change in behavior. (and a similar change would have to be made to REQUEUE)
   So this does mean that in the version where WAKE verifies the size, you can
   get weird behavior if WAIT was called with two different sizes. (or maybe
   if somebody wasn't consistent with their usage of the flags in REQUEUE) I
   don't think this is a huge concern because I don't expect people to have
   different sized futexes on the same address, but it is a weird API quirk.
   If this does cause bugs, they should also be fairly obvious and easy to
   fix. But they would be bugs that you couldn't have with option 1.

So now that I have tried to convince you one final time, you're free to choose
whichever option that you like. And obviously if you have more feedback for
the code, I would also implement those fixes. Also if you want a patch without
the "option 1" or "option 2" text in the description, I can provide that.

Thanks,

Malte




^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH] futex: Support smaller futexes of one byte or two byte size.
  2019-12-20 19:08 ` Malte Skarupke
@ 2019-12-20 19:09   ` Malte Skarupke
  2019-12-20 19:09     ` Malte Skarupke
  0 siblings, 1 reply; 11+ messages in thread
From: Malte Skarupke @ 2019-12-20 19:09 UTC (permalink / raw)
  To: tglx, mingo; +Cc: peterz, dvhart, linux-kernel, malteskarupke, malteskarupke

This is option 1 out of 2. In both versions only the operations WAIT, WAKE,
REQUEUE and CMP_REQUEUE are supported. In this version WAIT and CMP_REQUEUE
take a size argument, WAKE and REQUEUE do not and work on futexes of all
sizes. The other option requires a size for those operations, too. See the
mailing list for discussions as to which option we should use.

None of these operations handle overlapping futexes. The mental model is
comparison+hashtable lookup and only an exact match on the address will find
the matching futex between WAIT and WAKE. An alternative would have been the
MONITOR/MWAIT mental model to handle overlapping futexes of different sizes,
but for mutexes and condition variables I just needed the normal futex
behavior. (except with smaller types)

There are two reasons for adding smaller futexes:
1. People often want smaller mutexes. Having mutexes that are only one byte in
size was the first reason WebKit gave for re-implementing futexes:
https://webkit.org/blog/6161/locking-in-webkit/

2. The C++ standard added futexes to the standard library in C++20 under the
name atomic_wait and atomic_notify:
http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2019/p1135r5.html
The C++ version supports this for atomic variables of any size. The more sizes
we can support, the better the standard implementation can be on Linux.

I changed the location of two flags that used to be stored in the low bits of
the address, since the address is no longer aligned on four bytes. Luckily the
high bits were free as long as PAGE_SIZE is never more than 1 gigabyte.

The reason for only supporting 8 bit futexes and 16 bit futexes is that those
were easy to support with the existing interface. 64 bit futexes would require
bigger changes.

The reason for only supporting WAIT/WAKE and REQUEUE/CMP_REQUEUE is that those
are the only operations I need right now. (in fact the C++ standard only needs
WAIT/WAKE) The other operations are more complicated and will have to wait.

Signed-off-by: Malte Skarupke <malteskarupke@web.de>
---
 include/linux/futex.h      |  10 ++-
 include/uapi/linux/futex.h |   7 +-
 kernel/futex.c             | 155 +++++++++++++++++++++++++++++++------
 3 files changed, 145 insertions(+), 27 deletions(-)

diff --git a/include/linux/futex.h b/include/linux/futex.h
index 5cc3fed27d4c..3655e026c914 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -16,8 +16,8 @@ struct task_struct;
  * The key type depends on whether it's a shared or private mapping.
  * Don't rearrange members without looking at hash_futex().
  *
- * offset is aligned to a multiple of sizeof(u32) (== 4) by definition.
- * We use the two low order bits of offset to tell what is the kind of key :
+ * offset is the position within a page and is in the range [0, PAGE_SIZE).
+ * The high bits of offset indicate what kind of key this is :
  *  00 : Private process futex (PTHREAD_PROCESS_PRIVATE)
  *       (no reference on an inode or mm)
  *  01 : Shared futex (PTHREAD_PROCESS_SHARED)
@@ -26,8 +26,10 @@ struct task_struct;
  *       (but private mapping on an mm, and reference taken on it)
 */

-#define FUT_OFF_INODE    1 /* We set bit 0 if key has a reference on inode */
-#define FUT_OFF_MMSHARED 2 /* We set bit 1 if key has a reference on mm */
+/* We set the next highest bit if key has a reference on inode */
+#define FUT_OFF_INODE		PAGE_SIZE
+/* We set the bit after that if key has a reference on mm */
+#define FUT_OFF_MMSHARED	(PAGE_SIZE << 1)

 union futex_key {
 	struct {
diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h
index a89eb0accd5e..1121603c5b64 100644
--- a/include/uapi/linux/futex.h
+++ b/include/uapi/linux/futex.h
@@ -24,7 +24,12 @@

 #define FUTEX_PRIVATE_FLAG	128
 #define FUTEX_CLOCK_REALTIME	256
-#define FUTEX_CMD_MASK		~(FUTEX_PRIVATE_FLAG | FUTEX_CLOCK_REALTIME)
+#define FUTEX_SIZE_8_BITS	512
+#define FUTEX_SIZE_16_BITS	1024
+#define FUTEX_SIZE_32_BITS	0
+#define FUTEX_ALL_SIZE_BITS	(FUTEX_SIZE_8_BITS | FUTEX_SIZE_16_BITS)
+#define FUTEX_CMD_MASK		~(FUTEX_PRIVATE_FLAG | FUTEX_CLOCK_REALTIME | \
+				  FUTEX_ALL_SIZE_BITS)

 #define FUTEX_WAIT_PRIVATE	(FUTEX_WAIT | FUTEX_PRIVATE_FLAG)
 #define FUTEX_WAKE_PRIVATE	(FUTEX_WAKE | FUTEX_PRIVATE_FLAG)
diff --git a/kernel/futex.c b/kernel/futex.c
index 03c518e9747e..0e55295fa23e 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -183,6 +183,9 @@ static int  __read_mostly futex_cmpxchg_enabled;
 #endif
 #define FLAGS_CLOCKRT		0x02
 #define FLAGS_HAS_TIMEOUT	0x04
+#define FLAGS_8_BITS		0x08
+#define FLAGS_16_BITS		0x10
+#define FLAGS_SIZE_BITS		(FLAGS_8_BITS | FLAGS_16_BITS)

 /*
  * Priority Inheritance state:
@@ -473,7 +476,14 @@ static void drop_futex_key_refs(union futex_key *key)

 enum futex_access {
 	FUTEX_READ,
-	FUTEX_WRITE
+	FUTEX_WRITE,
+	/*
+	 * for operations that only need the address and don't touch the
+	 * memory, like FUTEX_WAKE or FUTEX_REQUEUE. (not FUTEX_CMP_REQUEUE)
+	 * this will skip the size checks of the futex, allowing those
+	 * operations to be used with futexes of any size.
+	 */
+	FUTEX_NONE
 };

 /**
@@ -505,10 +515,20 @@ futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
 	return timeout;
 }

+static inline int futex_size(int flags)
+{
+	if (flags & FLAGS_8_BITS)
+		return sizeof(u8);
+	else if (flags & FLAGS_16_BITS)
+		return sizeof(u16);
+	else
+		return sizeof(u32);
+}
+
 /**
  * get_futex_key() - Get parameters which are the keys for a futex
  * @uaddr:	virtual address of the futex
- * @fshared:	0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
+ * @flags:	futex flags (FLAGS_SHARED, FLAGS_8_BITS etc.)
  * @key:	address where result is stored.
  * @rw:		mapping needs to be read/write (values: FUTEX_READ,
  *              FUTEX_WRITE)
@@ -524,23 +544,29 @@ futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
  * lock_page() might sleep, the caller should not hold a spinlock.
  */
 static int
-get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, enum futex_access rw)
+get_futex_key(u32 __user *uaddr, int flags, union futex_key *key, enum futex_access rw)
 {
 	unsigned long address = (unsigned long)uaddr;
 	struct mm_struct *mm = current->mm;
 	struct page *page, *tail;
 	struct address_space *mapping;
-	int err, ro = 0;
+	int err, fshared, size, ro = 0;
+
+	fshared = flags & FLAGS_SHARED;
+	if (rw == FUTEX_NONE)
+		size = 1;
+	else
+		size = futex_size(flags);

 	/*
 	 * The futex address must be "naturally" aligned.
 	 */
 	key->both.offset = address % PAGE_SIZE;
-	if (unlikely((address % sizeof(u32)) != 0))
+	if (unlikely((address & (size - 1)) != 0))
 		return -EINVAL;
 	address -= key->both.offset;

-	if (unlikely(!access_ok(uaddr, sizeof(u32))))
+	if (unlikely(!access_ok(uaddr, size)))
 		return -EFAULT;

 	if (unlikely(should_fail_futex(fshared)))
@@ -570,7 +596,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, enum futex_a
 	 * If write access is not required (eg. FUTEX_WAIT), try
 	 * and get read-only access.
 	 */
-	if (err == -EFAULT && rw == FUTEX_READ) {
+	if (err == -EFAULT && rw != FUTEX_WRITE) {
 		err = get_user_pages_fast(address, 1, 0, &page);
 		ro = 1;
 	}
@@ -792,6 +818,18 @@ static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr,
 	return ret;
 }

+static inline int __get_sized_futex_value(u32 *dest, u32 __user *from, int size)
+{
+	switch (size) {
+	case 1:
+		return __get_user(*dest, (u8 __user *)from);
+	case 2:
+		return __get_user(*dest, (u16 __user *)from);
+	default:
+		return __get_user(*dest, from);
+	}
+}
+
 static int get_futex_value_locked(u32 *dest, u32 __user *from)
 {
 	int ret;
@@ -803,6 +841,26 @@ static int get_futex_value_locked(u32 *dest, u32 __user *from)
 	return ret ? -EFAULT : 0;
 }

+static int get_sized_futex_value_locked(u32 *dest, u32 __user *from, int size)
+{
+	int ret;
+
+	pagefault_disable();
+	ret = __get_sized_futex_value(dest, from, size);
+	pagefault_enable();
+
+	return ret ? -EFAULT : 0;
+}
+
+static int get_sized_futex_value(u32 *dest, u32 __user *from, int size)
+{
+	might_fault();
+	if (access_ok(from, size))
+		return __get_sized_futex_value(dest, from, size);
+	else
+		return -EFAULT;
+}
+

 /*
  * PI code:
@@ -1663,7 +1721,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
 	if (!bitset)
 		return -EINVAL;

-	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ);
+	ret = get_futex_key(uaddr, flags, &key, FUTEX_NONE);
 	if (unlikely(ret != 0))
 		goto out;

@@ -1762,10 +1820,10 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
 	DEFINE_WAKE_Q(wake_q);

 retry:
-	ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
+	ret = get_futex_key(uaddr1, flags, &key1, FUTEX_READ);
 	if (unlikely(ret != 0))
 		goto out;
-	ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
+	ret = get_futex_key(uaddr2, flags, &key2, FUTEX_WRITE);
 	if (unlikely(ret != 0))
 		goto out_put_key1;

@@ -2047,11 +2105,12 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
 	}

 retry:
-	ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
+	ret = get_futex_key(uaddr1, flags, &key1,
+			    cmpval ? FUTEX_READ : FUTEX_NONE);
 	if (unlikely(ret != 0))
 		goto out;
-	ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2,
-			    requeue_pi ? FUTEX_WRITE : FUTEX_READ);
+	ret = get_futex_key(uaddr2, flags, &key2,
+			    requeue_pi ? FUTEX_WRITE : FUTEX_NONE);
 	if (unlikely(ret != 0))
 		goto out_put_key1;

@@ -2073,14 +2132,15 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,

 	if (likely(cmpval != NULL)) {
 		u32 curval;
+		int size = futex_size(flags);

-		ret = get_futex_value_locked(&curval, uaddr1);
+		ret = get_sized_futex_value_locked(&curval, uaddr1, size);

 		if (unlikely(ret)) {
 			double_unlock_hb(hb1, hb2);
 			hb_waiters_dec(hb2);

-			ret = get_user(curval, uaddr1);
+			ret = get_sized_futex_value(&curval, uaddr1, size);
 			if (ret)
 				goto out_put_keys;

@@ -2727,7 +2787,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
 			   struct futex_q *q, struct futex_hash_bucket **hb)
 {
 	u32 uval;
-	int ret;
+	int ret, size = futex_size(flags);

 	/*
 	 * Access the page AFTER the hash-bucket is locked.
@@ -2748,19 +2808,19 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
 	 * while the syscall executes.
 	 */
 retry:
-	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, FUTEX_READ);
+	ret = get_futex_key(uaddr, flags, &q->key, FUTEX_READ);
 	if (unlikely(ret != 0))
 		return ret;

 retry_private:
 	*hb = queue_lock(q);

-	ret = get_futex_value_locked(&uval, uaddr);
+	ret = get_sized_futex_value_locked(&uval, uaddr, size);

 	if (ret) {
 		queue_unlock(*hb);

-		ret = get_user(uval, uaddr);
+		ret = get_sized_futex_value(&uval, uaddr, size);
 		if (ret)
 			goto out;

@@ -2893,7 +2953,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
 	to = futex_setup_timer(time, &timeout, FLAGS_CLOCKRT, 0);

 retry:
-	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE);
+	ret = get_futex_key(uaddr, flags, &q.key, FUTEX_WRITE);
 	if (unlikely(ret != 0))
 		goto out;

@@ -3084,7 +3144,7 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
 	if ((uval & FUTEX_TID_MASK) != vpid)
 		return -EPERM;

-	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE);
+	ret = get_futex_key(uaddr, flags, &key, FUTEX_WRITE);
 	if (ret)
 		return ret;

@@ -3321,7 +3381,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 	 */
 	rt_mutex_init_waiter(&rt_waiter);

-	ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
+	ret = get_futex_key(uaddr2, flags, &key2, FUTEX_WRITE);
 	if (unlikely(ret != 0))
 		goto out;

@@ -3847,6 +3907,13 @@ void futex_exit_release(struct task_struct *tsk)
 	futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
 }

+int futex_op_to_size_flags(int op)
+{
+	int size_flags = op & FUTEX_ALL_SIZE_BITS;
+
+	return (size_flags / FUTEX_SIZE_8_BITS) * FLAGS_8_BITS;
+}
+
 long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 		u32 __user *uaddr2, u32 val2, u32 val3)
 {
@@ -3862,6 +3929,50 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 		    cmd != FUTEX_WAIT_REQUEUE_PI)
 			return -ENOSYS;
 	}
+	if (op & FUTEX_ALL_SIZE_BITS) {
+		flags |= futex_op_to_size_flags(op);
+		if ((flags & FLAGS_SIZE_BITS) == FLAGS_SIZE_BITS) {
+			/*
+			 * if both bits are set we could silently treat that as
+			 * a 32 bit futex, but we may want to use this pattern
+			 * in the future to indicate a 64 bit futex or an
+			 * arbitrarily sized futex. so we don't want code
+			 * relying on this yet.
+			 */
+			return -EINVAL;
+		}
+		switch (cmd) {
+		case FUTEX_CMP_REQUEUE:
+			/*
+			 * for cmp_requeue, only uaddr has to be of the size
+			 * passed in the flags. uaddr2 can be of any size
+			 */
+		case FUTEX_WAIT:
+		case FUTEX_WAIT_BITSET:
+			break;
+		case FUTEX_WAKE:
+		case FUTEX_WAKE_BITSET:
+		case FUTEX_REQUEUE:
+			/*
+			 * these instructions work with sized futexes, but you
+			 * don't need to pass the size. we could silently
+			 * ignore the size argument, but the code won't verify
+			 * that the correct size is used, so it's preferable
+			 * to make that clear to the caller.
+			 *
+			 * for requeue the meaning would also be ambiguous: do
+			 * both of them have to be the same size or not? they
+			 * don't, and that's clearer when you just don't pass
+			 * a size argument.
+			 */
+			return -EINVAL;
+		default:
+			/*
+			 * all other cases are not supported yet
+			 */
+			return -EINVAL;
+		}
+	}

 	switch (cmd) {
 	case FUTEX_LOCK_PI:
--
2.17.1


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH] futex: Support smaller futexes of one byte or two byte size.
  2019-12-20 19:09   ` Malte Skarupke
@ 2019-12-20 19:09     ` Malte Skarupke
  2019-12-21 15:56       ` Malte Skarupke
  0 siblings, 1 reply; 11+ messages in thread
From: Malte Skarupke @ 2019-12-20 19:09 UTC (permalink / raw)
  To: tglx, mingo; +Cc: peterz, dvhart, linux-kernel, malteskarupke, malteskarupke

This is option 2 out of 2. In both versions only the operations WAIT, WAKE,
REQUEUE and CMP_REQUEUE are supported. In this version all four of those
operations take a size argument. WAKE and REQUEUE only use the size argument
to verify that they were called with the same flags as the matching WAIT. WAIT
and CMP_REQUEUE use the size argument only for the "compare" part of their
operation. The other option would not have required a size argument for WAKE
and REQUEUE. See the mailing list for discussions as to which option we
should use.

None of these operations handle overlapping futexes. The mental model is
comparison+hashtable lookup and only an exact match on the address will find
the matching futex between WAIT and WAKE. An alternative would have been the
MONITOR/MWAIT mental model to handle overlapping futexes of different sizes,
but for mutexes and condition variables I just needed the normal futex
behavior. (except with smaller types)

There are two reasons for adding smaller futexes:
1. People often want smaller mutexes. Having mutexes that are only one byte in
size was the first reason WebKit gave for re-implementing futexes:
https://webkit.org/blog/6161/locking-in-webkit/

2. The C++ standard added futexes to the standard library in C++20 under the
name atomic_wait and atomic_notify:
http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2019/p1135r5.html
The C++ version supports this for atomic variables of any size. The more sizes
we can support, the better the standard implementation can be on Linux.

I changed the location of two flags that used to be stored in the low bits of
the address, since the address is no longer aligned on four bytes. Luckily the
high bits were free as long as PAGE_SIZE is never more than 1 gigabyte.

The reason for only supporting 8 bit futexes and 16 bit futexes is that those
were easy to support with the existing interface. 64 bit futexes would require
bigger changes.

The reason for only supporting WAIT/WAKE and REQUEUE/CMP_REQUEUE is that those
are the only operations I need right now. (in fact the C++ standard only needs
WAIT/WAKE) The other operations are more complicated and will have to wait.

Signed-off-by: Malte Skarupke <malteskarupke@web.de>
---
 include/linux/futex.h      |  19 +++-
 include/uapi/linux/futex.h |  12 ++-
 kernel/futex.c             | 183 ++++++++++++++++++++++++++++++++-----
 3 files changed, 186 insertions(+), 28 deletions(-)

diff --git a/include/linux/futex.h b/include/linux/futex.h
index 5cc3fed27d4c..c05f4ad57133 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -16,18 +16,29 @@ struct task_struct;
  * The key type depends on whether it's a shared or private mapping.
  * Don't rearrange members without looking at hash_futex().
  *
- * offset is aligned to a multiple of sizeof(u32) (== 4) by definition.
- * We use the two low order bits of offset to tell what is the kind of key :
+ * offset is the position within a page and is in the range [0, PAGE_SIZE).
+ * The high bits of offset are used to store flags :
+ * The next two bits above PAGE_SIZE indicate what kind of key this is :
  *  00 : Private process futex (PTHREAD_PROCESS_PRIVATE)
  *       (no reference on an inode or mm)
  *  01 : Shared futex (PTHREAD_PROCESS_SHARED)
  *	mapped on a file (reference on the underlying inode)
  *  10 : Shared futex (PTHREAD_PROCESS_SHARED)
  *       (but private mapping on an mm, and reference taken on it)
+ * The two bits after that indicate the size of the futex :
+ *  00 : 32 bits
+ *  01 : 8 bits
+ :  10 : 16 bits
 */

-#define FUT_OFF_INODE    1 /* We set bit 0 if key has a reference on inode */
-#define FUT_OFF_MMSHARED 2 /* We set bit 1 if key has a reference on mm */
+/* We set the next highest bit if key has a reference on inode */
+#define FUT_OFF_INODE		PAGE_SIZE
+/* We set the bit after that if key has a reference on mm */
+#define FUT_OFF_MMSHARED	(PAGE_SIZE << 1)
+/* The bits after that indicate futexes of different sizes */
+#define FUT_OFF_8_BITS		(PAGE_SIZE << 2)
+#define FUT_OFF_16_BITS		(PAGE_SIZE << 3)
+#define FUT_OFF_SIZE_BITS	(FUT_OFF_8_BITS | FUT_OFF_16_BITS)

 union futex_key {
 	struct {
diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h
index a89eb0accd5e..885961388c57 100644
--- a/include/uapi/linux/futex.h
+++ b/include/uapi/linux/futex.h
@@ -24,7 +24,17 @@

 #define FUTEX_PRIVATE_FLAG	128
 #define FUTEX_CLOCK_REALTIME	256
-#define FUTEX_CMD_MASK		~(FUTEX_PRIVATE_FLAG | FUTEX_CLOCK_REALTIME)
+#define FUTEX_SIZE_8_BITS	512
+#define FUTEX_SIZE_16_BITS	1024
+#define FUTEX_SIZE_32_BITS	0
+#define FUTEX_SECOND_SIZE_8_BITS	2048
+#define FUTEX_SECOND_SIZE_16_BITS	4096
+#define FUTEX_SECOND_SIZE_32_BITS	0
+#define FUTEX_ALL_SIZE_BITS	(FUTEX_SIZE_8_BITS | FUTEX_SIZE_16_BITS | \
+				 FUTEX_SECOND_SIZE_8_BITS | \
+				 FUTEX_SECOND_SIZE_16_BITS)
+#define FUTEX_CMD_MASK		~(FUTEX_PRIVATE_FLAG | FUTEX_CLOCK_REALTIME | \
+				  FUTEX_ALL_SIZE_BITS)

 #define FUTEX_WAIT_PRIVATE	(FUTEX_WAIT | FUTEX_PRIVATE_FLAG)
 #define FUTEX_WAKE_PRIVATE	(FUTEX_WAKE | FUTEX_PRIVATE_FLAG)
diff --git a/kernel/futex.c b/kernel/futex.c
index 03c518e9747e..e2308cea7580 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -183,6 +183,14 @@ static int  __read_mostly futex_cmpxchg_enabled;
 #endif
 #define FLAGS_CLOCKRT		0x02
 #define FLAGS_HAS_TIMEOUT	0x04
+/* size flags used for uaddr1 */
+#define FLAGS_FIRST_8_BITS	0x08
+#define FLAGS_FIRST_16_BITS	0x10
+/* size flags used for uaddr2 */
+#define FLAGS_SECOND_8_BITS	0x20
+#define FLAGS_SECOND_16_BITS	0x40
+#define FLAGS_FIRST_SIZE_BITS	(FLAGS_FIRST_8_BITS | FLAGS_FIRST_16_BITS)
+#define FLAGS_SECOND_SIZE_BITS	(FLAGS_SECOND_8_BITS | FLAGS_SECOND_16_BITS)

 /*
  * Priority Inheritance state:
@@ -385,9 +393,15 @@ static inline int hb_waiters_pending(struct futex_hash_bucket *hb)
  */
 static struct futex_hash_bucket *hash_futex(union futex_key *key)
 {
+	/*
+	 * we exclude the size bits from the hash so that futexes of different
+	 * sizes hash to the same bucket. we use this to check for mismatching
+	 * size flags between WAIT and WAKE
+	 */
+	u32 hash_init = key->both.offset & ~FUT_OFF_SIZE_BITS;
 	u32 hash = jhash2((u32*)&key->both.word,
 			  (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
-			  key->both.offset);
+			  hash_init);
 	return &futex_queues[hash & (futex_hashsize - 1)];
 }

@@ -401,10 +415,19 @@ static struct futex_hash_bucket *hash_futex(union futex_key *key)
  */
 static inline int match_futex(union futex_key *key1, union futex_key *key2)
 {
+	/*
+	 * compare every member except for the size bits in offset. the size
+	 * bits are only used for verification, and for that we need to first
+	 * detect that two futexes are the same except for the size bits (which
+	 * this function does) and then check if the size bits are different.
+	 * since all other code doesn't care about the size bits, we can skip
+	 * the comparison in all other cases as well.
+	 */
 	return (key1 && key2
 		&& key1->both.word == key2->both.word
 		&& key1->both.ptr == key2->both.ptr
-		&& key1->both.offset == key2->both.offset);
+		&&   (key1->both.offset & ~FUT_OFF_SIZE_BITS)
+		  == (key2->both.offset & ~FUT_OFF_SIZE_BITS));
 }

 /*
@@ -505,10 +528,20 @@ futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
 	return timeout;
 }

+static inline int futex_size(int flags)
+{
+	if (flags & (FLAGS_FIRST_8_BITS | FLAGS_SECOND_8_BITS))
+		return sizeof(u8);
+	else if (flags & (FLAGS_FIRST_16_BITS | FLAGS_SECOND_16_BITS))
+		return sizeof(u16);
+	else
+		return sizeof(u32);
+}
+
 /**
  * get_futex_key() - Get parameters which are the keys for a futex
  * @uaddr:	virtual address of the futex
- * @fshared:	0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
+ * @flags:	futex flags (FLAGS_SHARED, FLAGS_8_BITS etc.)
  * @key:	address where result is stored.
  * @rw:		mapping needs to be read/write (values: FUTEX_READ,
  *              FUTEX_WRITE)
@@ -524,23 +557,31 @@ futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
  * lock_page() might sleep, the caller should not hold a spinlock.
  */
 static int
-get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, enum futex_access rw)
+get_futex_key(u32 __user *uaddr, int flags, union futex_key *key, enum futex_access rw)
 {
 	unsigned long address = (unsigned long)uaddr;
 	struct mm_struct *mm = current->mm;
 	struct page *page, *tail;
 	struct address_space *mapping;
-	int err, ro = 0;
+	int err, fshared, size, ro = 0;
+
+	fshared = flags & FLAGS_SHARED;
+	size = futex_size(flags);
+
+	key->both.offset = address % PAGE_SIZE;
+	if (size == sizeof(u8))
+		key->both.offset |= FUT_OFF_8_BITS;
+	else if (size == sizeof(u16))
+		key->both.offset |= FUT_OFF_16_BITS;

 	/*
 	 * The futex address must be "naturally" aligned.
 	 */
-	key->both.offset = address % PAGE_SIZE;
-	if (unlikely((address % sizeof(u32)) != 0))
+	if (unlikely((address & (size - 1)) != 0))
 		return -EINVAL;
 	address -= key->both.offset;

-	if (unlikely(!access_ok(uaddr, sizeof(u32))))
+	if (unlikely(!access_ok(uaddr, size)))
 		return -EFAULT;

 	if (unlikely(should_fail_futex(fshared)))
@@ -792,6 +833,18 @@ static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr,
 	return ret;
 }

+static inline int __get_sized_futex_value(u32 *dest, u32 __user *from, int size)
+{
+	switch (size) {
+	case 1:
+		return __get_user(*dest, (u8 __user *)from);
+	case 2:
+		return __get_user(*dest, (u16 __user *)from);
+	default:
+		return __get_user(*dest, from);
+	}
+}
+
 static int get_futex_value_locked(u32 *dest, u32 __user *from)
 {
 	int ret;
@@ -803,6 +856,26 @@ static int get_futex_value_locked(u32 *dest, u32 __user *from)
 	return ret ? -EFAULT : 0;
 }

+static int get_sized_futex_value_locked(u32 *dest, u32 __user *from, int size)
+{
+	int ret;
+
+	pagefault_disable();
+	ret = __get_sized_futex_value(dest, from, size);
+	pagefault_enable();
+
+	return ret ? -EFAULT : 0;
+}
+
+static int get_sized_futex_value(u32 *dest, u32 __user *from, int size)
+{
+	might_fault();
+	if (access_ok(from, size))
+		return __get_sized_futex_value(dest, from, size);
+	else
+		return -EFAULT;
+}
+

 /*
  * PI code:
@@ -1658,12 +1731,13 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
 	struct futex_q *this, *next;
 	union futex_key key = FUTEX_KEY_INIT;
 	int ret;
+	bool found_size_mismatch = false;
 	DEFINE_WAKE_Q(wake_q);

 	if (!bitset)
 		return -EINVAL;

-	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ);
+	ret = get_futex_key(uaddr, flags, &key, FUTEX_READ);
 	if (unlikely(ret != 0))
 		goto out;

@@ -1686,11 +1760,18 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
 			if (!(this->bitset & bitset))
 				continue;

+			if (this->key.both.offset != key.both.offset) {
+				found_size_mismatch = true;
+				continue;
+			}
+
 			mark_wake_futex(&wake_q, this);
 			if (++ret >= nr_wake)
 				break;
 		}
 	}
+	if (found_size_mismatch && ret == 0)
+		ret = -EINVAL;

 	spin_unlock(&hb->lock);
 	wake_up_q(&wake_q);
@@ -1762,10 +1843,10 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
 	DEFINE_WAKE_Q(wake_q);

 retry:
-	ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
+	ret = get_futex_key(uaddr1, flags, &key1, FUTEX_READ);
 	if (unlikely(ret != 0))
 		goto out;
-	ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
+	ret = get_futex_key(uaddr2, flags, &key2, FUTEX_WRITE);
 	if (unlikely(ret != 0))
 		goto out_put_key1;

@@ -2001,6 +2082,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
 {
 	union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
 	int drop_count = 0, task_count = 0, ret;
+	bool found_size_mismatch = false;
 	struct futex_pi_state *pi_state = NULL;
 	struct futex_hash_bucket *hb1, *hb2;
 	struct futex_q *this, *next;
@@ -2047,11 +2129,12 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
 	}

 retry:
-	ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
+	ret = get_futex_key(uaddr1, flags & ~FLAGS_SECOND_SIZE_BITS,
+			    &key1, FUTEX_READ);
 	if (unlikely(ret != 0))
 		goto out;
-	ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2,
-			    requeue_pi ? FUTEX_WRITE : FUTEX_READ);
+	ret = get_futex_key(uaddr2, flags & ~FLAGS_FIRST_SIZE_BITS,
+			    &key2, requeue_pi ? FUTEX_WRITE : FUTEX_READ);
 	if (unlikely(ret != 0))
 		goto out_put_key1;

@@ -2073,14 +2156,15 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,

 	if (likely(cmpval != NULL)) {
 		u32 curval;
+		int size = futex_size(flags & ~FLAGS_SECOND_SIZE_BITS);

-		ret = get_futex_value_locked(&curval, uaddr1);
+		ret = get_sized_futex_value_locked(&curval, uaddr1, size);

 		if (unlikely(ret)) {
 			double_unlock_hb(hb1, hb2);
 			hb_waiters_dec(hb2);

-			ret = get_user(curval, uaddr1);
+			ret = get_sized_futex_value(&curval, uaddr1, size);
 			if (ret)
 				goto out_put_keys;

@@ -2186,6 +2270,11 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
 		if (!match_futex(&this->key, &key1))
 			continue;

+		if (this->key.both.offset != key1.both.offset) {
+			found_size_mismatch = true;
+			continue;
+		}
+
 		/*
 		 * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always
 		 * be paired with each other and no other futex ops.
@@ -2272,6 +2361,9 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
 	 */
 	put_pi_state(pi_state);

+	if (found_size_mismatch && ret == 0 && task_count == 0)
+		ret = -EINVAL;
+
 out_unlock:
 	double_unlock_hb(hb1, hb2);
 	wake_up_q(&wake_q);
@@ -2727,7 +2819,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
 			   struct futex_q *q, struct futex_hash_bucket **hb)
 {
 	u32 uval;
-	int ret;
+	int ret, size = futex_size(flags);

 	/*
 	 * Access the page AFTER the hash-bucket is locked.
@@ -2748,19 +2840,19 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
 	 * while the syscall executes.
 	 */
 retry:
-	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, FUTEX_READ);
+	ret = get_futex_key(uaddr, flags, &q->key, FUTEX_READ);
 	if (unlikely(ret != 0))
 		return ret;

 retry_private:
 	*hb = queue_lock(q);

-	ret = get_futex_value_locked(&uval, uaddr);
+	ret = get_sized_futex_value_locked(&uval, uaddr, size);

 	if (ret) {
 		queue_unlock(*hb);

-		ret = get_user(uval, uaddr);
+		ret = get_sized_futex_value(&uval, uaddr, size);
 		if (ret)
 			goto out;

@@ -2893,7 +2985,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
 	to = futex_setup_timer(time, &timeout, FLAGS_CLOCKRT, 0);

 retry:
-	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE);
+	ret = get_futex_key(uaddr, flags, &q.key, FUTEX_WRITE);
 	if (unlikely(ret != 0))
 		goto out;

@@ -3084,7 +3176,7 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
 	if ((uval & FUTEX_TID_MASK) != vpid)
 		return -EPERM;

-	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE);
+	ret = get_futex_key(uaddr, flags, &key, FUTEX_WRITE);
 	if (ret)
 		return ret;

@@ -3321,7 +3413,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 	 */
 	rt_mutex_init_waiter(&rt_waiter);

-	ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
+	ret = get_futex_key(uaddr2, flags, &key2, FUTEX_WRITE);
 	if (unlikely(ret != 0))
 		goto out;

@@ -3847,6 +3939,13 @@ void futex_exit_release(struct task_struct *tsk)
 	futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
 }

+int futex_op_to_size_flags(int op)
+{
+	int size_flags = op & FUTEX_ALL_SIZE_BITS;
+
+	return (size_flags / FUTEX_SIZE_8_BITS) * FLAGS_FIRST_8_BITS;
+}
+
 long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 		u32 __user *uaddr2, u32 val2, u32 val3)
 {
@@ -3862,6 +3961,44 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 		    cmd != FUTEX_WAIT_REQUEUE_PI)
 			return -ENOSYS;
 	}
+	if (op & FUTEX_ALL_SIZE_BITS) {
+		flags |= futex_op_to_size_flags(op);
+
+		if ((flags & FLAGS_FIRST_SIZE_BITS) == FLAGS_FIRST_SIZE_BITS ||
+		   (flags & FLAGS_SECOND_SIZE_BITS) == FLAGS_SECOND_SIZE_BITS) {
+			/*
+			 * if both bits are set we could silently treat that as
+			 * a 32 bit futex, but we may want to use this pattern
+			 * in the future to indicate a 64 bit futex or an
+			 * arbitrarily sized futex. so we don't want code
+			 * relying on this yet.
+			 */
+			return -EINVAL;
+		}
+		switch (cmd) {
+		case FUTEX_WAIT:
+		case FUTEX_WAIT_BITSET:
+		case FUTEX_WAKE:
+		case FUTEX_WAKE_BITSET:
+			if (flags & FLAGS_SECOND_SIZE_BITS) {
+				/*
+				 * these operations only take one argument so
+				 * only the size bits for the first argument
+				 * should be used
+				 */
+				return -EINVAL;
+			}
+			break;
+		case FUTEX_REQUEUE:
+		case FUTEX_CMP_REQUEUE:
+			break;
+		default:
+			/*
+			 * all other cases are not supported yet
+			 */
+			return -EINVAL;
+		}
+	}

 	switch (cmd) {
 	case FUTEX_LOCK_PI:
--
2.17.1


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH] futex: Support smaller futexes of one byte or two byte size.
  2019-12-20 19:09     ` Malte Skarupke
@ 2019-12-21 15:56       ` Malte Skarupke
  2019-12-21 15:56         ` Malte Skarupke
  0 siblings, 1 reply; 11+ messages in thread
From: Malte Skarupke @ 2019-12-21 15:56 UTC (permalink / raw)
  To: tglx, mingo; +Cc: peterz, dvhart, linux-kernel, malteskarupke, malteskarupke

And hello one more time,

turns out a last minute change broke my patch for "option 2". I had moved some
lines around and didn't test the results properly... As a result I literally
woke up from a nightmare at 4am tonight, immediately knowing that there was a
bug in the code I had sent. Below you'll find a patch with the fix.

I re-emailed the whole patch to make it easier to use if you want to go with
option 2. But if you're curious what the difference is between this new
version and the last version, the diff between the two is just this:

diff --git a/kernel/futex.c b/kernel/futex.c
index e2308cea7580..ac460fd612ae 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -569,10 +569,6 @@ get_futex_key(u32 __user *uaddr, int flags, union futex_key *key, enum futex_acc
   size = futex_size(flags);

   key->both.offset = address % PAGE_SIZE;
-  if (size == sizeof(u8))
-     key->both.offset |= FUT_OFF_8_BITS;
-  else if (size == sizeof(u16))
-     key->both.offset |= FUT_OFF_16_BITS;

   /*
    * The futex address must be "naturally" aligned.
@@ -581,6 +577,11 @@ get_futex_key(u32 __user *uaddr, int flags, union futex_key *key, enum futex_acc
      return -EINVAL;
   address -= key->both.offset;

+  if (size == sizeof(u8))
+     key->both.offset |= FUT_OFF_8_BITS;
+  else if (size == sizeof(u16))
+     key->both.offset |= FUT_OFF_16_BITS;
+
   if (unlikely(!access_ok(uaddr, size)))
      return -EFAULT;




^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH] futex: Support smaller futexes of one byte or two byte size.
  2019-12-21 15:56       ` Malte Skarupke
@ 2019-12-21 15:56         ` Malte Skarupke
  0 siblings, 0 replies; 11+ messages in thread
From: Malte Skarupke @ 2019-12-21 15:56 UTC (permalink / raw)
  To: tglx, mingo; +Cc: peterz, dvhart, linux-kernel, malteskarupke, malteskarupke

This is option 2 out of 2. In both versions only the operations WAIT, WAKE,
REQUEUE and CMP_REQUEUE are supported. In this version all four of those
operations take a size argument. WAKE and REQUEUE only use the size argument
to verify that they were called with the same flags as the matching WAIT. WAIT
and CMP_REQUEUE use the size argument only for the "compare" part of their
operation. The other option would not have required a size argument for WAKE
and REQUEUE. See the mailing list for discussions as to which option we
should use.

None of these operations handle overlapping futexes. The mental model is
comparison+hashtable lookup and only an exact match on the address will find
the matching futex between WAIT and WAKE. An alternative would have been the
MONITOR/MWAIT mental model to handle overlapping futexes of different sizes,
but for mutexes and condition variables I just needed the normal futex
behavior. (except with smaller types)

There are two reasons for adding smaller futexes:
1. People often want smaller mutexes. Having mutexes that are only one byte in
size was the first reason WebKit gave for re-implementing futexes:
https://webkit.org/blog/6161/locking-in-webkit/

2. The C++ standard added futexes to the standard library in C++20 under the
name atomic_wait and atomic_notify:
http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2019/p1135r5.html
The C++ version supports this for atomic variables of any size. The more sizes
we can support, the better the standard implementation can be on Linux.

I changed the location of two flags that used to be stored in the low bits of
the address, since the address is no longer aligned on four bytes. Luckily the
high bits were free as long as PAGE_SIZE is never more than 1 gigabyte.

The reason for only supporting 8 bit futexes and 16 bit futexes is that those
were easy to support with the existing interface. 64 bit futexes would require
bigger changes.

The reason for only supporting WAIT/WAKE and REQUEUE/CMP_REQUEUE is that those
are the only operations I need right now. (in fact the C++ standard only needs
WAIT/WAKE) The other operations are more complicated and will have to wait.

Signed-off-by: Malte Skarupke <malteskarupke@web.de>
---
 include/linux/futex.h      |  19 +++-
 include/uapi/linux/futex.h |  12 ++-
 kernel/futex.c             | 184 ++++++++++++++++++++++++++++++++-----
 3 files changed, 187 insertions(+), 28 deletions(-)

diff --git a/include/linux/futex.h b/include/linux/futex.h
index 5cc3fed27d4c..c05f4ad57133 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -16,18 +16,29 @@ struct task_struct;
  * The key type depends on whether it's a shared or private mapping.
  * Don't rearrange members without looking at hash_futex().
  *
- * offset is aligned to a multiple of sizeof(u32) (== 4) by definition.
- * We use the two low order bits of offset to tell what is the kind of key :
+ * offset is the position within a page and is in the range [0, PAGE_SIZE).
+ * The high bits of offset are used to store flags :
+ * The next two bits above PAGE_SIZE indicate what kind of key this is :
  *  00 : Private process futex (PTHREAD_PROCESS_PRIVATE)
  *       (no reference on an inode or mm)
  *  01 : Shared futex (PTHREAD_PROCESS_SHARED)
  *	mapped on a file (reference on the underlying inode)
  *  10 : Shared futex (PTHREAD_PROCESS_SHARED)
  *       (but private mapping on an mm, and reference taken on it)
+ * The two bits after that indicate the size of the futex :
+ *  00 : 32 bits
+ *  01 : 8 bits
+ :  10 : 16 bits
 */

-#define FUT_OFF_INODE    1 /* We set bit 0 if key has a reference on inode */
-#define FUT_OFF_MMSHARED 2 /* We set bit 1 if key has a reference on mm */
+/* We set the next highest bit if key has a reference on inode */
+#define FUT_OFF_INODE		PAGE_SIZE
+/* We set the bit after that if key has a reference on mm */
+#define FUT_OFF_MMSHARED	(PAGE_SIZE << 1)
+/* The bits after that indicate futexes of different sizes */
+#define FUT_OFF_8_BITS		(PAGE_SIZE << 2)
+#define FUT_OFF_16_BITS		(PAGE_SIZE << 3)
+#define FUT_OFF_SIZE_BITS	(FUT_OFF_8_BITS | FUT_OFF_16_BITS)

 union futex_key {
 	struct {
diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h
index a89eb0accd5e..885961388c57 100644
--- a/include/uapi/linux/futex.h
+++ b/include/uapi/linux/futex.h
@@ -24,7 +24,17 @@

 #define FUTEX_PRIVATE_FLAG	128
 #define FUTEX_CLOCK_REALTIME	256
-#define FUTEX_CMD_MASK		~(FUTEX_PRIVATE_FLAG | FUTEX_CLOCK_REALTIME)
+#define FUTEX_SIZE_8_BITS	512
+#define FUTEX_SIZE_16_BITS	1024
+#define FUTEX_SIZE_32_BITS	0
+#define FUTEX_SECOND_SIZE_8_BITS	2048
+#define FUTEX_SECOND_SIZE_16_BITS	4096
+#define FUTEX_SECOND_SIZE_32_BITS	0
+#define FUTEX_ALL_SIZE_BITS	(FUTEX_SIZE_8_BITS | FUTEX_SIZE_16_BITS | \
+				 FUTEX_SECOND_SIZE_8_BITS | \
+				 FUTEX_SECOND_SIZE_16_BITS)
+#define FUTEX_CMD_MASK		~(FUTEX_PRIVATE_FLAG | FUTEX_CLOCK_REALTIME | \
+				  FUTEX_ALL_SIZE_BITS)

 #define FUTEX_WAIT_PRIVATE	(FUTEX_WAIT | FUTEX_PRIVATE_FLAG)
 #define FUTEX_WAKE_PRIVATE	(FUTEX_WAKE | FUTEX_PRIVATE_FLAG)
diff --git a/kernel/futex.c b/kernel/futex.c
index 03c518e9747e..ac460fd612ae 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -183,6 +183,14 @@ static int  __read_mostly futex_cmpxchg_enabled;
 #endif
 #define FLAGS_CLOCKRT		0x02
 #define FLAGS_HAS_TIMEOUT	0x04
+/* size flags used for uaddr1 */
+#define FLAGS_FIRST_8_BITS	0x08
+#define FLAGS_FIRST_16_BITS	0x10
+/* size flags used for uaddr2 */
+#define FLAGS_SECOND_8_BITS	0x20
+#define FLAGS_SECOND_16_BITS	0x40
+#define FLAGS_FIRST_SIZE_BITS	(FLAGS_FIRST_8_BITS | FLAGS_FIRST_16_BITS)
+#define FLAGS_SECOND_SIZE_BITS	(FLAGS_SECOND_8_BITS | FLAGS_SECOND_16_BITS)

 /*
  * Priority Inheritance state:
@@ -385,9 +393,15 @@ static inline int hb_waiters_pending(struct futex_hash_bucket *hb)
  */
 static struct futex_hash_bucket *hash_futex(union futex_key *key)
 {
+	/*
+	 * we exclude the size bits from the hash so that futexes of different
+	 * sizes hash to the same bucket. we use this to check for mismatching
+	 * size flags between WAIT and WAKE
+	 */
+	u32 hash_init = key->both.offset & ~FUT_OFF_SIZE_BITS;
 	u32 hash = jhash2((u32*)&key->both.word,
 			  (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
-			  key->both.offset);
+			  hash_init);
 	return &futex_queues[hash & (futex_hashsize - 1)];
 }

@@ -401,10 +415,19 @@ static struct futex_hash_bucket *hash_futex(union futex_key *key)
  */
 static inline int match_futex(union futex_key *key1, union futex_key *key2)
 {
+	/*
+	 * compare every member except for the size bits in offset. the size
+	 * bits are only used for verification, and for that we need to first
+	 * detect that two futexes are the same except for the size bits (which
+	 * this function does) and then check if the size bits are different.
+	 * since all other code doesn't care about the size bits, we can skip
+	 * the comparison in all other cases as well.
+	 */
 	return (key1 && key2
 		&& key1->both.word == key2->both.word
 		&& key1->both.ptr == key2->both.ptr
-		&& key1->both.offset == key2->both.offset);
+		&&   (key1->both.offset & ~FUT_OFF_SIZE_BITS)
+		  == (key2->both.offset & ~FUT_OFF_SIZE_BITS));
 }

 /*
@@ -505,10 +528,20 @@ futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
 	return timeout;
 }

+static inline int futex_size(int flags)
+{
+	if (flags & (FLAGS_FIRST_8_BITS | FLAGS_SECOND_8_BITS))
+		return sizeof(u8);
+	else if (flags & (FLAGS_FIRST_16_BITS | FLAGS_SECOND_16_BITS))
+		return sizeof(u16);
+	else
+		return sizeof(u32);
+}
+
 /**
  * get_futex_key() - Get parameters which are the keys for a futex
  * @uaddr:	virtual address of the futex
- * @fshared:	0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
+ * @flags:	futex flags (FLAGS_SHARED, FLAGS_8_BITS etc.)
  * @key:	address where result is stored.
  * @rw:		mapping needs to be read/write (values: FUTEX_READ,
  *              FUTEX_WRITE)
@@ -524,23 +557,32 @@ futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
  * lock_page() might sleep, the caller should not hold a spinlock.
  */
 static int
-get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, enum futex_access rw)
+get_futex_key(u32 __user *uaddr, int flags, union futex_key *key, enum futex_access rw)
 {
 	unsigned long address = (unsigned long)uaddr;
 	struct mm_struct *mm = current->mm;
 	struct page *page, *tail;
 	struct address_space *mapping;
-	int err, ro = 0;
+	int err, fshared, size, ro = 0;
+
+	fshared = flags & FLAGS_SHARED;
+	size = futex_size(flags);
+
+	key->both.offset = address % PAGE_SIZE;

 	/*
 	 * The futex address must be "naturally" aligned.
 	 */
-	key->both.offset = address % PAGE_SIZE;
-	if (unlikely((address % sizeof(u32)) != 0))
+	if (unlikely((address & (size - 1)) != 0))
 		return -EINVAL;
 	address -= key->both.offset;

-	if (unlikely(!access_ok(uaddr, sizeof(u32))))
+	if (size == sizeof(u8))
+		key->both.offset |= FUT_OFF_8_BITS;
+	else if (size == sizeof(u16))
+		key->both.offset |= FUT_OFF_16_BITS;
+
+	if (unlikely(!access_ok(uaddr, size)))
 		return -EFAULT;

 	if (unlikely(should_fail_futex(fshared)))
@@ -792,6 +834,18 @@ static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr,
 	return ret;
 }

+static inline int __get_sized_futex_value(u32 *dest, u32 __user *from, int size)
+{
+	switch (size) {
+	case 1:
+		return __get_user(*dest, (u8 __user *)from);
+	case 2:
+		return __get_user(*dest, (u16 __user *)from);
+	default:
+		return __get_user(*dest, from);
+	}
+}
+
 static int get_futex_value_locked(u32 *dest, u32 __user *from)
 {
 	int ret;
@@ -803,6 +857,26 @@ static int get_futex_value_locked(u32 *dest, u32 __user *from)
 	return ret ? -EFAULT : 0;
 }

+static int get_sized_futex_value_locked(u32 *dest, u32 __user *from, int size)
+{
+	int ret;
+
+	pagefault_disable();
+	ret = __get_sized_futex_value(dest, from, size);
+	pagefault_enable();
+
+	return ret ? -EFAULT : 0;
+}
+
+static int get_sized_futex_value(u32 *dest, u32 __user *from, int size)
+{
+	might_fault();
+	if (access_ok(from, size))
+		return __get_sized_futex_value(dest, from, size);
+	else
+		return -EFAULT;
+}
+

 /*
  * PI code:
@@ -1658,12 +1732,13 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
 	struct futex_q *this, *next;
 	union futex_key key = FUTEX_KEY_INIT;
 	int ret;
+	bool found_size_mismatch = false;
 	DEFINE_WAKE_Q(wake_q);

 	if (!bitset)
 		return -EINVAL;

-	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ);
+	ret = get_futex_key(uaddr, flags, &key, FUTEX_READ);
 	if (unlikely(ret != 0))
 		goto out;

@@ -1686,11 +1761,18 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
 			if (!(this->bitset & bitset))
 				continue;

+			if (this->key.both.offset != key.both.offset) {
+				found_size_mismatch = true;
+				continue;
+			}
+
 			mark_wake_futex(&wake_q, this);
 			if (++ret >= nr_wake)
 				break;
 		}
 	}
+	if (found_size_mismatch && ret == 0)
+		ret = -EINVAL;

 	spin_unlock(&hb->lock);
 	wake_up_q(&wake_q);
@@ -1762,10 +1844,10 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
 	DEFINE_WAKE_Q(wake_q);

 retry:
-	ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
+	ret = get_futex_key(uaddr1, flags, &key1, FUTEX_READ);
 	if (unlikely(ret != 0))
 		goto out;
-	ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
+	ret = get_futex_key(uaddr2, flags, &key2, FUTEX_WRITE);
 	if (unlikely(ret != 0))
 		goto out_put_key1;

@@ -2001,6 +2083,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
 {
 	union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
 	int drop_count = 0, task_count = 0, ret;
+	bool found_size_mismatch = false;
 	struct futex_pi_state *pi_state = NULL;
 	struct futex_hash_bucket *hb1, *hb2;
 	struct futex_q *this, *next;
@@ -2047,11 +2130,12 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
 	}

 retry:
-	ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
+	ret = get_futex_key(uaddr1, flags & ~FLAGS_SECOND_SIZE_BITS,
+			    &key1, FUTEX_READ);
 	if (unlikely(ret != 0))
 		goto out;
-	ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2,
-			    requeue_pi ? FUTEX_WRITE : FUTEX_READ);
+	ret = get_futex_key(uaddr2, flags & ~FLAGS_FIRST_SIZE_BITS,
+			    &key2, requeue_pi ? FUTEX_WRITE : FUTEX_READ);
 	if (unlikely(ret != 0))
 		goto out_put_key1;

@@ -2073,14 +2157,15 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,

 	if (likely(cmpval != NULL)) {
 		u32 curval;
+		int size = futex_size(flags & ~FLAGS_SECOND_SIZE_BITS);

-		ret = get_futex_value_locked(&curval, uaddr1);
+		ret = get_sized_futex_value_locked(&curval, uaddr1, size);

 		if (unlikely(ret)) {
 			double_unlock_hb(hb1, hb2);
 			hb_waiters_dec(hb2);

-			ret = get_user(curval, uaddr1);
+			ret = get_sized_futex_value(&curval, uaddr1, size);
 			if (ret)
 				goto out_put_keys;

@@ -2186,6 +2271,11 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
 		if (!match_futex(&this->key, &key1))
 			continue;

+		if (this->key.both.offset != key1.both.offset) {
+			found_size_mismatch = true;
+			continue;
+		}
+
 		/*
 		 * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always
 		 * be paired with each other and no other futex ops.
@@ -2272,6 +2362,9 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
 	 */
 	put_pi_state(pi_state);

+	if (found_size_mismatch && ret == 0 && task_count == 0)
+		ret = -EINVAL;
+
 out_unlock:
 	double_unlock_hb(hb1, hb2);
 	wake_up_q(&wake_q);
@@ -2727,7 +2820,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
 			   struct futex_q *q, struct futex_hash_bucket **hb)
 {
 	u32 uval;
-	int ret;
+	int ret, size = futex_size(flags);

 	/*
 	 * Access the page AFTER the hash-bucket is locked.
@@ -2748,19 +2841,19 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
 	 * while the syscall executes.
 	 */
 retry:
-	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, FUTEX_READ);
+	ret = get_futex_key(uaddr, flags, &q->key, FUTEX_READ);
 	if (unlikely(ret != 0))
 		return ret;

 retry_private:
 	*hb = queue_lock(q);

-	ret = get_futex_value_locked(&uval, uaddr);
+	ret = get_sized_futex_value_locked(&uval, uaddr, size);

 	if (ret) {
 		queue_unlock(*hb);

-		ret = get_user(uval, uaddr);
+		ret = get_sized_futex_value(&uval, uaddr, size);
 		if (ret)
 			goto out;

@@ -2893,7 +2986,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
 	to = futex_setup_timer(time, &timeout, FLAGS_CLOCKRT, 0);

 retry:
-	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE);
+	ret = get_futex_key(uaddr, flags, &q.key, FUTEX_WRITE);
 	if (unlikely(ret != 0))
 		goto out;

@@ -3084,7 +3177,7 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
 	if ((uval & FUTEX_TID_MASK) != vpid)
 		return -EPERM;

-	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE);
+	ret = get_futex_key(uaddr, flags, &key, FUTEX_WRITE);
 	if (ret)
 		return ret;

@@ -3321,7 +3414,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 	 */
 	rt_mutex_init_waiter(&rt_waiter);

-	ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
+	ret = get_futex_key(uaddr2, flags, &key2, FUTEX_WRITE);
 	if (unlikely(ret != 0))
 		goto out;

@@ -3847,6 +3940,13 @@ void futex_exit_release(struct task_struct *tsk)
 	futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
 }

+int futex_op_to_size_flags(int op)
+{
+	int size_flags = op & FUTEX_ALL_SIZE_BITS;
+
+	return (size_flags / FUTEX_SIZE_8_BITS) * FLAGS_FIRST_8_BITS;
+}
+
 long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 		u32 __user *uaddr2, u32 val2, u32 val3)
 {
@@ -3862,6 +3962,44 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 		    cmd != FUTEX_WAIT_REQUEUE_PI)
 			return -ENOSYS;
 	}
+	if (op & FUTEX_ALL_SIZE_BITS) {
+		flags |= futex_op_to_size_flags(op);
+
+		if ((flags & FLAGS_FIRST_SIZE_BITS) == FLAGS_FIRST_SIZE_BITS ||
+		   (flags & FLAGS_SECOND_SIZE_BITS) == FLAGS_SECOND_SIZE_BITS) {
+			/*
+			 * if both bits are set we could silently treat that as
+			 * a 32 bit futex, but we may want to use this pattern
+			 * in the future to indicate a 64 bit futex or an
+			 * arbitrarily sized futex. so we don't want code
+			 * relying on this yet.
+			 */
+			return -EINVAL;
+		}
+		switch (cmd) {
+		case FUTEX_WAIT:
+		case FUTEX_WAIT_BITSET:
+		case FUTEX_WAKE:
+		case FUTEX_WAKE_BITSET:
+			if (flags & FLAGS_SECOND_SIZE_BITS) {
+				/*
+				 * these operations only take one argument so
+				 * only the size bits for the first argument
+				 * should be used
+				 */
+				return -EINVAL;
+			}
+			break;
+		case FUTEX_REQUEUE:
+		case FUTEX_CMP_REQUEUE:
+			break;
+		default:
+			/*
+			 * all other cases are not supported yet
+			 */
+			return -EINVAL;
+		}
+	}

 	switch (cmd) {
 	case FUTEX_LOCK_PI:
--
2.17.1


^ permalink raw reply related	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2019-12-21 16:01 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-12-04 23:52 [PATCH] futex: Support smaller futexes of one byte or two byte size Malte Skarupke
2019-12-06 15:31 ` Peter Zijlstra
2019-12-06 17:37   ` Peter Zijlstra
2019-12-08 22:18     ` Malte Skarupke
2019-12-11 13:44       ` Peter Zijlstra
2019-12-11 19:48         ` Malte Skarupke
2019-12-20 19:08 ` Malte Skarupke
2019-12-20 19:09   ` Malte Skarupke
2019-12-20 19:09     ` Malte Skarupke
2019-12-21 15:56       ` Malte Skarupke
2019-12-21 15:56         ` Malte Skarupke

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.