linux-kernel.vger.kernel.org archive mirror
* [RFC v2 PATCH] futex: extend set_robust_list to allow 2 locking ABIs at the same time.
@ 2019-11-04  0:29 Shawn Landden
  2019-11-04  0:51 ` Shawn Landden
                   ` (2 more replies)
  0 siblings, 3 replies; 29+ messages in thread
From: Shawn Landden @ 2019-11-04  0:29 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: libc-alpha, linux-api, linux-kernel, Arnd Bergmann,
	Deepa Dinamani, Oleg Nesterov, Andrew Morton, Catalin Marinas,
	Shawn Landden, Keith Packard

The robust futexes ABI was designed to be flexible to changing ABIs in
glibc; however, it did not take into consideration that this ABI is
particularly sticky and suffers from lock-step problems, because the
ABI is shared between processes. This introduces a new accepted size
in set_robust_list, carrying an additional futex_offset2 value, so
that two locking ABIs can be used at the same time.

If this new ABI is used, then bit 1 of the *next pointer of the
user-space robust_list indicates that the futex_offset2 value should
be used in place of the existing futex_offset.
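
For illustration, a user-space lock implementation targeting the second
ABI could tag its list links like this (a hypothetical sketch; the macro
and helper names are illustrative, not part of the proposed ABI, and
entries are assumed to be at least 4-byte aligned so both low bits of
the pointer are free):

	#include <linux/futex.h>	/* struct robust_list{,_head} */

	#define ROBUST_ENTRY_PI		0x1UL	/* existing PI flag */
	#define ROBUST_ENTRY_ABI2	0x2UL	/* proposed: use futex_offset2 */

	static void enqueue_abi2(struct robust_list_head *head,
				 struct robust_list *entry)
	{
		entry->next = head->list.next;
		/* Tag the link so the kernel applies futex_offset2 when
		 * it walks the list at thread exit. */
		head->list.next = (struct robust_list *)
			((unsigned long)entry | ROBUST_ENTRY_ABI2);
	}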

The use case for this is sharing locks between 32-bit and 64-bit
processes, which Linux supports, but glibc does not, and is difficult
to implement with the current Linux support because of mismatched
ABIs. Keith Packard has complained about this:
https://keithp.com/blogs/Shared_Memory_Fences/

This can also be used to add a new ABI that uses smaller structs,
as the existing ABI on x86_64 is a minimum of 32 bytes, and 20 bytes
would suffice.
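
For reference, registration under the proposed ABI would look roughly as
follows (a minimal sketch assuming this patch is applied; set_robust_list
has no glibc wrapper, so syscall(2) is used directly, and the struct is
re-declared here only because a patched <linux/futex.h> would normally
provide it):

	#include <linux/futex.h>
	#include <stddef.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	struct extended_robust_list_head {
		struct robust_list_head	list_head;
		long			futex_offset2;
	};

	static struct extended_robust_list_head erl;

	static int register_two_offsets(long off1, long off2)
	{
		erl.list_head.list.next = &erl.list_head.list; /* empty list */
		erl.list_head.futex_offset = off1;	/* e.g. native layout */
		erl.list_head.list_op_pending = NULL;
		erl.futex_offset2 = off2;		/* e.g. 32-bit layout */

		/* The larger size is what selects the extended ABI. */
		return syscall(SYS_set_robust_list, &erl, sizeof(erl));
	}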

v2: fix size of compat_extended_robust_list_head
    fix some issues with number literals being implicitly ints
---
 include/linux/compat.h     |   5 +
 include/linux/sched.h      |   6 ++
 include/uapi/linux/futex.h |  31 +++++++
 kernel/futex.c             | 182 ++++++++++++++++++++++++-------------
 4 files changed, 160 insertions(+), 64 deletions(-)

diff --git a/include/linux/compat.h b/include/linux/compat.h
index 16dafd9f4b86..00a0741bf658 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -379,10 +379,15 @@ struct compat_robust_list_head {
 	struct compat_robust_list	list;
 	compat_long_t			futex_offset;
 	compat_uptr_t			list_op_pending;
 };
 
+struct compat_extended_robust_list_head {
+	struct compat_robust_list_head list_head;
+	compat_long_t			futex_offset2;
+};
+
 #ifdef CONFIG_COMPAT_OLD_SIGACTION
 struct compat_old_sigaction {
 	compat_uptr_t			sa_handler;
 	compat_old_sigset_t		sa_mask;
 	compat_ulong_t			sa_flags;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9f51932bd543..894258fd44ac 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1057,10 +1057,16 @@ struct task_struct {
 #ifdef CONFIG_X86_CPU_RESCTRL
 	u32				closid;
 	u32				rmid;
 #endif
 #ifdef CONFIG_FUTEX
+    /*
+     * bottom two bits are masked
+     * 0: struct extended_robust_list_head
+     * 1: struct robust_list_head
+     * same for compat_robust_list
+     */
 	struct robust_list_head __user	*robust_list;
 #ifdef CONFIG_COMPAT
 	struct compat_robust_list_head __user *compat_robust_list;
 #endif
 	struct list_head		pi_state_list;
diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h
index a89eb0accd5e..30c08e07f26b 100644
--- a/include/uapi/linux/futex.h
+++ b/include/uapi/linux/futex.h
@@ -92,10 +92,41 @@ struct robust_list_head {
 	 * so only truly owned locks will be handled.
 	 */
 	struct robust_list __user *list_op_pending;
 };
 
+/*
+ * Extensible per-thread list head:
+ *
+ * As locks are shared between processes, the futex_offset field
+ * has ABI lock-stepping issues, which the original robust_list_head
+ * structure did not anticipate. (And which prevents 32-bit/64-bit
+ * interoperability, as well as shrinking of mutex structures).
+ * This new extended_robust_list_head allows multiple
+ * concurrent futex_offset values, chosen using the bottom 2 bits of the
+ * robust_list *next pointer, which are now masked in BOTH the old and
+ * new ABI.
+ *
+ * Note: this structure is part of the syscall ABI like
+ * robust_list_head above, and must have a different size than
+ * robust_list_head.
+ *
+ */
+struct extended_robust_list_head {
+	struct robust_list_head list_head;
+
+	/*
+	 * These relative offsets are set by user-space. They give the kernel
+	 * the relative position of the futex field to examine, based on the
+	 * bit 1 *next pointer.
+	 * The original version was insufficiently flexible. Locks are held
+	 * in shared memory between processes, and a process might want to hold
+	 * locks of two ABIs at the same time.
+	 */
+	long futex_offset2;
+};
+
 /*
  * Are there any waiters for this robust futex:
  */
 #define FUTEX_WAITERS		0x80000000
 
diff --git a/kernel/futex.c b/kernel/futex.c
index 6d50728ef2e7..3a17d2d63178 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -3396,17 +3396,20 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
 		size_t, len)
 {
 	if (!futex_cmpxchg_enabled)
 		return -ENOSYS;
-	/*
-	 * The kernel knows only one size for now:
-	 */
-	if (unlikely(len != sizeof(*head)))
+
+	if (unlikely(len != sizeof(struct robust_list_head) &&
+		     len != sizeof(struct extended_robust_list_head)))
 		return -EINVAL;
 
-	current->robust_list = head;
+	current->robust_list = head & 0b11UL;
+	BUILD_BUG_ON(sizeof(struct robust_list_head) ==
+		     sizeof(struct extended_robust_list_head));
+	if (len == sizeof(struct robust_list_head))
+		current->robust_list |= 1;
 
 	return 0;
 }
 
 /**
@@ -3419,10 +3422,11 @@ SYSCALL_DEFINE3(get_robust_list, int, pid,
 		struct robust_list_head __user * __user *, head_ptr,
 		size_t __user *, len_ptr)
 {
 	struct robust_list_head __user *head;
 	unsigned long ret;
+	size_t len;
 	struct task_struct *p;
 
 	if (!futex_cmpxchg_enabled)
 		return -ENOSYS;
 
@@ -3439,14 +3443,18 @@ SYSCALL_DEFINE3(get_robust_list, int, pid,
 
 	ret = -EPERM;
 	if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
 		goto err_unlock;
 
-	head = p->robust_list;
+	head = p->robust_list & ~0b11UL;
+	if ((p->robust_list & 0b11) == 0b1)
+		len = sizeof(struct robust_list_head);
+	else
+		len = sizeof(struct extended_robust_list_head);
 	rcu_read_unlock();
 
-	if (put_user(sizeof(*head), len_ptr))
+	if (put_user(len, len_ptr))
 		return -EFAULT;
 	return put_user(head, head_ptr);
 
 err_unlock:
 	rcu_read_unlock();
@@ -3524,23 +3532,26 @@ static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int p
 
 	return 0;
 }
 
 /*
- * Fetch a robust-list pointer. Bit 0 signals PI futexes:
+ * Fetch a robust-list pointer. Bit 0 signals PI futexes. Bit 1 chooses which
+ * futex_offset to use:
  */
 static inline int fetch_robust_entry(struct robust_list __user **entry,
 				     struct robust_list __user * __user *head,
-				     unsigned int *pi)
+				     unsigned int *pi,
+				     unsigned int *second_abi)
 {
 	unsigned long uentry;
 
 	if (get_user(uentry, (unsigned long __user *)head))
 		return -EFAULT;
 
-	*entry = (void __user *)(uentry & ~1UL);
+	*entry = (void __user *)(uentry & ~0b11UL);
 	*pi = uentry & 1;
+	*second_abi = uentry & 0b10;
 
 	return 0;
 }
 
 /*
@@ -3549,69 +3560,84 @@ static inline int fetch_robust_entry(struct robust_list __user **entry,
  *
  * We silently return on any sign of list-walking problem.
  */
 void exit_robust_list(struct task_struct *curr)
 {
-	struct robust_list_head __user *head = curr->robust_list;
-	struct robust_list __user *entry, *next_entry, *pending;
-	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
-	unsigned int uninitialized_var(next_pi);
-	unsigned long futex_offset;
+	struct robust_list_head __user *head_ptr = curr->robust_list & ~1UL;
+	unsigned int is_extended_list = (curr->robust_list & 1) == 0;
+	struct extended_robust_list_head head;
+	struct robust_list __user *entry = &head->list_head.list.next,
+					   *next_entry, *pending;
+	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip, second_abi,
+			     second_abip;
+	unsigned int uninitialized_var(next_pi),
+		     uninitialized_var(next_second_abi);
 	int rc;
 
 	if (!futex_cmpxchg_enabled)
 		return;
 
 	/*
-	 * Fetch the list head (which was registered earlier, via
-	 * sys_set_robust_list()):
+	 * fetch_robust_entry code is duplicated here to avoid excessive calls
+	 * to get_user()
 	 */
-	if (fetch_robust_entry(&entry, &head->list.next, &pi))
-		return;
-	/*
-	 * Fetch the relative futex offset:
-	 */
-	if (get_user(futex_offset, &head->futex_offset))
-		return;
-	/*
-	 * Fetch any possibly pending lock-add first, and handle it
-	 * if it exists:
-	 */
-	if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
-		return;
+	if (is_extended_list) {
+		if (get_user(head, (struct extended_robust_list_head *)
+			     head_ptr))
+			return;
+	} else {
+		if (get_user(head.list_head, head_ptr))
+			return;
+	}
+
+	pi = head.list_head.list.next & 1;
+	second_abi = head.list_head.list.next & 0b10;
+	head.list_head.list.next &= ~0b11UL;
+	pip = head.list_head.list_op_pending & 1;
+	second_abip = head.list_head.list_op_pending & 0b10;
+	head.list_head.list_op_pending &= ~0b11UL;
 
 	next_entry = NULL;	/* avoid warning with gcc */
-	while (entry != &head->list) {
+	while (entry != &head->list_head.list) {
 		/*
 		 * Fetch the next entry in the list before calling
 		 * handle_futex_death:
 		 */
-		rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
+		rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi,
+					&next_second_abi);
 		/*
 		 * A pending lock might already be on the list, so
 		 * don't process it twice:
 		 */
-		if (entry != pending)
+		if (entry != pending) {
+			long futex_offset = second_abi ?
+					    head.futex_offset2 :
+					    head.list_head.futex_offset;
 			if (handle_futex_death((void __user *)entry + futex_offset,
 						curr, pi))
 				return;
+		}
 		if (rc)
 			return;
 		entry = next_entry;
 		pi = next_pi;
+		second_abi = next_second_abi;
 		/*
 		 * Avoid excessively long or circular lists:
 		 */
 		if (!--limit)
 			break;
 
 		cond_resched();
 	}
 
-	if (pending)
+	if (pending) {
+		long futex_offset = second_abip ? head.futex_offset2 :
+						  head.list_head.futex_offset;
 		handle_futex_death((void __user *)pending + futex_offset,
 				   curr, pip);
+	}
 }
 
 long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 		u32 __user *uaddr2, u32 val2, u32 val3)
 {
@@ -3707,21 +3733,25 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
 	return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
 }
 
 #ifdef CONFIG_COMPAT
 /*
- * Fetch a robust-list pointer. Bit 0 signals PI futexes:
+ * Fetch a robust-list pointer. Bit 0 signals PI futexes.
+ * Bit 1 chooses which futex_offset to use:
  */
 static inline int
-compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
-		   compat_uptr_t __user *head, unsigned int *pi)
+compat_fetch_robust_entry(compat_uptr_t *uentry,
+			  struct robust_list __user **entry,
+			  compat_uptr_t __user *head, unsigned int *pi,
+			  unsigned int *second_abi)
 {
 	if (get_user(*uentry, head))
 		return -EFAULT;
 
-	*entry = compat_ptr((*uentry) & ~1);
+	*entry = compat_ptr((*uentry) & ~0b11);
 	*pi = (unsigned int)(*uentry) & 1;
+	*second_abi = (unsigned int)(*uentry) & 0b10;
 
 	return 0;
 }
 
 static void __user *futex_uaddr(struct robust_list __user *entry,
@@ -3739,72 +3769,86 @@ static void __user *futex_uaddr(struct robust_list __user *entry,
  *
  * We silently return on any sign of list-walking problem.
  */
 void compat_exit_robust_list(struct task_struct *curr)
 {
-	struct compat_robust_list_head __user *head = curr->compat_robust_list;
-	struct robust_list __user *entry, *next_entry, *pending;
-	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
-	unsigned int uninitialized_var(next_pi);
+	struct compat_robust_list_head __user *head_ptr = curr->compat_robust_list &
+						      ~1UL;
+	unsigned int is_extended_list = (curr->compat_robust_list & 1) == 0;
+	struct compat_extended_robust_list_head head;
+	struct robust_list __user *entry = &head->list_head.list.next,
+					   *next_entry, *pending;
+	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip, second_abi,
+			     second_abip;
+	unsigned int uninitialized_var(next_pi),
+		     uninitialized_var(next_second_abi);
 	compat_uptr_t uentry, next_uentry, upending;
-	compat_long_t futex_offset;
 	int rc;
 
 	if (!futex_cmpxchg_enabled)
 		return;
 
 	/*
-	 * Fetch the list head (which was registered earlier, via
-	 * sys_set_robust_list()):
-	 */
-	if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
-		return;
-	/*
-	 * Fetch the relative futex offset:
-	 */
-	if (get_user(futex_offset, &head->futex_offset))
-		return;
-	/*
-	 * Fetch any possibly pending lock-add first, and handle it
-	 * if it exists:
+	 * compat_fetch_robust_entry code is duplicated here to avoid excessive
+	 * calls to get_user()
 	 */
-	if (compat_fetch_robust_entry(&upending, &pending,
-			       &head->list_op_pending, &pip))
-		return;
+	if (is_extended_list) {
+		if (get_user(head, (struct compat_extended_robust_list_head *)
+			     head_ptr))
+			return;
+	} else {
+		if (get_user(head.list_head, head_ptr))
+			return;
+	}
+
+	pi = head.list_head.list.next & 1;
+	second_abi = head.list_head.list.next & 0b10;
+	head.list_head.list.next &= ~0b11UL;
+	pip = head.list_head.list_op_pending & 1;
+	second_abip = head.list_head.list_op_pending & 0b10;
+	head.list_head.list_op_pending &= ~0b11UL;
 
 	next_entry = NULL;	/* avoid warning with gcc */
 	while (entry != (struct robust_list __user *) &head->list) {
 		/*
 		 * Fetch the next entry in the list before calling
 		 * handle_futex_death:
 		 */
 		rc = compat_fetch_robust_entry(&next_uentry, &next_entry,
-			(compat_uptr_t __user *)&entry->next, &next_pi);
+			(compat_uptr_t __user *)&entry->next, &next_pi,
+			&next_second_abi);
 		/*
 		 * A pending lock might already be on the list, so
 		 * dont process it twice:
 		 */
 		if (entry != pending) {
+			compat_long_t futex_offset = second_abi ?
+				head.futex_offset2 :
+				head.list_head.futex_offset;
 			void __user *uaddr = futex_uaddr(entry, futex_offset);
 
 			if (handle_futex_death(uaddr, curr, pi))
 				return;
 		}
 		if (rc)
 			return;
 		uentry = next_uentry;
 		entry = next_entry;
 		pi = next_pi;
+		second_abi = next_second_abi;
 		/*
 		 * Avoid excessively long or circular lists:
 		 */
 		if (!--limit)
 			break;
 
 		cond_resched();
 	}
 	if (pending) {
+		compat_long_t futex_offset = second_abip ?
+					     head.futex_offset2 :
+					     head.list_head.futex_offset;
 		void __user *uaddr = futex_uaddr(pending, futex_offset);
 
 		handle_futex_death(uaddr, curr, pip);
 	}
 }
@@ -3814,23 +3858,29 @@ COMPAT_SYSCALL_DEFINE2(set_robust_list,
 		compat_size_t, len)
 {
 	if (!futex_cmpxchg_enabled)
 		return -ENOSYS;
 
-	if (unlikely(len != sizeof(*head)))
+	if (unlikely(len != sizeof(struct compat_robust_list_head) &&
+		     len != sizeof(struct compat_extended_robust_list_head)))
 		return -EINVAL;
 
-	current->compat_robust_list = head;
+	current->compat_robust_list = head & ~0b11;
+	BUILD_BUG_ON(sizeof(struct compat_robust_list_head) ==
+		     sizeof(struct compat_extended_robust_list_head));
+	if (len == sizeof(struct compat_robust_list_head))
+		current->compat_robust_list |= 0b1;
 
 	return 0;
 }
 
 COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
 			compat_uptr_t __user *, head_ptr,
 			compat_size_t __user *, len_ptr)
 {
 	struct compat_robust_list_head __user *head;
+	size_t len;
 	unsigned long ret;
 	struct task_struct *p;
 
 	if (!futex_cmpxchg_enabled)
 		return -ENOSYS;
@@ -3848,14 +3898,18 @@ COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
 
 	ret = -EPERM;
 	if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
 		goto err_unlock;
 
-	head = p->compat_robust_list;
+	head = p->compat_robust_list & ~0b11;
+	if ((p->compat_robust_list & 0b11) == 0b1)
+		len = sizeof(struct compat_robust_list_head);
+	else
+		len = sizeof(struct compat_extended_robust_list_head);
 	rcu_read_unlock();
 
-	if (put_user(sizeof(*head), len_ptr))
+	if (put_user(len, len_ptr))
 		return -EFAULT;
 	return put_user(ptr_to_compat(head), head_ptr);
 
 err_unlock:
 	rcu_read_unlock();
-- 
2.20.1



* Re: [RFC v2 PATCH] futex: extend set_robust_list to allow 2 locking ABIs at the same time.
  2019-11-04  0:29 [RFC v2 PATCH] futex: extend set_robust_list to allow 2 locking ABIs at the same time Shawn Landden
@ 2019-11-04  0:51 ` Shawn Landden
  2019-11-04 15:37 ` Thomas Gleixner
  2019-11-05  9:48 ` Florian Weimer
  2 siblings, 0 replies; 29+ messages in thread
From: Shawn Landden @ 2019-11-04  0:51 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: libc-alpha, linux-api, linux-kernel, Arnd Bergmann,
	Deepa Dinamani, Oleg Nesterov, Andrew Morton, Catalin Marinas,
	Keith Packard

I am sorry, I will fix this and resubmit.


-- 
Shawn Landden



* Re: [RFC v2 PATCH] futex: extend set_robust_list to allow 2 locking ABIs at the same time.
  2019-11-04  0:29 [RFC v2 PATCH] futex: extend set_robust_list to allow 2 locking ABIs at the same time Shawn Landden
  2019-11-04  0:51 ` Shawn Landden
@ 2019-11-04 15:37 ` Thomas Gleixner
  2019-11-05  0:10   ` Thomas Gleixner
  2019-11-05  9:48 ` Florian Weimer
  2 siblings, 1 reply; 29+ messages in thread
From: Thomas Gleixner @ 2019-11-04 15:37 UTC (permalink / raw)
  To: Shawn Landden
  Cc: libc-alpha, linux-api, LKML, Arnd Bergmann, Deepa Dinamani,
	Oleg Nesterov, Andrew Morton, Catalin Marinas, Keith Packard,
	Ingo Molnar, Peter Zijlstra, Darren Hart

On Sun, 3 Nov 2019, Shawn Landden wrote:

While you Cc'ed various people for whatever reasons, you failed to Cc 3/4
of the FUTEX maintainers/reviewers from the MAINTAINERS file:

FUTEX SUBSYSTEM
M:      Thomas Gleixner <tglx@linutronix.de>
M:      Ingo Molnar <mingo@redhat.com>
R:      Peter Zijlstra <peterz@infradead.org>
R:      Darren Hart <dvhart@infradead.org>

> The robust futexes ABI was designed to be flexible to changing ABIs in
> glibc; however, it did not take into consideration that this ABI is
> particularly sticky and suffers from lock-step problems, because the
> ABI is shared between processes. This introduces a new accepted size
> in set_robust_list, carrying an additional futex_offset2 value, so
> that two locking ABIs can be used at the same time.

This text does not really explain the problem you are trying to solve. I
can crystalball something out of it, but that's not what a changelog should
force people to do.

> If this new ABI is used, then bit 1 of the *next pointer of the
> user-space robust_list indicates that the futex_offset2 value should
> be used in place of the existing futex_offset.
> 
> The use case for this is sharing locks between 32-bit and 64-bit
> processes, which Linux supports, but glibc does not, and is difficult
> to implement with the current Linux support because of mismatched
> ABIs.

The problem with mixed processes is that the *next pointer in the robust
list points to some random data structure, but not to the actual futex
value itself. The pointer to the underlying futex value is calculated from
there with 'futex_offset'.
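
For reference, per entry that calculation is essentially what the
exit_robust_list() code above does:

	uaddr = (void __user *)entry + futex_offset;
	handle_futex_death(uaddr, curr, pi);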

I assume that pthread_mutex and friends have different layouts on 32 and 64
bit and therefore futex_offset is different, right?

But even if you manage to keep track of the offsets per futex, I don't see
how sharing the actual pthread_mutex between a 32bit and a 64bit process
would work at all if the underlying data structures are not fundamentally
the same. They contain more than the robust list.

> Keith Packard has complained about this:
> https://keithp.com/blogs/Shared_Memory_Fences/

That complaint is important because the info there is missing in the
changelog, right? Such links are mostly useless as they are outdated sooner
rather than later.

Aside of that the blog entry is absolutely not useful to fill the blanks in
your changelog.

> This can also be used to add a new ABI that uses smaller structs, as the
> existing ABI on x86_64 is a minimum of 32 bytes, and 20 bytes would
> suffice.

I have no idea how you make that a minimum of 32 bytes. It's a fixed 24
bytes on x86_64.

struct robust_list_head {
	struct robust_list         list;                 /*     0     8 */
	long int                   futex_offset;         /*     8     8 */
	struct robust_list *       list_op_pending;      /*    16     8 */

	/* size: 24, cachelines: 1, members: 3 */
};

>

This lacks a Signed-off-by: ....

> 
> v2: fix size of compat_extended_robust_list_head
>     fix some issues with number literals being implicitly ints

Please move the vN change history after a --- separator so tools can
discard it. It's not part of the changelog. Additionally a link to the
previous submission would be helpful for keeping track.

i.e.

---
v2: Fix ....

v1: https://lore.kernel.org/r/$MESSAGE_ID
---
diffstat
....

> +struct compat_extended_robust_list_head {
> +	struct compat_robust_list_head list_head;
> +	compat_long_t			futex_offset2;

Please align the names of the members properly. list head should be
separated from the type by a tab not by a space.

>  #ifdef CONFIG_FUTEX
> +    /*
> +     * bottom two bits are masked
> +     * 0: struct extended_robust_list_head
> +     * 1: struct robust_list_head
> +     * same for compat_robust_list

What? That's not what the code does. It merely sets bit 0.

> +/*
> + * Extensible per-thread list head:
> + *
> + * As locks are shared between processes, the futex_offset field
> + * has ABI lock-stepping issues, which the original robust_list_head
> + * structure did not anticipate. (And which prevents 32-bit/64-bit
> + * interoperability, as well as shrinking of mutex structures).
> + * This new extended_robust_list_head allows multiple
> + * concurrent futex_offset values, chosen using the bottom 2 bits of the
> + * robust_list *next pointer, which are now masked in BOTH the old and
> + * new ABI.
> + *
> + * Note: this structure is part of the syscall ABI like
> + * robust_list_head above, and must have a different size than
> + * robust_list_head.

Versioning an ABI by different sizes is a horrible idea.

> + *
> + */
> +struct extended_robust_list_head {
> +	struct robust_list_head list_head;
> +
> +	/*
> +	 * These relative offsets are set by user-space. They give the kernel
> +	 * the relative position of the futex field to examine, based on the
> +	 * bit 1 *next pointer.

What on earth is a 'bit 1 *next pointer'?

> +	 * The original version was insufficiently flexible. Locks are held
> +	 * in shared memory between processes, and a process might want to hold
> +	 * locks of two ABIs at the same time.
> +	 */
> +	long futex_offset2;

Where has *list_op_pending gone and how is that supposed to work? Are you
now enqueueing the next lock into list_head list right away?

Please provide proper design information about this.

> +};
> +
>  /*
>   * Are there any waiters for this robust futex:
>   */
>  #define FUTEX_WAITERS		0x80000000
>  
> diff --git a/kernel/futex.c b/kernel/futex.c
> index 6d50728ef2e7..3a17d2d63178 100644
> --- a/kernel/futex.c
> +++ b/kernel/futex.c
> @@ -3396,17 +3396,20 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
>  SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
>  		size_t, len)
>  {
>  	if (!futex_cmpxchg_enabled)
>  		return -ENOSYS;
> -	/*
> -	 * The kernel knows only one size for now:
> -	 */
> -	if (unlikely(len != sizeof(*head)))
> +
> +	if (unlikely(len != sizeof(struct robust_list_head) &&
> +		     len != sizeof(struct extended_robust_list_head)))
>  		return -EINVAL;
>  
> -	current->robust_list = head;
> +	current->robust_list = head & 0b11UL;

What? This results in:

	0 <= current->robust_list <=3

How is that supposed to work and why would the 'head' argument ever have
any of those bits set in the first place?

>  void exit_robust_list(struct task_struct *curr)
>  {
> -	struct robust_list_head __user *head = curr->robust_list;
> -	struct robust_list __user *entry, *next_entry, *pending;
> -	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
> -	unsigned int uninitialized_var(next_pi);
> -	unsigned long futex_offset;
> +	struct robust_list_head __user *head_ptr = curr->robust_list & ~1UL;
> +	unsigned int is_extended_list = (curr->robust_list & 1) == 0;
> +	struct extended_robust_list_head head;
> +	struct robust_list __user *entry = &head->list_head.list.next,
> +					   *next_entry, *pending;
> +	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip, second_abi,
> +			     second_abip;
> +	unsigned int uninitialized_var(next_pi),
> +		     uninitialized_var(next_second_abi);
>  	int rc;
>  
>  	if (!futex_cmpxchg_enabled)
>  		return;
>  
>  	/*
> -	 * Fetch the list head (which was registered earlier, via
> -	 * sys_set_robust_list()):
> +	 * fetch_robust_entry code is duplicated here to avoid excessive calls
> +	 * to get_user()
>  	 */
> -	if (fetch_robust_entry(&entry, &head->list.next, &pi))
> -		return;
> -	/*
> -	 * Fetch the relative futex offset:
> -	 */
> -	if (get_user(futex_offset, &head->futex_offset))
> -		return;
> -	/*
> -	 * Fetch any possibly pending lock-add first, and handle it
> -	 * if it exists:
> -	 */
> -	if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
> -		return;
> +	if (is_extended_list) {
> +		if (get_user(head, (struct extended_robust_list_head *)
> +			     head_ptr))
> +			return;
> +	} else {
> +		if (get_user(head.list_head, head_ptr))
> +			return;
> +	}
> +
> +	pi = head.list_head.list.next & 1;
> +	second_abi = head.list_head.list.next & 0b10;

That's truly consistent. For current->robust_list you use bit 0 to
determine whether it is the extended one, but in user space you use bit 1
because bit 0 is already occupied for PI.

> +	head.list_head.list.next &= ~0b11UL;
> +	pip = head.list_head.list_op_pending & 1;
> +	second_abip = head.list_head.list_op_pending & 0b10;
> +	head.list_head.list_op_pending &= ~0b11UL;

So you are seriously trying to mix both ABIs in one robust list? Why on
earth?

One thread can hardly use two different libraries accessing the robust list
with two different versions of the ABI. That's just a recipe for
disaster. Futexes are complicated enough and the robust list is a fragile
beast to begin with.

To be honest. The result of this approach is just unreadable garbage and
this combo patch does too many things in one go to be reviewable at all.

Please provide a proper design and explain how this should work together
with existing robust list using code first.

Thanks,

	tglx


* Re: [RFC v2 PATCH] futex: extend set_robust_list to allow 2 locking ABIs at the same time.
  2019-11-04 15:37 ` Thomas Gleixner
@ 2019-11-05  0:10   ` Thomas Gleixner
  0 siblings, 0 replies; 29+ messages in thread
From: Thomas Gleixner @ 2019-11-05  0:10 UTC (permalink / raw)
  To: Shawn Landden
  Cc: libc-alpha, linux-api, LKML, Arnd Bergmann, Deepa Dinamani,
	Oleg Nesterov, Andrew Morton, Catalin Marinas, Keith Packard,
	Ingo Molnar, Peter Zijlstra, Darren Hart

On Mon, 4 Nov 2019, Thomas Gleixner wrote:
> Please provide a proper design and explain how this should work together
> with existing robust list using code first.

Just for clarification:

Any change to that interface needs to be discussed with the *libc people
first. Your way of overriding the robust list interface with some ad hoc
solution will break existing users which is not acceptable.

So either the problem is solved with the *libc people in a way which allows
them to build upon or if your intention is solely to solve the problem
Keith described then none of this is required at all as user space can
handle the different layouts for that magic fence implementation perfectly
fine.

Thanks,

	tglx



* Re: [RFC v2 PATCH] futex: extend set_robust_list to allow 2 locking ABIs at the same time.
  2019-11-04  0:29 [RFC v2 PATCH] futex: extend set_robust_list to allow 2 locking ABIs at the same time Shawn Landden
  2019-11-04  0:51 ` Shawn Landden
  2019-11-04 15:37 ` Thomas Gleixner
@ 2019-11-05  9:48 ` Florian Weimer
  2019-11-05  9:59   ` Thomas Gleixner
  2 siblings, 1 reply; 29+ messages in thread
From: Florian Weimer @ 2019-11-05  9:48 UTC (permalink / raw)
  To: Shawn Landden
  Cc: Thomas Gleixner, libc-alpha, linux-api, linux-kernel,
	Arnd Bergmann, Deepa Dinamani, Oleg Nesterov, Andrew Morton,
	Catalin Marinas, Keith Packard

* Shawn Landden:

> If this new ABI is used, then bit 1 of the *next pointer of the
> user-space robust_list indicates that the futex_offset2 value should
> be used in place of the existing futex_offset.

The futex interface currently has some races which can only be fixed by
API changes.  I'm concerned that we sacrifice the last bit for some
rather obscure feature.  What if we need that bit for fixing the
correctness issues?

Thanks,
Florian



* Re: [RFC v2 PATCH] futex: extend set_robust_list to allow 2 locking ABIs at the same time.
  2019-11-05  9:48 ` Florian Weimer
@ 2019-11-05  9:59   ` Thomas Gleixner
  2019-11-05 10:06     ` Florian Weimer
  2019-11-05 15:27     ` handle_exit_race && PF_EXITING Oleg Nesterov
  0 siblings, 2 replies; 29+ messages in thread
From: Thomas Gleixner @ 2019-11-05  9:59 UTC (permalink / raw)
  To: Florian Weimer
  Cc: Shawn Landden, libc-alpha, linux-api, LKML, Arnd Bergmann,
	Deepa Dinamani, Oleg Nesterov, Andrew Morton, Catalin Marinas,
	Keith Packard, Peter Zijlstra

On Tue, 5 Nov 2019, Florian Weimer wrote:
> * Shawn Landden:
> > If this new ABI is used, then bit 1 of the *next pointer of the
> > user-space robust_list indicates that the futex_offset2 value should
> > be used in place of the existing futex_offset.
> 
> The futex interface currently has some races which can only be fixed by
> API changes.  I'm concerned that we sacrifice the last bit for some
> rather obscure feature.  What if we need that bit for fixing the
> correctness issues?

That current approach is going nowhere and if we change the ABI ever then
this needs to happen with all *libc folks involved and agreeing.

Out of curiosity, what's the race issue vs. robust list which you are
trying to solve?

Thanks,

	tglx


* Re: [RFC v2 PATCH] futex: extend set_robust_list to allow 2 locking ABIs at the same time.
  2019-11-05  9:59   ` Thomas Gleixner
@ 2019-11-05 10:06     ` Florian Weimer
  2019-11-05 11:56       ` Thomas Gleixner
  2019-11-05 15:27     ` handle_exit_race && PF_EXITING Oleg Nesterov
  1 sibling, 1 reply; 29+ messages in thread
From: Florian Weimer @ 2019-11-05 10:06 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Shawn Landden, libc-alpha, linux-api, LKML, Arnd Bergmann,
	Deepa Dinamani, Oleg Nesterov, Andrew Morton, Catalin Marinas,
	Keith Packard, Peter Zijlstra

* Thomas Gleixner:

> On Tue, 5 Nov 2019, Florian Weimer wrote:
>> * Shawn Landden:
>> > If this new ABI is used, then bit 1 of the *next pointer of the
>> > user-space robust_list indicates that the futex_offset2 value should
>> > be used in place of the existing futex_offset.
>> 
>> The futex interface currently has some races which can only be fixed by
>> API changes.  I'm concerned that we sacrifice the last bit for some
>> rather obscure feature.  What if we need that bit for fixing the
>> correctness issues?
>
> That current approach is going nowhere and if we change the ABI ever then
> this needs to happen with all *libc folks involved and agreeing.
>
> Out of curiosity, what's the race issue vs. robust list which you are
> trying to solve?

Sadly I'm not trying to solve them.  Here's one of the issues:

  <https://sourceware.org/bugzilla/show_bug.cgi?id=14485>

I think there are others, but I can't find a reference to them.  If
anyone wants to work on this, I can dig out the details and ask some
folks who have looked at this in the past.

Thanks,
Florian



* Re: [RFC v2 PATCH] futex: extend set_robust_list to allow 2 locking ABIs at the same time.
  2019-11-05 10:06     ` Florian Weimer
@ 2019-11-05 11:56       ` Thomas Gleixner
  2019-11-05 14:10         ` Carlos O'Donell
  0 siblings, 1 reply; 29+ messages in thread
From: Thomas Gleixner @ 2019-11-05 11:56 UTC (permalink / raw)
  To: Florian Weimer
  Cc: Shawn Landden, libc-alpha, linux-api, LKML, Arnd Bergmann,
	Deepa Dinamani, Oleg Nesterov, Andrew Morton, Catalin Marinas,
	Keith Packard, Peter Zijlstra

On Tue, 5 Nov 2019, Florian Weimer wrote:
> * Thomas Gleixner:
> > On Tue, 5 Nov 2019, Florian Weimer wrote:
> >> * Shawn Landden:
> >> > If this new ABI is used, then bit 1 of the *next pointer of the
> >> > user-space robust_list indicates that the futex_offset2 value should
> >> > be used in place of the existing futex_offset.
> >> 
> >> The futex interface currently has some races which can only be fixed by
> >> API changes.  I'm concerned that we sacrifice the last bit for some
> >> rather obscure feature.  What if we need that bit for fixing the
> >> correctness issues?
> >
> > That current approach is going nowhere and if we change the ABI ever then
> > this needs to happen with all *libc folks involved and agreeing.
> >
> > Out of curiosity, what's the race issue vs. robust list which you are
> > trying to solve?
> 
> Sadly I'm not trying to solve them.  Here's one of the issues:
> 
>   <https://sourceware.org/bugzilla/show_bug.cgi?id=14485>

That one seems more of a lifetime problem, i.e. the mutex is destroyed,
memory freed and map address reused while another thread was not yet out of
the mutex_unlock() call. Nasty.
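
A rough sketch of that window (a hypothetical interleaving following the
description above, not literal glibc internals):

	#include <pthread.h>
	#include <sys/mman.h>

	static pthread_mutex_t *m;	/* lives in a shared mapping */

	static void *thread_a(void *unused)
	{
		pthread_mutex_unlock(m);
		/* The implementation may still touch *m (e.g. the final
		 * FUTEX_WAKE) after the atomic release inside unlock, so
		 * thread B can already observe the mutex as free here. */
		return NULL;
	}

	static void *thread_b(void *unused)
	{
		pthread_mutex_lock(m);		/* succeeds once A releases */
		pthread_mutex_unlock(m);
		pthread_mutex_destroy(m);
		munmap(m, sizeof(*m));		/* memory freed ... */
		/* ... and if a new mapping reuses that address, A's late
		 * access lands on unrelated data. */
		return NULL;
	}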

Thanks,

	tglx

 


* Re: [RFC v2 PATCH] futex: extend set_robust_list to allow 2 locking ABIs at the same time.
  2019-11-05 11:56       ` Thomas Gleixner
@ 2019-11-05 14:10         ` Carlos O'Donell
  2019-11-05 14:27           ` Florian Weimer
  2019-11-05 14:27           ` Thomas Gleixner
  0 siblings, 2 replies; 29+ messages in thread
From: Carlos O'Donell @ 2019-11-05 14:10 UTC (permalink / raw)
  To: Thomas Gleixner, Florian Weimer
  Cc: Shawn Landden, libc-alpha, linux-api, LKML, Arnd Bergmann,
	Deepa Dinamani, Oleg Nesterov, Andrew Morton, Catalin Marinas,
	Keith Packard, Peter Zijlstra

On 11/5/19 6:56 AM, Thomas Gleixner wrote:
> On Tue, 5 Nov 2019, Florian Weimer wrote:
>> * Thomas Gleixner:
>>> On Tue, 5 Nov 2019, Florian Weimer wrote:
>>>> * Shawn Landden:
>>>>> If this new ABI is used, then bit 1 of the *next pointer of the
>>>>> user-space robust_list indicates that the futex_offset2 value should
>>>>> be used in place of the existing futex_offset.
>>>>
>>>> The futex interface currently has some races which can only be fixed by
>>>> API changes.  I'm concerned that we sacrifice the last bit for some
>>>> rather obscure feature.  What if we need that bit for fixing the
>>>> correctness issues?
>>>
>>> That current approach is going nowhere and if we change the ABI ever then
>>> this needs to happen with all *libc folks involved and agreeing.
>>>
>>> Out of curiosity, what's the race issue vs. robust list which you are
>>> trying to solve?
>>
>> Sadly I'm not trying to solve them.  Here's one of the issues:
>>
>>   <https://sourceware.org/bugzilla/show_bug.cgi?id=14485>
> 
> That one seems more of a lifetime problem, i.e. the mutex is destroyed,
> memory freed and map address reused while another thread was not yet out of
> the mutex_unlock() call. Nasty.

It is difficult to fix.

The other issue is this:

"Robust mutexes do not take ROBUST_LIST_LIMIT into account"
https://sourceware.org/bugzilla/show_bug.cgi?id=19089

-- 
Cheers,
Carlos.


* Re: [RFC v2 PATCH] futex: extend set_robust_list to allow 2 locking ABIs at the same time.
  2019-11-05 14:10         ` Carlos O'Donell
@ 2019-11-05 14:27           ` Florian Weimer
  2019-11-05 14:53             ` Thomas Gleixner
  2019-11-05 14:27           ` Thomas Gleixner
  1 sibling, 1 reply; 29+ messages in thread
From: Florian Weimer @ 2019-11-05 14:27 UTC (permalink / raw)
  To: Carlos O'Donell
  Cc: Thomas Gleixner, Shawn Landden, libc-alpha, linux-api, LKML,
	Arnd Bergmann, Deepa Dinamani, Oleg Nesterov, Andrew Morton,
	Catalin Marinas, Keith Packard, Peter Zijlstra

* Carlos O'Donell:

> On 11/5/19 6:56 AM, Thomas Gleixner wrote:
>> On Tue, 5 Nov 2019, Florian Weimer wrote:
>>> * Thomas Gleixner:
>>>> On Tue, 5 Nov 2019, Florian Weimer wrote:
>>>>> * Shawn Landden:
>>>>>> If this new ABI is used, then bit 1 of the *next pointer of the
>>>>>> user-space robust_list indicates that the futex_offset2 value should
>>>>>> be used in place of the existing futex_offset.
>>>>>
>>>>> The futex interface currently has some races which can only be fixed by
>>>>> API changes.  I'm concerned that we sacrifice the last bit for some
>>>>> rather obscure feature.  What if we need that bit for fixing the
>>>>> correctness issues?
>>>>
>>>> That current approach is going nowhere and if we change the ABI ever then
>>>> this needs to happen with all *libc folks involved and agreeing.
>>>>
>>>> Out of curiosity, what's the race issue vs. robust list which you are
>>>> trying to solve?
>>>
>>> Sadly I'm not trying to solve them.  Here's one of the issues:
>>>
>>>   <https://sourceware.org/bugzilla/show_bug.cgi?id=14485>
>> 
>> That one seems more of a lifetime problem, i.e. the mutex is destroyed,
>> memory freed and map address reused while another thread was not yet out of
>> the mutex_unlock() call. Nasty.
>
> It is difficult to fix.
>
> The other issue is this:
>
> "Robust mutexes do not take ROBUST_LIST_LIMIT into account"
> https://sourceware.org/bugzilla/show_bug.cgi?id=19089

That's just a missing check in our implementation and something that few
applications will encounter, if any.  There is this one here:

  <https://sourceware.org/bugzilla/show_bug.cgi?id=19004>

It contains a kernel patch.

I thought that there were more issues in the current implementation, but
I can't find a record of them. 8-(

Thanks,
Florian



* Re: [RFC v2 PATCH] futex: extend set_robust_list to allow 2 locking ABIs at the same time.
  2019-11-05 14:10         ` Carlos O'Donell
  2019-11-05 14:27           ` Florian Weimer
@ 2019-11-05 14:27           ` Thomas Gleixner
  2019-11-05 14:33             ` Florian Weimer
  2019-11-06 14:00             ` Zack Weinberg
  1 sibling, 2 replies; 29+ messages in thread
From: Thomas Gleixner @ 2019-11-05 14:27 UTC (permalink / raw)
  To: Carlos O'Donell
  Cc: Florian Weimer, Shawn Landden, libc-alpha, linux-api, LKML,
	Arnd Bergmann, Deepa Dinamani, Oleg Nesterov, Andrew Morton,
	Catalin Marinas, Keith Packard, Peter Zijlstra

On Tue, 5 Nov 2019, Carlos O'Donell wrote:
> On 11/5/19 6:56 AM, Thomas Gleixner wrote:
> The other issue is this:
> 
> "Robust mutexes do not take ROBUST_LIST_LIMIT into account"
> https://sourceware.org/bugzilla/show_bug.cgi?id=19089

  "The kernel limits the length of the robust mutex list to 2048 entries.
   This constant does not seem to be exported to user space."

FWIW, the constant is defined in the UAPI futex header.
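
From memory it reads as follows (worth double-checking against the tree):

	#define ROBUST_LIST_LIMIT	2048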

The main concern here is not the actual number of futexes held by a task.

The real issue is that the robust list could be circular by incident or
malice and there is no way for the kernel to figure that out. That would
prevent the task from exiting and make it iterate over the list until
doomsday, i.e. a nice unprivileged DoS.

So I fear the kernel cannot really help with this one.
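
The exit-time walk copes by capping how many entries it visits; a cycle
is never detected, only cut off at ROBUST_LIST_LIMIT. A tiny
self-contained illustration of that bounded-walk idea (plain user-space
C; the names and the demo list are made up, nothing here is actual
kernel code):

	#include <stdio.h>

	#define LIMIT 2048			/* stand-in for ROBUST_LIST_LIMIT */

	struct node { struct node *next; };

	/* Bounded walk: a circular list is not detected, only cut off. */
	static int walk(struct node *head)
	{
		struct node *entry = head->next;
		unsigned int limit = LIMIT;
		int handled = 0;

		while (entry != head) {
			handled++;		/* stands in for handle_futex_death() */
			entry = entry->next;
			if (!--limit)
				break;		/* circular or overlong list */
		}
		return handled;
	}

	int main(void)
	{
		struct node h, a, b;

		h.next = &a;
		a.next = &b;
		b.next = &a;			/* malicious cycle, never returns to head */
		printf("handled %d entries\n", walk(&h));
		return 0;
	}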

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [RFC v2 PATCH] futex: extend set_robust_list to allow 2 locking ABIs at the same time.
  2019-11-05 14:27           ` Thomas Gleixner
@ 2019-11-05 14:33             ` Florian Weimer
  2019-11-05 14:48               ` Thomas Gleixner
  2019-11-06 14:00             ` Zack Weinberg
  1 sibling, 1 reply; 29+ messages in thread
From: Florian Weimer @ 2019-11-05 14:33 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Carlos O'Donell, Shawn Landden, libc-alpha, linux-api, LKML,
	Arnd Bergmann, Deepa Dinamani, Oleg Nesterov, Andrew Morton,
	Catalin Marinas, Keith Packard, Peter Zijlstra

* Thomas Gleixner:

> On Tue, 5 Nov 2019, Carlos O'Donell wrote:
>> On 11/5/19 6:56 AM, Thomas Gleixner wrote:
>> The other issue is this:
>> 
>> "Robust mutexes do not take ROBUST_LIST_LIMIT into account"
>> https://sourceware.org/bugzilla/show_bug.cgi?id=19089
>
>   "The kernel limits the length of the robust mutex list to 2048 entries.
>    This constant does not seem to be exported to user space."
>
> FWIW, the constant is defined in the UAPI futex header.
>
> The main concern here is not the actual number of futexes held by a task.
>
> The real issue is that the robust list could be circular by incident or
> malice and there is no way for the kernel to figure that out. That would
> prevent the task from exiting and make it iterate over the list until
> doomsday, i.e. a nice unprivileged DoS.
>
> So I fear the kernel cannot really help with this one.

I'm actually fine with treating ROBUST_LIST_LIMIT as an ABI constant.
It's just not clear to me if the constant has this status today.  I
suspect it was just split from the implementation headers at one point.
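
If it is declared ABI, the missing check from bug 19089 is simple to add
on our side: refuse to acquire more robust mutexes than the kernel will
walk at exit. Roughly this kind of accounting (hypothetical names, a
sketch, not actual glibc code):

	#include <errno.h>

	#define ROBUST_LIST_LIMIT 2048	/* from <linux/futex.h> */

	/* Hypothetical per-thread count of robust mutexes currently
	 * enqueued on this thread's robust list. */
	static __thread unsigned int robust_held;

	static int robust_enqueue_check(void)
	{
		/* Past the limit the kernel silently stops walking the
		 * list at exit and robustness is lost; fail loudly. */
		if (robust_held >= ROBUST_LIST_LIMIT)
			return EAGAIN;
		robust_held++;
		return 0;
	}

	static void robust_dequeue(void)
	{
		robust_held--;
	}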

Thanks,
Florian


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [RFC v2 PATCH] futex: extend set_robust_list to allow 2 locking ABIs at the same time.
  2019-11-05 14:33             ` Florian Weimer
@ 2019-11-05 14:48               ` Thomas Gleixner
  0 siblings, 0 replies; 29+ messages in thread
From: Thomas Gleixner @ 2019-11-05 14:48 UTC (permalink / raw)
  To: Florian Weimer
  Cc: Carlos O'Donell, Shawn Landden, libc-alpha, linux-api, LKML,
	Arnd Bergmann, Deepa Dinamani, Oleg Nesterov, Andrew Morton,
	Catalin Marinas, Keith Packard, Peter Zijlstra

On Tue, 5 Nov 2019, Florian Weimer wrote:
> * Thomas Gleixner:
> 
> > On Tue, 5 Nov 2019, Carlos O'Donell wrote:
> >> On 11/5/19 6:56 AM, Thomas Gleixner wrote:
> >> The other issue is this:
> >> 
> >> "Robust mutexes do not take ROBUST_LIST_LIMIT into account"
> >> https://sourceware.org/bugzilla/show_bug.cgi?id=19089
> >
> >   "The kernel limits the length of the robust mutex list to 2048 entries.
> >    This constant does not seem to be exported to user space."
> >
> > FWIW, the constant is defined in the UAPI futex header.
> >
> > The main concern here is not the actual number of futexes held by a task.
> >
> > The real issue is that the robust list could be circular by incident or
> > malice and there is no way for the kernel to figure that out. That would
> > prevent the task from exiting and make it iterate over the list until
> > doomsday, i.e. a nice unprivileged DoS.
> >
> > So I fear the kernel cannot really help with this one.
> 
> I'm actually fine with treating ROBUST_LIST_LIMIT as an ABI constant.
> It's just not clear to me if the constant has this status today.  I
> suspect it was just split from the implementation headers at one point.

Yes, but we really can declare it as an ABI constant.

I think the limit is reasonably sized. But I'm not familiar with the lock
nesting expectations of insanely big enterprise applications.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [RFC v2 PATCH] futex: extend set_robust_list to allow 2 locking ABIs at the same time.
  2019-11-05 14:27           ` Florian Weimer
@ 2019-11-05 14:53             ` Thomas Gleixner
  0 siblings, 0 replies; 29+ messages in thread
From: Thomas Gleixner @ 2019-11-05 14:53 UTC (permalink / raw)
  To: Florian Weimer
  Cc: Carlos O'Donell, Shawn Landden, libc-alpha, linux-api, LKML,
	Arnd Bergmann, Deepa Dinamani, Oleg Nesterov, Andrew Morton,
	Catalin Marinas, Keith Packard, Peter Zijlstra

On Tue, 5 Nov 2019, Florian Weimer wrote:
> * Carlos O'Donell:
> > "Robust mutexes do not take ROBUST_LIST_LIMIT into account"
> > https://sourceware.org/bugzilla/show_bug.cgi?id=19089
> 
> That's just a missing check in our implementation and something that few
> applications will encounter, if any.  There is this one here:
> 
>   <https://sourceware.org/bugzilla/show_bug.cgi?id=19004>
> 
> It contains a kernel patch.
> 
> I thought that there were more issues in the current implementation, but
> I can't find a record of them. 8-(

There is a nasty one in my inbox with a kernel patch fixing it, which I
still need to review with all futex brain cells activated:

  https://lore.kernel.org/r/1572573789-16557-1-git-send-email-wang.yi59@zte.com.cn

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 29+ messages in thread

* handle_exit_race && PF_EXITING
  2019-11-05  9:59   ` Thomas Gleixner
  2019-11-05 10:06     ` Florian Weimer
@ 2019-11-05 15:27     ` Oleg Nesterov
  2019-11-05 17:28       ` Thomas Gleixner
  1 sibling, 1 reply; 29+ messages in thread
From: Oleg Nesterov @ 2019-11-05 15:27 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Florian Weimer, Shawn Landden, libc-alpha, linux-api, LKML,
	Arnd Bergmann, Deepa Dinamani, Andrew Morton, Catalin Marinas,
	Keith Packard, Peter Zijlstra

On 11/05, Thomas Gleixner wrote:
>
> Out of curiosity, what's the race issue vs. robust list which you are
> trying to solve?

Off-topic, but this reminds me...

	#include <sched.h>
	#include <assert.h>
	#include <unistd.h>
	#include <syscall.h>

	#define FUTEX_LOCK_PI		6

	int main(void)
	{
		struct sched_param sp = {};

		sp.sched_priority = 2;
		assert(sched_setscheduler(0, SCHED_FIFO, &sp) == 0);

		int lock = vfork();	/* parent resumes with lock == child TID */
		if (!lock) {
			/* child: drop below the parent's priority, then exit */
			sp.sched_priority = 1;
			assert(sched_setscheduler(0, SCHED_FIFO, &sp) == 0);
			_exit(0);
		}

		/* the futex word now holds the TID of the exiting child */
		syscall(__NR_futex, &lock, FUTEX_LOCK_PI, 0,0,0);
		return 0;
	}

this creates the unkillable RT process spinning in futex_lock_pi() on
a single CPU machine (or you can use taskset).
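
(To reproduce, saved as, say, repro.c, something like:

	$ gcc repro.c -o repro
	$ sudo taskset -c 0 ./repro

assuming root for SCHED_FIFO.)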

Probably the patch below makes sense anyway, but of course it doesn't
solve the real problem: futex_lock_pi() should not spin in this case.

It seems to me I even sent a fix long ago, but I can't recall what
exactly it did. Probably the PF_EXITING check in attach_to_pi_owner()
must simply die, I'll try to recall...

Oleg.

--- x/kernel/futex.c
+++ x/kernel/futex.c
@@ -2842,10 +2842,12 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
 			 *   exit to complete.
 			 * - The user space value changed.
 			 */
-			queue_unlock(hb);
-			put_futex_key(&q.key);
-			cond_resched();
-			goto retry;
+			if (!fatal_signal_pending(current)) {
+				queue_unlock(hb);
+				put_futex_key(&q.key);
+				cond_resched();
+				goto retry;
+			}
 		default:
 			goto out_unlock_put_key;
 		}


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: handle_exit_race && PF_EXITING
  2019-11-05 15:27     ` handle_exit_race && PF_EXITING Oleg Nesterov
@ 2019-11-05 17:28       ` Thomas Gleixner
  2019-11-05 17:59         ` Thomas Gleixner
  0 siblings, 1 reply; 29+ messages in thread
From: Thomas Gleixner @ 2019-11-05 17:28 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Florian Weimer, Shawn Landden, libc-alpha, linux-api, LKML,
	Arnd Bergmann, Deepa Dinamani, Andrew Morton, Catalin Marinas,
	Keith Packard, Peter Zijlstra

On Tue, 5 Nov 2019, Oleg Nesterov wrote:
> On 11/05, Thomas Gleixner wrote:
> >
> > Out of curiosity, what's the race issue vs. robust list which you are
> > trying to solve?
> 
> Off-topic, but this reminds me...
> 
> 	#include <sched.h>
> 	#include <assert.h>
> 	#include <unistd.h>
> 	#include <syscall.h>
> 
> 	#define FUTEX_LOCK_PI		6
> 
> 	int main(void)
> 	{
> 		struct sched_param sp = {};
> 
> 		sp.sched_priority = 2;
> 		assert(sched_setscheduler(0, SCHED_FIFO, &sp) == 0);
> 
> 		int lock = vfork();
> 		if (!lock) {
> 			sp.sched_priority = 1;
> 			assert(sched_setscheduler(0, SCHED_FIFO, &sp) == 0);
> 			_exit(0);
> 		}
> 
> 		syscall(__NR_futex, &lock, FUTEX_LOCK_PI, 0,0,0);
> 		return 0;
> 	}
> 
> this creates the unkillable RT process spinning in futex_lock_pi() on
> a single CPU machine (or you can use taskset).

Uuurgh.

> Probably the patch below makes sense anyway, but of course it doesn't
> solve the real problem: futex_lock_pi() should not spin in this case.

Obviously not.

> It seems to me I even sent the fix a long ago, but I can't recall what
> exactly it did. Probably the PF_EXITING check in attach_to_pi_owner()
> must simply die, I'll try to recall...

We can't do that. The task might have released the futex already, so the
waiter would operate on inconsistent state :(

The problem with that exit race is that there is no form of serialization
which might be used to address that. We can't abuse any of the task locks
for this.

I was working on a patch set some time ago to fix another more esoteric
futex exit issue. Let me resurrect that.

Vs. the signal pending check. That makes sense on its own, but we should
not restrict it to fatal signals. Futexes are interruptible by definition.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: handle_exit_race && PF_EXITING
  2019-11-05 17:28       ` Thomas Gleixner
@ 2019-11-05 17:59         ` Thomas Gleixner
  2019-11-05 18:56           ` Thomas Gleixner
  0 siblings, 1 reply; 29+ messages in thread
From: Thomas Gleixner @ 2019-11-05 17:59 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Florian Weimer, Shawn Landden, libc-alpha, linux-api, LKML,
	Arnd Bergmann, Deepa Dinamani, Andrew Morton, Catalin Marinas,
	Keith Packard, Peter Zijlstra

On Tue, 5 Nov 2019, Thomas Gleixner wrote:

> On Tue, 5 Nov 2019, Oleg Nesterov wrote:
> > On 11/05, Thomas Gleixner wrote:
> > >
> > > Out of curiosity, what's the race issue vs. robust list which you are
> > > trying to solve?
> > 
> > Off-topic, but this reminds me...
> > 
> > 	#include <sched.h>
> > 	#include <assert.h>
> > 	#include <unistd.h>
> > 	#include <syscall.h>
> > 
> > 	#define FUTEX_LOCK_PI		6
> > 
> > 	int main(void)
> > 	{
> > 		struct sched_param sp = {};
> > 
> > 		sp.sched_priority = 2;
> > 		assert(sched_setscheduler(0, SCHED_FIFO, &sp) == 0);
> > 
> > 		int lock = vfork();
> > 		if (!lock) {
> > 			sp.sched_priority = 1;
> > 			assert(sched_setscheduler(0, SCHED_FIFO, &sp) == 0);
> > 			_exit(0);
> > 		}
> > 
> > 		syscall(__NR_futex, &lock, FUTEX_LOCK_PI, 0,0,0);
> > 		return 0;
> > 	}
> > 
> > this creates the unkillable RT process spinning in futex_lock_pi() on
> > a single CPU machine (or you can use taskset).
> 
> Uuurgh.

But staring more at it. That's a scheduler bug.

parent	    	    	child

 set FIFO prio 2

 fork()	         ->	set FIFO prio 1
 		 	 sched_setscheduler(...)
			   return from syscall		<= BUG

 		 	_exit()

When the child lowers its priority from 2 to 1, then the parent _must_
preempt the child simply because the parent is now the top priority task on
that CPU. Child should never reach exit before the parent blocks on the
futex.

Peter?

What's even more disturbing is that even with that bug happening the child
is able to set PF_EXITING, but not PF_EXITPIDONE. That doesn't make sense.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: handle_exit_race && PF_EXITING
  2019-11-05 17:59         ` Thomas Gleixner
@ 2019-11-05 18:56           ` Thomas Gleixner
  2019-11-05 19:19             ` Thomas Gleixner
  0 siblings, 1 reply; 29+ messages in thread
From: Thomas Gleixner @ 2019-11-05 18:56 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Florian Weimer, Shawn Landden, libc-alpha, linux-api, LKML,
	Arnd Bergmann, Deepa Dinamani, Andrew Morton, Catalin Marinas,
	Keith Packard, Peter Zijlstra

On Tue, 5 Nov 2019, Thomas Gleixner wrote:
> On Tue, 5 Nov 2019, Thomas Gleixner wrote:
> > On Tue, 5 Nov 2019, Oleg Nesterov wrote:
> > > On 11/05, Thomas Gleixner wrote:
> > > >
> > > > Out of curiosity, what's the race issue vs. robust list which you are
> > > > trying to solve?
> > > 
> > > Off-topic, but this reminds me...
> > > 
> > > 	#include <sched.h>
> > > 	#include <assert.h>
> > > 	#include <unistd.h>
> > > 	#include <syscall.h>
> > > 
> > > 	#define FUTEX_LOCK_PI		6
> > > 
> > > 	int main(void)
> > > 	{
> > > 		struct sched_param sp = {};
> > > 
> > > 		sp.sched_priority = 2;
> > > 		assert(sched_setscheduler(0, SCHED_FIFO, &sp) == 0);
> > > 
> > > 		int lock = vfork();
> > > 		if (!lock) {
> > > 			sp.sched_priority = 1;
> > > 			assert(sched_setscheduler(0, SCHED_FIFO, &sp) == 0);
> > > 			_exit(0);
> > > 		}
> > > 
> > > 		syscall(__NR_futex, &lock, FUTEX_LOCK_PI, 0,0,0);
> > > 		return 0;
> > > 	}
> > > 
> > > this creates the unkillable RT process spinning in futex_lock_pi() on
> > > a single CPU machine (or you can use taskset).
> > 
> > Uuurgh.
> 
> But staring more at it. That's a scheduler bug.
> 
> parent	    	    	child
> 
>  set FIFO prio 2
> 
>  fork()	         ->	set FIFO prio 1
>  		 	 sched_setscheduler(...)
> 			   return from syscall		<= BUG
> 
>  		 	_exit()
> 
> When the child lowers its priority from 2 to 1, then the parent _must_
> preempt the child simply because the parent is now the top priority task on
> that CPU. Child should never reach exit before the parent blocks on the
> futex.

I'm a moron. It's vfork() not fork() so the behaviour is expected.

Staring more at the trace which shows me where this goes down the drain.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: handle_exit_race && PF_EXITING
  2019-11-05 18:56           ` Thomas Gleixner
@ 2019-11-05 19:19             ` Thomas Gleixner
  2019-11-06  8:55               ` Oleg Nesterov
  0 siblings, 1 reply; 29+ messages in thread
From: Thomas Gleixner @ 2019-11-05 19:19 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Florian Weimer, Shawn Landden, libc-alpha, linux-api, LKML,
	Arnd Bergmann, Deepa Dinamani, Andrew Morton, Catalin Marinas,
	Keith Packard, Peter Zijlstra

On Tue, 5 Nov 2019, Thomas Gleixner wrote:
> 
> I'm a moron. It's vfork() not fork() so the behaviour is expected.
> 
> Staring more at the trace which shows me where this goes down the drain.

 parent	    	    	child
 
  set FIFO prio 2
 
  vfork()			->	set FIFO prio 1
   implies wait_for_child()	 	sched_setscheduler(...)
 			   		exit()
					do_exit()
					tsk->flags |= PF_EXITING;
 					....
					mm_release()
					  exit_futex(); (NOOP in this case)
					  complete() --> wakes parent
 sys_futex()
    loop infinite because
    	 PF_EXITING is set,
	 but PF_EXITPIDONE not

So the obvious question is why PF_EXITPIDONE is set way after the futex
exit cleanup has run, but moving this right after exit_futex() would not
solve the exit race completely because the code after setting PF_EXITING is
preemptible. So the same crap could happen just by preemption:

  task holds futex
  ...
  do_exit()
    tsk->flags |= PF_EXITING;

preemption (unrelated wakeup of some other higher prio task, e.g. timer)

  switch_to(other_task)

  return to user
  sys_futex()
	loop infinite as above

And just for the fun of it the futex exit cleanup could trigger the wakeup
itself before PF_EXITPIDONE is set.

There is some other issue which I need to look up again. That's a slightly
different problem but related to futex exit race conditions.

The way we can deal with that is:

    do_exit()
    tsk->flags |= PF_EXITING;
    ...
    mutex_lock(&tsk->futex_exit_mutex);
    futex_exit();
    tsk->flags |= PF_EXITPIDONE;
    mutex_unlock(&tsk->futex_exit_mutex);
    
and on the futex lock_pi side:

    if (!(tsk->flags & PF_EXITING))
    	return 0;		<- All good

    if (tsk->flags & PF_EXITPIDONE)
        return -EOWNERDEAD;	<- Locker can take over

    mutex_lock(&tsk->futex_exit_mutex);
    if (tsk->flags & PF_EXITPIDONE) {
        mutex_unlock(&tsk->futex_exit_mutex);
        return -EOWNERDEAD;	<- Locker can take over
    }

    queue_futex();
    mutex_unlock(&tsk->futex_exit_mutex);

Not that I think it's pretty, but it plugs all holes AFAICT.
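
For illustration, the same check/lock/recheck pattern rendered as
self-contained user-space C (made-up names, pthreads standing in for the
kernel-side mutex; a sketch of the idea, not the eventual implementation):

	#include <pthread.h>
	#include <stdatomic.h>

	#define EXITING		0x1
	#define EXITPIDONE	0x2

	struct task {
		atomic_int flags;
		pthread_mutex_t exit_mutex;	/* PTHREAD_MUTEX_INITIALIZER */
	};

	static void task_exit(struct task *t)
	{
		atomic_fetch_or(&t->flags, EXITING);
		/* ... other exit work, possibly preempted here ... */
		pthread_mutex_lock(&t->exit_mutex);
		/* futex_exit(t): clean up held pi futexes */
		atomic_fetch_or(&t->flags, EXITPIDONE);
		pthread_mutex_unlock(&t->exit_mutex);
	}

	/* 0: owner alive, attach.  -1: cleanup done, caller takes over.
	 * 1: owner mid-exit, we queued and will be woken, no spinning. */
	static int attach_to_owner(struct task *t)
	{
		int flags = atomic_load(&t->flags);

		if (!(flags & EXITING))
			return 0;
		if (flags & EXITPIDONE)
			return -1;

		/* Block on the mutex until the exit cleanup finishes,
		 * instead of busy-looping on -EAGAIN. */
		pthread_mutex_lock(&t->exit_mutex);
		if (atomic_load(&t->flags) & EXITPIDONE) {
			pthread_mutex_unlock(&t->exit_mutex);
			return -1;
		}
		/* queue_futex(): cleanup has not started, it will see us */
		pthread_mutex_unlock(&t->exit_mutex);
		return 1;
	}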

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: handle_exit_race && PF_EXITING
  2019-11-05 19:19             ` Thomas Gleixner
@ 2019-11-06  8:55               ` Oleg Nesterov
  2019-11-06  9:53                 ` Thomas Gleixner
  0 siblings, 1 reply; 29+ messages in thread
From: Oleg Nesterov @ 2019-11-06  8:55 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Florian Weimer, Shawn Landden, libc-alpha, linux-api, LKML,
	Arnd Bergmann, Deepa Dinamani, Andrew Morton, Catalin Marinas,
	Keith Packard, Peter Zijlstra

On 11/05, Thomas Gleixner wrote:
>
>  sys_futex()
>     loop infinite because
>     	 PF_EXITING is set,
> 	 but PF_EXITPIDONE not

Yes.

IOW, the problem is very simple. RT task preempts the exiting lock owner
after it sets PF_EXITING but before it sets PF_EXITPIDONE; if they run on
the same CPU, futex_lock_pi() will spin forever.

> So the obvious question is why PF_EXITPIDONE is set way after the futex
> exit cleanup has run,

Another obvious question is why this code checks PF_EXITING. I still think
it should not.

> The way we can deal with that is:
>
>     do_exit()
>     tsk->flags |= PF_EXITING;
>     ...
>     mutex_lock(&tsk->futex_exit_mutex);
>     futex_exit();
>     tsk->flags |= PF_EXITPIDONE;
>     mutex_unlock(&tsk->futex_exit_mutex);
>
> and on the futex lock_pi side:
>
>     if (!(tsk->flags & PF_EXITING))
>     	return 0;		<- All good
>
>     if (tsk->flags & PF_EXITPIDONE)
>         return -EOWNERDEAD;	<- Locker can take over
>
>     mutex_lock(&tsk->futex_exit_mutex);
>     if (tsk->flags & PF_EXITPIDONE) {
>         mutex_unlock(&tsk->futex_exit_mutex);
>         return -EOWNERDEAD;	<- Locker can take over
>     }
>
>     queue_futex();
>     mutex_unlock(&tsk->futex_exit_mutex);
>
> Not that I think it's pretty, but it plugs all holes AFAICT.

I have found the fix I sent in 2015, attached below. I forgot everything
I knew about futex.c, so I need some time to adapt it to the current code.

But I think it is clear what this patch tries to do, do you see any hole?

Oleg.

[PATCH] futex: don't spin waiting for PF_EXITING -> PF_EXITPIDONE transition

It is absolutely not clear why attach_to_pi_owner() returns -EAGAIN which
triggers "retry" if the lock owner is PF_EXITING but not PF_EXITPIDONE.
This burns CPU for no reason and this can even livelock if the rt_task()
caller preempts a PF_EXITING owner.

Remove the PF_EXITING check altogether. We do not care if it is exiting,
all we need to know is whether we can rely on exit_pi_state_list() or not. So we
also need to set PF_EXITPIDONE before we flush ->pi_state_list and call
exit_pi_state_list() unconditionally.

Perhaps we can add the fast-path list_empty() check in mm_release() back,
but let's fix the problem first. Besides, in theory this check is already
not correct, at least it should be list_empty_careful() to avoid the race
with free_pi_state() in progress.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 kernel/exit.c  |   22 +---------------------
 kernel/fork.c  |    3 +--
 kernel/futex.c |   40 ++++++++++------------------------------
 3 files changed, 12 insertions(+), 53 deletions(-)

diff --git a/kernel/exit.c b/kernel/exit.c
index 6806c55..bc969ed 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -683,27 +683,13 @@ void do_exit(long code)
 	 */
 	if (unlikely(tsk->flags & PF_EXITING)) {
 		pr_alert("Fixing recursive fault but reboot is needed!\n");
-		/*
-		 * We can do this unlocked here. The futex code uses
-		 * this flag just to verify whether the pi state
-		 * cleanup has been done or not. In the worst case it
-		 * loops once more. We pretend that the cleanup was
-		 * done as there is no way to return. Either the
-		 * OWNER_DIED bit is set by now or we push the blocked
-		 * task into the wait for ever nirwana as well.
-		 */
+		/* Avoid the new additions to ->pi_state_list at least */
 		tsk->flags |= PF_EXITPIDONE;
 		set_current_state(TASK_UNINTERRUPTIBLE);
 		schedule();
 	}
 
 	exit_signals(tsk);  /* sets PF_EXITING */
-	/*
-	 * tsk->flags are checked in the futex code to protect against
-	 * an exiting task cleaning up the robust pi futexes.
-	 */
-	smp_mb();
-	raw_spin_unlock_wait(&tsk->pi_lock);
 
 	if (unlikely(in_atomic()))
 		pr_info("note: %s[%d] exited with preempt_count %d\n",
@@ -779,12 +765,6 @@ void do_exit(long code)
 	 * Make sure we are holding no locks:
 	 */
 	debug_check_no_locks_held();
-	/*
-	 * We can do this unlocked here. The futex code uses this flag
-	 * just to verify whether the pi state cleanup has been done
-	 * or not. In the worst case it loops once more.
-	 */
-	tsk->flags |= PF_EXITPIDONE;
 
 	if (tsk->io_context)
 		exit_io_context(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index 4dc2dda..ec3208e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -803,8 +803,7 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
 		tsk->compat_robust_list = NULL;
 	}
 #endif
-	if (unlikely(!list_empty(&tsk->pi_state_list)))
-		exit_pi_state_list(tsk);
+	exit_pi_state_list(tsk);
 #endif
 
 	uprobe_free_utask(tsk);
diff --git a/kernel/futex.c b/kernel/futex.c
index b101381..c1104a8 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -716,11 +716,13 @@ void exit_pi_state_list(struct task_struct *curr)
 
 	if (!futex_cmpxchg_enabled)
 		return;
+
 	/*
-	 * We are a ZOMBIE and nobody can enqueue itself on
-	 * pi_state_list anymore, but we have to be careful
-	 * versus waiters unqueueing themselves:
+	 * attach_to_pi_owner() can no longer add the new entry. But
+	 * we have to be careful versus waiters unqueueing themselves.
 	 */
+	curr->flags |= PF_EXITPIDONE;
+
 	raw_spin_lock_irq(&curr->pi_lock);
 	while (!list_empty(head)) {
 
@@ -905,24 +907,12 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
 		return -EPERM;
 	}
 
-	/*
-	 * We need to look at the task state flags to figure out,
-	 * whether the task is exiting. To protect against the do_exit
-	 * change of the task flags, we do this protected by
-	 * p->pi_lock:
-	 */
 	raw_spin_lock_irq(&p->pi_lock);
-	if (unlikely(p->flags & PF_EXITING)) {
-		/*
-		 * The task is on the way out. When PF_EXITPIDONE is
-		 * set, we know that the task has finished the
-		 * cleanup:
-		 */
-		int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN;
-
+	if (unlikely(p->flags & PF_EXITPIDONE)) {
+		/* exit_pi_state_list() was already called */
 		raw_spin_unlock_irq(&p->pi_lock);
 		put_task_struct(p);
-		return ret;
+		return -ESRCH;
 	}
 
 	/*
@@ -1633,12 +1623,7 @@ retry_private:
 				goto retry;
 			goto out;
 		case -EAGAIN:
-			/*
-			 * Two reasons for this:
-			 * - Owner is exiting and we just wait for the
-			 *   exit to complete.
-			 * - The user space value changed.
-			 */
+			/* The user space value changed. */
 			free_pi_state(pi_state);
 			pi_state = NULL;
 			double_unlock_hb(hb1, hb2);
@@ -2295,12 +2280,7 @@ retry_private:
 		case -EFAULT:
 			goto uaddr_faulted;
 		case -EAGAIN:
-			/*
-			 * Two reasons for this:
-			 * - Task is exiting and we just wait for the
-			 *   exit to complete.
-			 * - The user space value changed.
-			 */
+			/* The user space value changed. */
 			queue_unlock(hb);
 			put_futex_key(&q.key);
 			cond_resched();
-- 
1.5.5.1




^ permalink raw reply related	[flat|nested] 29+ messages in thread

* Re: handle_exit_race && PF_EXITING
  2019-11-06  8:55               ` Oleg Nesterov
@ 2019-11-06  9:53                 ` Thomas Gleixner
  2019-11-06 10:35                   ` Oleg Nesterov
  0 siblings, 1 reply; 29+ messages in thread
From: Thomas Gleixner @ 2019-11-06  9:53 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Florian Weimer, Shawn Landden, libc-alpha, linux-api, LKML,
	Arnd Bergmann, Deepa Dinamani, Andrew Morton, Catalin Marinas,
	Keith Packard, Peter Zijlstra

Oleg,

On Wed, 6 Nov 2019, Oleg Nesterov wrote:
> I have found the fix I sent in 2015, attached below. I forgot everything
> I knew about futex.c, so I need some time to adapt it to the current code.
> 
> But I think it is clear what this patch tries to do, do you see any hole?

> @@ -716,11 +716,13 @@ void exit_pi_state_list(struct task_struct *curr)
>  
>  	if (!futex_cmpxchg_enabled)
>  		return;
> +
>  	/*
> -	 * We are a ZOMBIE and nobody can enqueue itself on
> -	 * pi_state_list anymore, but we have to be careful
> -	 * versus waiters unqueueing themselves:
> +	 * attach_to_pi_owner() can no longer add the new entry. But
> +	 * we have to be careful versus waiters unqueueing themselves.
>  	 */
> +	curr->flags |= PF_EXITPIDONE;

This obviously would need a barrier or would have to be moved inside of the
pi_lock region.

>  	raw_spin_lock_irq(&curr->pi_lock);
>  	while (!list_empty(head)) {
>  
> @@ -905,24 +907,12 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
>  		return -EPERM;
>  	}
>  
> -	/*
> -	 * We need to look at the task state flags to figure out,
> -	 * whether the task is exiting. To protect against the do_exit
> -	 * change of the task flags, we do this protected by
> -	 * p->pi_lock:
> -	 */
>  	raw_spin_lock_irq(&p->pi_lock);
> -	if (unlikely(p->flags & PF_EXITING)) {
> -		/*
> -		 * The task is on the way out. When PF_EXITPIDONE is
> -		 * set, we know that the task has finished the
> -		 * cleanup:
> -		 */
> -		int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN;
> -
> +	if (unlikely(p->flags & PF_EXITPIDONE)) {
> +		/* exit_pi_state_list() was already called */
>  		raw_spin_unlock_irq(&p->pi_lock);
>  		put_task_struct(p);
> -		return ret;
> +		return -ESRCH;

But, this is incorrect because we'd return -ESRCH to user space while the
futex value still has the TID of the exiting task set which will
subsequently cleanout the futex and set the owner died bit.

The result is inconsistent state and will trigger the asserts in the futex
test suite and in the pthread_mutex implementation.

The only reason why -ESRCH can be returned is when the user space value of
the futex contains garbage. But in this case it does not contain garbage
and returning -ESRCH violates the implicit robustness guarantee of PI
futexes and causes unexpected havoc.

See da791a667536 ("futex: Cure exit race") for example.

The futex PI contract between kernel and user space relies on consistent
state. Guess why that code has more corner case handling than actual
functionality. :)

Thanks,

	tglx


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: handle_exit_race && PF_EXITING
  2019-11-06  9:53                 ` Thomas Gleixner
@ 2019-11-06 10:35                   ` Oleg Nesterov
  2019-11-06 11:07                     ` Thomas Gleixner
  0 siblings, 1 reply; 29+ messages in thread
From: Oleg Nesterov @ 2019-11-06 10:35 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Florian Weimer, Shawn Landden, libc-alpha, linux-api, LKML,
	Arnd Bergmann, Deepa Dinamani, Andrew Morton, Catalin Marinas,
	Keith Packard, Peter Zijlstra

On 11/06, Thomas Gleixner wrote:
>
> > @@ -716,11 +716,13 @@ void exit_pi_state_list(struct task_struct *curr)
> >
> >  	if (!futex_cmpxchg_enabled)
> >  		return;
> > +
> >  	/*
> > -	 * We are a ZOMBIE and nobody can enqueue itself on
> > -	 * pi_state_list anymore, but we have to be careful
> > -	 * versus waiters unqueueing themselves:
> > +	 * attach_to_pi_owner() can no longer add the new entry. But
> > +	 * we have to be careful versus waiters unqueueing themselves.
> >  	 */
> > +	curr->flags |= PF_EXITPIDONE;
>
> This obviously would need a barrier or would have to be moved inside of the
> pi_lock region.

probably yes,

> > +	if (unlikely(p->flags & PF_EXITPIDONE)) {
> > +		/* exit_pi_state_list() was already called */
> >  		raw_spin_unlock_irq(&p->pi_lock);
> >  		put_task_struct(p);
> > -		return ret;
> > +		return -ESRCH;
>
> But, this is incorrect because we'd return -ESRCH to user space while the
> futex value still has the TID of the exiting task set which will
> subsequently cleanout the futex and set the owner died bit.

Heh. Of course this is not correct. As I said, this patch should be adapted
to the current code. See below.

> See da791a667536 ("futex: Cure exit race") for example.

Thomas, I simply can't resist ;)

I reported this race when I sent this patch in 2015,

https://lore.kernel.org/lkml/20150205181014.GA20244@redhat.com/

but somehow that discussion died with no result.

> Guess why that code has more corner case handling than actual
> functionality. :)

I know why. To confuse me!

Oleg.


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: handle_exit_race && PF_EXITING
  2019-11-06 10:35                   ` Oleg Nesterov
@ 2019-11-06 11:07                     ` Thomas Gleixner
  2019-11-06 12:11                       ` Oleg Nesterov
  0 siblings, 1 reply; 29+ messages in thread
From: Thomas Gleixner @ 2019-11-06 11:07 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Florian Weimer, Shawn Landden, libc-alpha, linux-api, LKML,
	Arnd Bergmann, Deepa Dinamani, Andrew Morton, Catalin Marinas,
	Keith Packard, Peter Zijlstra

On Wed, 6 Nov 2019, Oleg Nesterov wrote:
> On 11/06, Thomas Gleixner wrote:
> > > +	if (unlikely(p->flags & PF_EXITPIDONE)) {
> > > +		/* exit_pi_state_list() was already called */
> > >  		raw_spin_unlock_irq(&p->pi_lock);
> > >  		put_task_struct(p);
> > > -		return ret;
> > > +		return -ESRCH;
> >
> > But, this is incorrect because we'd return -ESRCH to user space while the
> > futex value still has the TID of the exiting task set which will
> > subsequently cleanout the futex and set the owner died bit.
> 
> Heh. Of course this is not correct. As I said, this patch should be adapted
> to the current code. See below.
> 
> > See da791a667536 ("futex: Cure exit race") for example.
> 
> Thomas, I simply can't resist ;)
> 
> I reported this race when I sent this patch in 2015,
> 
> https://lore.kernel.org/lkml/20150205181014.GA20244@redhat.com/
> 
> but somehow that discussion died with no result.

Yes. I was not paying attention for some reason. Don't ask me what happened
in Feb. 2015 :)

But even if we adapt that patch to the current code it won't solve the
-ESRCH issue I described above.

> > Guess why that code has more corner case handling than actual
> > functionality. :)
> 
> I know why. To confuse me!

Of course. As Rusty said: "Futexes are also cursed"

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: handle_exit_race && PF_EXITING
  2019-11-06 11:07                     ` Thomas Gleixner
@ 2019-11-06 12:11                       ` Oleg Nesterov
  2019-11-06 13:38                         ` Thomas Gleixner
  2019-11-06 17:42                         ` Thomas Gleixner
  0 siblings, 2 replies; 29+ messages in thread
From: Oleg Nesterov @ 2019-11-06 12:11 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Florian Weimer, Shawn Landden, libc-alpha, linux-api, LKML,
	Arnd Bergmann, Deepa Dinamani, Andrew Morton, Catalin Marinas,
	Keith Packard, Peter Zijlstra

On 11/06, Thomas Gleixner wrote:
>
> But even if we adapt that patch to the current code it won't solve the
> -ESRCH issue I described above.

Hmm. I must have missed something. Why?

Please see the unfinished/untested patch below.

I think that (with or without this fix) handle_exit_race() logic needs
cleanups, there is no reason for get_futex_value_locked(), we can drop
->pi_lock right after we see PF_EXITPIDONE. Let's discuss this later.

But why can we not rely on handle_exit_race() without the PF_EXITING check?

Yes, exit_robust_list() is called before exit_pi_state_list() which (with
this patch) sets PF_EXITPIDONE. But this is fine? handle_futex_death()
doesn't wake up pi futexes, exit_pi_state_list() does this.

Could you explain?

Oleg.


diff --git a/kernel/exit.c b/kernel/exit.c
index a46a50d..a6c788c 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -761,17 +761,6 @@ void __noreturn do_exit(long code)
 	}
 
 	exit_signals(tsk);  /* sets PF_EXITING */
-	/*
-	 * Ensure that all new tsk->pi_lock acquisitions must observe
-	 * PF_EXITING. Serializes against futex.c:attach_to_pi_owner().
-	 */
-	smp_mb();
-	/*
-	 * Ensure that we must observe the pi_state in exit_mm() ->
-	 * mm_release() -> exit_pi_state_list().
-	 */
-	raw_spin_lock_irq(&tsk->pi_lock);
-	raw_spin_unlock_irq(&tsk->pi_lock);
 
 	if (unlikely(in_atomic())) {
 		pr_info("note: %s[%d] exited with preempt_count %d\n",
@@ -846,12 +835,6 @@ void __noreturn do_exit(long code)
 	 * Make sure we are holding no locks:
 	 */
 	debug_check_no_locks_held();
-	/*
-	 * We can do this unlocked here. The futex code uses this flag
-	 * just to verify whether the pi state cleanup has been done
-	 * or not. In the worst case it loops once more.
-	 */
-	tsk->flags |= PF_EXITPIDONE;
 
 	if (tsk->io_context)
 		exit_io_context(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index bcdf531..0d8edcf 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1297,8 +1297,7 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
 		tsk->compat_robust_list = NULL;
 	}
 #endif
-	if (unlikely(!list_empty(&tsk->pi_state_list)))
-		exit_pi_state_list(tsk);
+	exit_pi_state_list(tsk);
 #endif
 
 	uprobe_free_utask(tsk);
diff --git a/kernel/futex.c b/kernel/futex.c
index bd18f60..c3b5d33 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -905,6 +905,7 @@ void exit_pi_state_list(struct task_struct *curr)
 	 * versus waiters unqueueing themselves:
 	 */
 	raw_spin_lock_irq(&curr->pi_lock);
+	curr->flags |= PF_EXITPIDONE;
 	while (!list_empty(head)) {
 		next = head->next;
 		pi_state = list_entry(next, struct futex_pi_state, list);
@@ -1169,18 +1170,11 @@ static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
 	return ret;
 }
 
-static int handle_exit_race(u32 __user *uaddr, u32 uval,
-			    struct task_struct *tsk)
+static int handle_exit_race(u32 __user *uaddr, u32 uval)
 {
 	u32 uval2;
 
 	/*
-	 * If PF_EXITPIDONE is not yet set, then try again.
-	 */
-	if (tsk && !(tsk->flags & PF_EXITPIDONE))
-		return -EAGAIN;
-
-	/*
 	 * Reread the user space value to handle the following situation:
 	 *
 	 * CPU0				CPU1
@@ -1245,7 +1239,7 @@ static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
 		return -EAGAIN;
 	p = find_get_task_by_vpid(pid);
 	if (!p)
-		return handle_exit_race(uaddr, uval, NULL);
+		return handle_exit_race(uaddr, uval);
 
 	if (unlikely(p->flags & PF_KTHREAD)) {
 		put_task_struct(p);
@@ -1259,13 +1253,13 @@ static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
 	 * p->pi_lock:
 	 */
 	raw_spin_lock_irq(&p->pi_lock);
-	if (unlikely(p->flags & PF_EXITING)) {
+	if (unlikely(p->flags & PF_EXITPIDONE)) {
 		/*
 		 * The task is on the way out. When PF_EXITPIDONE is
 		 * set, we know that the task has finished the
 		 * cleanup:
 		 */
-		int ret = handle_exit_race(uaddr, uval, p);
+		int ret = handle_exit_race(uaddr, uval);
 
 		raw_spin_unlock_irq(&p->pi_lock);
 		put_task_struct(p);


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* Re: handle_exit_race && PF_EXITING
  2019-11-06 12:11                       ` Oleg Nesterov
@ 2019-11-06 13:38                         ` Thomas Gleixner
  2019-11-06 17:42                         ` Thomas Gleixner
  1 sibling, 0 replies; 29+ messages in thread
From: Thomas Gleixner @ 2019-11-06 13:38 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Florian Weimer, Shawn Landden, libc-alpha, linux-api, LKML,
	Arnd Bergmann, Deepa Dinamani, Andrew Morton, Catalin Marinas,
	Keith Packard, Peter Zijlstra

On Wed, 6 Nov 2019, Oleg Nesterov wrote:
> But why we can not rely on handle_exit_race() without PF_EXITING check?
> 
> Yes, exit_robust_list() is called before exit_pi_state_list() which (with
> this patch) sets PF_EXITPIDONE. But this is fine? handle_futex_death()
> doesn't wakeup pi futexes, exit_pi_state_list() does this.

I know. You still create inconsistent state because of this:

>  	raw_spin_lock_irq(&p->pi_lock);
> -	if (unlikely(p->flags & PF_EXITING)) {
> +	if (unlikely(p->flags & PF_EXITPIDONE)) {
>  		/*
>  		 * The task is on the way out. When PF_EXITPIDONE is
>  		 * set, we know that the task has finished the
>  		 * cleanup:
>  		 */
> -		int ret = handle_exit_race(uaddr, uval, p);
> +		int ret = handle_exit_race(uaddr, uval);
>  
>  		raw_spin_unlock_irq(&p->pi_lock);
>  		put_task_struct(p);

Same explanation as before just not prosa this time:

exit()					lock_pi(futex2)
  exit_pi_state_list()
   lock(tsk->pi_lock)
   tsk->flags |= PF_EXITPIDONE;		 attach_to_pi_owner()
					  ...
  // Loop unrolled for clarity
  while(!list_empty())			  lock(tsk->pi_lock);
     cleanup(futex1)
       unlock(tsk->pi_lock)
     ...			          if (tsk->flags & PF_EXITPIDONE)
					     ret = handle_exit_race()
					       if (uval != orig_uval)
					           return -EAGAIN;
					       return -ESRCH;

    cleanup(futex2)			  return to userspace err = -ESRCH
      update futex2->uval
      with new owner TID and set
      OWNER_DIED

    					 userspace handles -ESRCH

					 but futex2->uval has a valid TID
					 and OWNER_DIED set.

That's inconsistent state: the futex became a SNAFUtex and user space
cannot recover from that. At least not existing user space and we cannot
break that, right?

If the kernel returns -ESRCH then the futex value must be preserved (except
for the waiters bit being set, but that's not part of the problem at hand).

You cannot just look at the kernel state with futexes. We have to guarantee
consistent state between kernel and user space no matter what. And of
course we have to be careful about user space creating inconsistent state
for stupid or malicious reasons. See the whole state table above
attach_to_pi_state().

The only way to fix this live lock issue is to gracefully wait for the
cleanup to complete instead of busy looping.

Yes, it sucks, but futexes suck by definition.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [RFC v2 PATCH] futex: extend set_robust_list to allow 2 locking ABIs at the same time.
  2019-11-05 14:27           ` Thomas Gleixner
  2019-11-05 14:33             ` Florian Weimer
@ 2019-11-06 14:00             ` Zack Weinberg
  2019-11-06 14:04               ` Florian Weimer
  1 sibling, 1 reply; 29+ messages in thread
From: Zack Weinberg @ 2019-11-06 14:00 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Carlos O'Donell, Florian Weimer, Shawn Landden,
	GNU C Library, linux-api, LKML, Arnd Bergmann, Deepa Dinamani,
	Oleg Nesterov, Andrew Morton, Catalin Marinas, Keith Packard,
	Peter Zijlstra

On Tue, Nov 5, 2019 at 9:28 AM Thomas Gleixner <tglx@linutronix.de> wrote:
>
> The real issue is that the robust list could be circular by incident or
> malice and there is no way for the kernel to figure that out. That would
> prevent the task from exiting and make it iterate over the list until
> doomsday, i.e. a nice unpriviledged DoS.

Why can't the kernel use the standard tortoise-and-hare algorithm for
detecting circular linked lists here?

zw

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [RFC v2 PATCH] futex: extend set_robust_list to allow 2 locking ABIs at the same time.
  2019-11-06 14:00             ` Zack Weinberg
@ 2019-11-06 14:04               ` Florian Weimer
  0 siblings, 0 replies; 29+ messages in thread
From: Florian Weimer @ 2019-11-06 14:04 UTC (permalink / raw)
  To: Zack Weinberg
  Cc: Thomas Gleixner, Carlos O'Donell, Shawn Landden,
	GNU C Library, linux-api, LKML, Arnd Bergmann, Deepa Dinamani,
	Oleg Nesterov, Andrew Morton, Catalin Marinas, Keith Packard,
	Peter Zijlstra

* Zack Weinberg:

> On Tue, Nov 5, 2019 at 9:28 AM Thomas Gleixner <tglx@linutronix.de> wrote:
>>
>> The real issue is that the robust list could be circular by incident or
>> malice and there is no way for the kernel to figure that out. That would
>> prevent the task from exiting and make it iterate over the list until
>> doomsday, i.e. a nice unprivileged DoS.
>
> Why can't the kernel use the standard tortoise-and-hare algorithm for
> detecting circular linked lists here?

It's not guaranteed to terminate if the list is in shared memory.
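
For reference, the classic form of the algorithm, and why it does not
help here: Floyd's invariant only holds if the pointers are stable
during the walk, and with the list in shared memory another process can
keep re-linking nodes under the kernel, so neither termination nor a
correct answer is guaranteed. A self-contained sketch:

	#include <stdbool.h>

	struct node { struct node *next; };

	/* Floyd's tortoise-and-hare: O(n) time, O(1) space cycle
	 * detection -- assuming the list is immutable while walking. */
	static bool has_cycle(struct node *head)
	{
		struct node *slow = head, *fast = head;

		while (fast && fast->next) {
			slow = slow->next;		/* one step */
			fast = fast->next->next;	/* two steps */
			if (slow == fast)
				return true;		/* met inside a cycle */
		}
		return false;				/* reached the end */
	}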

Thanks,
Florian


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: handle_exit_race && PF_EXITING
  2019-11-06 12:11                       ` Oleg Nesterov
  2019-11-06 13:38                         ` Thomas Gleixner
@ 2019-11-06 17:42                         ` Thomas Gleixner
  2019-11-07 15:51                           ` Oleg Nesterov
  1 sibling, 1 reply; 29+ messages in thread
From: Thomas Gleixner @ 2019-11-06 17:42 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Florian Weimer, Shawn Landden, libc-alpha, linux-api, LKML,
	Arnd Bergmann, Deepa Dinamani, Andrew Morton, Catalin Marinas,
	Keith Packard, Peter Zijlstra

On Wed, 6 Nov 2019, Oleg Nesterov wrote:
> 
> I think that (with or without this fix) handle_exit_race() logic needs
> cleanups, there is no reason for get_futex_value_locked(), we can drop
> ->pi_lock right after we see PF_EXITPIDONE. Lets discuss this later.

Which still runs in atomic context because the hash bucket lock is held, ergo
get_futex_value_locked() needs to stay for now.

So the only thing we could do is to reduce the pi_lock held section a bit.

Thanks,

	tglx


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: handle_exit_race && PF_EXITING
  2019-11-06 17:42                         ` Thomas Gleixner
@ 2019-11-07 15:51                           ` Oleg Nesterov
  0 siblings, 0 replies; 29+ messages in thread
From: Oleg Nesterov @ 2019-11-07 15:51 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Florian Weimer, Shawn Landden, libc-alpha, linux-api, LKML,
	Arnd Bergmann, Deepa Dinamani, Andrew Morton, Catalin Marinas,
	Keith Packard, Peter Zijlstra

On 11/06, Thomas Gleixner wrote:
>
> On Wed, 6 Nov 2019, Oleg Nesterov wrote:
> >
> > I think that (with or without this fix) handle_exit_race() logic needs
> > cleanups, there is no reason for get_futex_value_locked(), we can drop
> > ->pi_lock right after we see PF_EXITPIDONE. Let's discuss this later.
>
> Which still is in atomic because the hash bucket lock is held, ergo
> get_futex_value_locked() needs to stay for now.

Indeed, you are right.

> Same explanation as before just not prosa this time:
>
> exit()					lock_pi(futex2)
>   exit_pi_state_list()
>    lock(tsk->pi_lock)
>    tsk->flags |= PF_EXITPIDONE;		 attach_to_pi_owner()
> 					  ...
>   // Loop unrolled for clarity
>   while(!list_empty())			  lock(tsk->pi_lock);
>      cleanup(futex1)
>        unlock(tsk->pi_lock)
         ^^^^^^^^^^^^^^^^^^^^
Ah! Thanks.


Hmm. In particular, exit_pi_state() drops pi_lock if refcount_inc_not_zero() fails.

Isn't this another potential source of livelock ?

Suppose that a realtime lock owner X sleeps somewhere, another task T
calls put_pi_state(), refcount_dec_and_test() succeeds.

What if, say, X is killed right after that and preempts T on the same
CPU?

Oleg.


^ permalink raw reply	[flat|nested] 29+ messages in thread

end of thread, other threads:[~2019-11-07 15:51 UTC | newest]

Thread overview: 29+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-11-04  0:29 [RFC v2 PATCH] futex: extend set_robust_list to allow 2 locking ABIs at the same time Shawn Landden
2019-11-04  0:51 ` Shawn Landden
2019-11-04 15:37 ` Thomas Gleixner
2019-11-05  0:10   ` Thomas Gleixner
2019-11-05  9:48 ` Florian Weimer
2019-11-05  9:59   ` Thomas Gleixner
2019-11-05 10:06     ` Florian Weimer
2019-11-05 11:56       ` Thomas Gleixner
2019-11-05 14:10         ` Carlos O'Donell
2019-11-05 14:27           ` Florian Weimer
2019-11-05 14:53             ` Thomas Gleixner
2019-11-05 14:27           ` Thomas Gleixner
2019-11-05 14:33             ` Florian Weimer
2019-11-05 14:48               ` Thomas Gleixner
2019-11-06 14:00             ` Zack Weinberg
2019-11-06 14:04               ` Florian Weimer
2019-11-05 15:27     ` handle_exit_race && PF_EXITING Oleg Nesterov
2019-11-05 17:28       ` Thomas Gleixner
2019-11-05 17:59         ` Thomas Gleixner
2019-11-05 18:56           ` Thomas Gleixner
2019-11-05 19:19             ` Thomas Gleixner
2019-11-06  8:55               ` Oleg Nesterov
2019-11-06  9:53                 ` Thomas Gleixner
2019-11-06 10:35                   ` Oleg Nesterov
2019-11-06 11:07                     ` Thomas Gleixner
2019-11-06 12:11                       ` Oleg Nesterov
2019-11-06 13:38                         ` Thomas Gleixner
2019-11-06 17:42                         ` Thomas Gleixner
2019-11-07 15:51                           ` Oleg Nesterov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).