linux-kernel.vger.kernel.org archive mirror
* [RFC v2 PATCH] futex: extend set_robust_list to allow 2 locking ABIs at the same time.
@ 2019-11-04  0:29 Shawn Landden
  2019-11-04  0:51 ` Shawn Landden
                   ` (2 more replies)
  0 siblings, 3 replies; 29+ messages in thread
From: Shawn Landden @ 2019-11-04  0:29 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: libc-alpha, linux-api, linux-kernel, Arnd Bergmann,
	Deepa Dinamani, Oleg Nesterov, Andrew Morton, Catalin Marinas,
	Shawn Landden, Keith Packard

The robust futexes ABI was designed to be flexible to changing ABIs in
glibc; however, it did not take into consideration that this ABI is
particularly sticky and suffers from lock-step problems, because the
ABI is shared between processes. This introduces a new accepted size
in set_robust_list, carrying an additional futex_offset2 value, so
that two locking ABIs can be used at the same time.

If this new ABI is used, then bit 1 of the *next pointer of the
user-space robust_list indicates that the futex_offset2 value should
be used in place of the existing futex_offset.
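
For illustration, a user-space lock implementation targeting the second
ABI could tag its list links like this (a hypothetical sketch; the macro
and helper names are illustrative, not part of the proposed ABI, and
entries are assumed to be at least 4-byte aligned so both low bits of
the pointer are free):

	#include <linux/futex.h>	/* struct robust_list{,_head} */

	#define ROBUST_ENTRY_PI		0x1UL	/* existing PI flag */
	#define ROBUST_ENTRY_ABI2	0x2UL	/* proposed: use futex_offset2 */

	static void enqueue_abi2(struct robust_list_head *head,
				 struct robust_list *entry)
	{
		entry->next = head->list.next;
		/* Tag the link so the kernel applies futex_offset2 when
		 * it walks the list at thread exit. */
		head->list.next = (struct robust_list *)
			((unsigned long)entry | ROBUST_ENTRY_ABI2);
	}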

The use case for this is sharing locks between 32-bit and 64-bit
processes, which Linux supports, but glibc does not, and is difficult
to implement with the current Linux support because of mismatched
ABIs. Keith Packard has complained about this:
https://keithp.com/blogs/Shared_Memory_Fences/

This can also be used to add a new ABI that uses smaller structs,
as the existing ABI on x86_64 is a minimum of 32 bytes, and 20 bytes
would suffice.
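
For reference, registration under the proposed ABI would look roughly as
follows (a minimal sketch assuming this patch is applied; set_robust_list
has no glibc wrapper, so syscall(2) is used directly, and the struct is
re-declared here only because a patched <linux/futex.h> would normally
provide it):

	#include <linux/futex.h>
	#include <stddef.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	struct extended_robust_list_head {
		struct robust_list_head	list_head;
		long			futex_offset2;
	};

	static struct extended_robust_list_head erl;

	static int register_two_offsets(long off1, long off2)
	{
		erl.list_head.list.next = &erl.list_head.list; /* empty list */
		erl.list_head.futex_offset = off1;	/* e.g. native layout */
		erl.list_head.list_op_pending = NULL;
		erl.futex_offset2 = off2;		/* e.g. 32-bit layout */

		/* The larger size is what selects the extended ABI. */
		return syscall(SYS_set_robust_list, &erl, sizeof(erl));
	}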

v2: fix size of compat_extended_robust_list_head
    fix some issues with number literals being implicitly ints
---
 include/linux/compat.h     |   5 +
 include/linux/sched.h      |   6 ++
 include/uapi/linux/futex.h |  31 +++++++
 kernel/futex.c             | 182 ++++++++++++++++++++++++-------------
 4 files changed, 160 insertions(+), 64 deletions(-)

diff --git a/include/linux/compat.h b/include/linux/compat.h
index 16dafd9f4b86..00a0741bf658 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -379,10 +379,15 @@ struct compat_robust_list_head {
 	struct compat_robust_list	list;
 	compat_long_t			futex_offset;
 	compat_uptr_t			list_op_pending;
 };
 
+struct compat_extended_robust_list_head {
+	struct compat_robust_list_head list_head;
+	compat_long_t			futex_offset2;
+};
+
 #ifdef CONFIG_COMPAT_OLD_SIGACTION
 struct compat_old_sigaction {
 	compat_uptr_t			sa_handler;
 	compat_old_sigset_t		sa_mask;
 	compat_ulong_t			sa_flags;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9f51932bd543..894258fd44ac 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1057,10 +1057,16 @@ struct task_struct {
 #ifdef CONFIG_X86_CPU_RESCTRL
 	u32				closid;
 	u32				rmid;
 #endif
 #ifdef CONFIG_FUTEX
+    /*
+     * bottom two bits are masked
+     * 0: struct extended_robust_list_head
+     * 1: struct robust_list_head
+     * same for compat_robust_list
+     */
 	struct robust_list_head __user	*robust_list;
 #ifdef CONFIG_COMPAT
 	struct compat_robust_list_head __user *compat_robust_list;
 #endif
 	struct list_head		pi_state_list;
diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h
index a89eb0accd5e..30c08e07f26b 100644
--- a/include/uapi/linux/futex.h
+++ b/include/uapi/linux/futex.h
@@ -92,10 +92,41 @@ struct robust_list_head {
 	 * so only truly owned locks will be handled.
 	 */
 	struct robust_list __user *list_op_pending;
 };
 
+/*
+ * Extensible per-thread list head:
+ *
+ * As locks are shared between processes, the futex_offset field
+ * has ABI lock-stepping issues, which the original robust_list_head
+ * structure did not anticipate. (And which prevents 32-bit/64-bit
+ * interoperability, as well as shrinking of mutex structures).
+ * This new extended_robust_list_head allows multiple
+ * concurrent futex_offset values, chosen using the bottom 2 bits of the
+ * robust_list *next pointer, which are now masked in BOTH the old and
+ * new ABI.
+ *
+ * Note: this structure is part of the syscall ABI like
+ * robust_list_head above, and must have a different size than
+ * robust_list_head.
+ *
+ */
+struct extended_robust_list_head {
+	struct robust_list_head list_head;
+
+	/*
+	 * These relative offsets are set by user-space. They give the kernel
+	 * the relative position of the futex field to examine, based on the
+	 * bit 1 *next pointer.
+	 * The original version was insufficiently flexible. Locks are held
+	 * in shared memory between processes, and a process might want to hold
+	 * locks of two ABIs at the same time.
+	 */
+	long futex_offset2;
+};
+
 /*
  * Are there any waiters for this robust futex:
  */
 #define FUTEX_WAITERS		0x80000000
 
diff --git a/kernel/futex.c b/kernel/futex.c
index 6d50728ef2e7..3a17d2d63178 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -3396,17 +3396,20 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
 		size_t, len)
 {
 	if (!futex_cmpxchg_enabled)
 		return -ENOSYS;
-	/*
-	 * The kernel knows only one size for now:
-	 */
-	if (unlikely(len != sizeof(*head)))
+
+	if (unlikely(len != sizeof(struct robust_list_head) &&
+		     len != sizeof(struct extended_robust_list_head)))
 		return -EINVAL;
 
-	current->robust_list = head;
+	current->robust_list = head & 0b11UL;
+	BUILD_BUG_ON(sizeof(struct robust_list_head) ==
+		     sizeof(struct extended_robust_list_head));
+	if (len == sizeof(struct robust_list_head))
+		current->robust_list |= 1;
 
 	return 0;
 }
 
 /**
@@ -3419,10 +3422,11 @@ SYSCALL_DEFINE3(get_robust_list, int, pid,
 		struct robust_list_head __user * __user *, head_ptr,
 		size_t __user *, len_ptr)
 {
 	struct robust_list_head __user *head;
 	unsigned long ret;
+	size_t len;
 	struct task_struct *p;
 
 	if (!futex_cmpxchg_enabled)
 		return -ENOSYS;
 
@@ -3439,14 +3443,18 @@ SYSCALL_DEFINE3(get_robust_list, int, pid,
 
 	ret = -EPERM;
 	if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
 		goto err_unlock;
 
-	head = p->robust_list;
+	head = p->robust_list & ~0b11UL;
+	if ((p->robust_list & 0b11) == 0b1)
+		len = sizeof(struct robust_list_head);
+	else
+		len = sizeof(struct extended_robust_list_head);
 	rcu_read_unlock();
 
-	if (put_user(sizeof(*head), len_ptr))
+	if (put_user(len, len_ptr))
 		return -EFAULT;
 	return put_user(head, head_ptr);
 
 err_unlock:
 	rcu_read_unlock();
@@ -3524,23 +3532,26 @@ static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int p
 
 	return 0;
 }
 
 /*
- * Fetch a robust-list pointer. Bit 0 signals PI futexes:
+ * Fetch a robust-list pointer. Bit 0 signals PI futexes. Bit 1 chooses which
+ * futex_offset to use:
  */
 static inline int fetch_robust_entry(struct robust_list __user **entry,
 				     struct robust_list __user * __user *head,
-				     unsigned int *pi)
+				     unsigned int *pi,
+				     unsigned int *second_abi)
 {
 	unsigned long uentry;
 
 	if (get_user(uentry, (unsigned long __user *)head))
 		return -EFAULT;
 
-	*entry = (void __user *)(uentry & ~1UL);
+	*entry = (void __user *)(uentry & ~0b11UL);
 	*pi = uentry & 1;
+	*second_abi = uentry & 0b10;
 
 	return 0;
 }
 
 /*
@@ -3549,69 +3560,84 @@ static inline int fetch_robust_entry(struct robust_list __user **entry,
  *
  * We silently return on any sign of list-walking problem.
  */
 void exit_robust_list(struct task_struct *curr)
 {
-	struct robust_list_head __user *head = curr->robust_list;
-	struct robust_list __user *entry, *next_entry, *pending;
-	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
-	unsigned int uninitialized_var(next_pi);
-	unsigned long futex_offset;
+	struct robust_list_head __user *head_ptr = curr->robust_list & ~1UL;
+	unsigned int is_extended_list = (curr->robust_list & 1) == 0;
+	struct extended_robust_list_head head;
+	struct robust_list __user *entry = &head->list_head.list.next,
+					   *next_entry, *pending;
+	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip, second_abi,
+			     second_abip;
+	unsigned int uninitialized_var(next_pi),
+		     uninitialized_var(next_second_abi);
 	int rc;
 
 	if (!futex_cmpxchg_enabled)
 		return;
 
 	/*
-	 * Fetch the list head (which was registered earlier, via
-	 * sys_set_robust_list()):
+	 * fetch_robust_entry code is duplicated here to avoid excessive calls
+	 * to get_user()
 	 */
-	if (fetch_robust_entry(&entry, &head->list.next, &pi))
-		return;
-	/*
-	 * Fetch the relative futex offset:
-	 */
-	if (get_user(futex_offset, &head->futex_offset))
-		return;
-	/*
-	 * Fetch any possibly pending lock-add first, and handle it
-	 * if it exists:
-	 */
-	if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
-		return;
+	if (is_extended_list) {
+		if (get_user(head, (struct extended_robust_list_head *)
+			     head_ptr))
+			return;
+	} else {
+		if (get_user(head.list_head, head_ptr))
+			return;
+	}
+
+	pi = head.list_head.list.next & 1;
+	second_abi = head.list_head.list.next & 0b10;
+	head.list_head.list.next &= ~0b11UL;
+	pip = head.list_head.list_op_pending & 1;
+	second_abip = head.list_head.list_op_pending & 0b10;
+	head.list_head.list_op_pending &= ~0b11UL;
 
 	next_entry = NULL;	/* avoid warning with gcc */
-	while (entry != &head->list) {
+	while (entry != &head->list_head.list) {
 		/*
 		 * Fetch the next entry in the list before calling
 		 * handle_futex_death:
 		 */
-		rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
+		rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi,
+					&next_second_abi);
 		/*
 		 * A pending lock might already be on the list, so
 		 * don't process it twice:
 		 */
-		if (entry != pending)
+		if (entry != pending) {
+			long futex_offset = second_abi ?
+					    head.futex_offset2 :
+					    head.list_head.futex_offset;
 			if (handle_futex_death((void __user *)entry + futex_offset,
 						curr, pi))
 				return;
+		}
 		if (rc)
 			return;
 		entry = next_entry;
 		pi = next_pi;
+		second_abi = next_second_abi;
 		/*
 		 * Avoid excessively long or circular lists:
 		 */
 		if (!--limit)
 			break;
 
 		cond_resched();
 	}
 
-	if (pending)
+	if (pending) {
+		long futex_offset = second_abip ? head.futex_offset2 :
+						  head.list_head.futex_offset;
 		handle_futex_death((void __user *)pending + futex_offset,
 				   curr, pip);
+	}
 }
 
 long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 		u32 __user *uaddr2, u32 val2, u32 val3)
 {
@@ -3707,21 +3733,25 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
 	return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
 }
 
 #ifdef CONFIG_COMPAT
 /*
- * Fetch a robust-list pointer. Bit 0 signals PI futexes:
+ * Fetch a robust-list pointer. Bit 0 signals PI futexes.
+ * Bit 1 chooses which futex_offset to use:
  */
 static inline int
-compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
-		   compat_uptr_t __user *head, unsigned int *pi)
+compat_fetch_robust_entry(compat_uptr_t *uentry,
+			  struct robust_list __user **entry,
+			  compat_uptr_t __user *head, unsigned int *pi,
+			  unsigned int *second_abi)
 {
 	if (get_user(*uentry, head))
 		return -EFAULT;
 
-	*entry = compat_ptr((*uentry) & ~1);
+	*entry = compat_ptr((*uentry) & ~0b11);
 	*pi = (unsigned int)(*uentry) & 1;
+	*second_abi = (unsigned int)(*uentry) & 0b10;
 
 	return 0;
 }
 
 static void __user *futex_uaddr(struct robust_list __user *entry,
@@ -3739,72 +3769,86 @@ static void __user *futex_uaddr(struct robust_list __user *entry,
  *
  * We silently return on any sign of list-walking problem.
  */
 void compat_exit_robust_list(struct task_struct *curr)
 {
-	struct compat_robust_list_head __user *head = curr->compat_robust_list;
-	struct robust_list __user *entry, *next_entry, *pending;
-	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
-	unsigned int uninitialized_var(next_pi);
+	struct compat_robust_list_head __user *head_ptr = curr->compat_robust_list &
+						      ~1UL;
+	unsigned int is_extended_list = (curr->compat_robust_list & 1) == 0;
+	struct compat_extended_robust_list_head head;
+	struct robust_list __user *entry = &head->list_head.list.next,
+					   *next_entry, *pending;
+	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip, second_abi,
+			     second_abip;
+	unsigned int uninitialized_var(next_pi),
+		     uninitialized_var(next_second_abi);
 	compat_uptr_t uentry, next_uentry, upending;
-	compat_long_t futex_offset;
 	int rc;
 
 	if (!futex_cmpxchg_enabled)
 		return;
 
 	/*
-	 * Fetch the list head (which was registered earlier, via
-	 * sys_set_robust_list()):
-	 */
-	if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
-		return;
-	/*
-	 * Fetch the relative futex offset:
-	 */
-	if (get_user(futex_offset, &head->futex_offset))
-		return;
-	/*
-	 * Fetch any possibly pending lock-add first, and handle it
-	 * if it exists:
+	 * compat_fetch_robust_entry code is duplicated here to avoid excessive
+	 * calls to get_user()
 	 */
-	if (compat_fetch_robust_entry(&upending, &pending,
-			       &head->list_op_pending, &pip))
-		return;
+	if (is_extended_list) {
+		if (get_user(head, (struct compat_extended_robust_list_head *)
+			     head_ptr))
+			return;
+	} else {
+		if (get_user(head.list_head, head_ptr))
+			return;
+	}
+
+	pi = head.list_head.list.next & 1;
+	second_abi = head.list_head.list.next & 0b10;
+	head.list_head.list.next &= ~0b11UL;
+	pip = head.list_head.list_op_pending & 1;
+	second_abip = head.list_head.list_op_pending & 0b10;
+	head.list_head.list_op_pending &= ~0b11UL;
 
 	next_entry = NULL;	/* avoid warning with gcc */
 	while (entry != (struct robust_list __user *) &head->list) {
 		/*
 		 * Fetch the next entry in the list before calling
 		 * handle_futex_death:
 		 */
 		rc = compat_fetch_robust_entry(&next_uentry, &next_entry,
-			(compat_uptr_t __user *)&entry->next, &next_pi);
+			(compat_uptr_t __user *)&entry->next, &next_pi,
+			&next_second_abi);
 		/*
 		 * A pending lock might already be on the list, so
 		 * dont process it twice:
 		 */
 		if (entry != pending) {
+			compat_long_t futex_offset = second_abi ?
+				head.futex_offset2 :
+				head.list_head.futex_offset;
 			void __user *uaddr = futex_uaddr(entry, futex_offset);
 
 			if (handle_futex_death(uaddr, curr, pi))
 				return;
 		}
 		if (rc)
 			return;
 		uentry = next_uentry;
 		entry = next_entry;
 		pi = next_pi;
+		second_abi = next_second_abi;
 		/*
 		 * Avoid excessively long or circular lists:
 		 */
 		if (!--limit)
 			break;
 
 		cond_resched();
 	}
 	if (pending) {
+		compat_long_t futex_offset = second_abip ?
+					     head.futex_offset2 :
+					     head.list_head.futex_offset;
 		void __user *uaddr = futex_uaddr(pending, futex_offset);
 
 		handle_futex_death(uaddr, curr, pip);
 	}
 }
@@ -3814,23 +3858,29 @@ COMPAT_SYSCALL_DEFINE2(set_robust_list,
 		compat_size_t, len)
 {
 	if (!futex_cmpxchg_enabled)
 		return -ENOSYS;
 
-	if (unlikely(len != sizeof(*head)))
+	if (unlikely(len != sizeof(struct compat_robust_list_head) &&
+		     len != sizeof(struct compat_extended_robust_list_head)))
 		return -EINVAL;
 
-	current->compat_robust_list = head;
+	current->compat_robust_list = head & ~0b11;
+	BUILD_BUG_ON(sizeof(struct compat_robust_list_head) ==
+		     sizeof(struct compat_extended_robust_list_head));
+	if (len == sizeof(struct compat_robust_list_head))
+		current->compat_robust_list |= 0b1;
 
 	return 0;
 }
 
 COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
 			compat_uptr_t __user *, head_ptr,
 			compat_size_t __user *, len_ptr)
 {
 	struct compat_robust_list_head __user *head;
+	size_t len;
 	unsigned long ret;
 	struct task_struct *p;
 
 	if (!futex_cmpxchg_enabled)
 		return -ENOSYS;
@@ -3848,14 +3898,18 @@ COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
 
 	ret = -EPERM;
 	if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
 		goto err_unlock;
 
-	head = p->compat_robust_list;
+	head = p->compat_robust_list & ~0b11;
+	if ((p->compat_robust_list & 0b11) == 0b1)
+		len = sizeof(struct compat_robust_list_head);
+	else
+		len = sizeof(struct compat_extended_robust_list_head);
 	rcu_read_unlock();
 
-	if (put_user(sizeof(*head), len_ptr))
+	if (put_user(len, len_ptr))
 		return -EFAULT;
 	return put_user(ptr_to_compat(head), head_ptr);
 
 err_unlock:
 	rcu_read_unlock();
-- 
2.20.1



* Re: [RFC v2 PATCH] futex: extend set_robust_list to allow 2 locking ABIs at the same time.
  2019-11-04  0:29 [RFC v2 PATCH] futex: extend set_robust_list to allow 2 locking ABIs at the same time Shawn Landden
@ 2019-11-04  0:51 ` Shawn Landden
  2019-11-04 15:37 ` Thomas Gleixner
  2019-11-05  9:48 ` Florian Weimer
  2 siblings, 0 replies; 29+ messages in thread
From: Shawn Landden @ 2019-11-04  0:51 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: libc-alpha, linux-api, linux-kernel, Arnd Bergmann,
	Deepa Dinamani, Oleg Nesterov, Andrew Morton, Catalin Marinas,
	Keith Packard

I am sorry, I will fix this and resubmit.


-- 
Shawn Landden



* Re: [RFC v2 PATCH] futex: extend set_robust_list to allow 2 locking ABIs at the same time.
  2019-11-04  0:29 [RFC v2 PATCH] futex: extend set_robust_list to allow 2 locking ABIs at the same time Shawn Landden
  2019-11-04  0:51 ` Shawn Landden
@ 2019-11-04 15:37 ` Thomas Gleixner
  2019-11-05  0:10   ` Thomas Gleixner
  2019-11-05  9:48 ` Florian Weimer
  2 siblings, 1 reply; 29+ messages in thread
From: Thomas Gleixner @ 2019-11-04 15:37 UTC (permalink / raw)
  To: Shawn Landden
  Cc: libc-alpha, linux-api, LKML, Arnd Bergmann, Deepa Dinamani,
	Oleg Nesterov, Andrew Morton, Catalin Marinas, Keith Packard,
	Ingo Molnar, Peter Zijlstra, Darren Hart

On Sun, 3 Nov 2019, Shawn Landden wrote:

While you Cc'ed various people for whatever reasons, you failed to Cc 3/4
of the FUTEX maintainers/reviewers from the MAINTAINERS file:

FUTEX SUBSYSTEM
M:      Thomas Gleixner <tglx@linutronix.de>
M:      Ingo Molnar <mingo@redhat.com>
R:      Peter Zijlstra <peterz@infradead.org>
R:      Darren Hart <dvhart@infradead.org>

> The robust futexes ABI was designed to be flexible to changing ABIs in
> glibc; however, it did not take into consideration that this ABI is
> particularly sticky and suffers from lock-step problems, because the
> ABI is shared between processes. This introduces a new accepted size
> in set_robust_list, carrying an additional futex_offset2 value, so
> that two locking ABIs can be used at the same time.

This text does not really explain the problem you are trying to solve. I
can crystalball something out of it, but that's not what a changelog should
force people to do.

> If this new ABI is used, then bit 1 of the *next pointer of the
> user-space robust_list indicates that the futex_offset2 value should
> be used in place of the existing futex_offset.
> 
> The use case for this is sharing locks between 32-bit and 64-bit
> processes, which Linux supports, but glibc does not, and is difficult
> to implement with the current Linux support because of mismatched
> ABIs.

The problem with mixed processes is that the *next pointer in the robust
list points to some random data structure, but not to the actual futex
value itself. The pointer to the underlying futex value is calculated from
there with 'futex_offset'.
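
For reference, per entry that calculation is essentially what the
exit_robust_list() code above does:

	uaddr = (void __user *)entry + futex_offset;
	handle_futex_death(uaddr, curr, pi);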

I assume that pthread_mutex and friends have different layouts on 32 and 64
bit and therefore futex_offset is different, right?

But even if you manage to keep track of the offsets per futex, I don't see
how sharing the actual pthread_mutex between a 32bit and a 64bit process
would work at all if the underlying data structures are not fundamentally
the same. They contain more than the robust list.

> Keith Packard has complained about this:
> https://keithp.com/blogs/Shared_Memory_Fences/

That complaint is important because the info there is missing in the
changelog, right? Such links are mostly useless as they are outdated sooner
rather than later.

Aside of that the blog entry is absolutely not useful to fill the blanks in
your changelog.

> This can also be used to add a new ABI that uses smaller structs, as the
> existing ABI on x86_64 is a minimum of 32 bytes, and 20 bytes would
> suffice.

I have no idea how you make that a minimum of 32 bytes. It's a fixed 24
bytes on x86_64.

struct robust_list_head {
	struct robust_list         list;                 /*     0     8 */
	long int                   futex_offset;         /*     8     8 */
	struct robust_list *       list_op_pending;      /*    16     8 */

	/* size: 24, cachelines: 1, members: 3 */
};

>

This lacks a Signed-off-by: ....

> 
> v2: fix size of compat_extended_robust_list_head
>     fix some issues with number literals being implicitly ints

Please move the vN change history after a --- separator so tools can
discard it. It's not part of the changelog. Additionally a link to the
previous submission would be helpful for keeping track.

i.e.

---
v2: Fix ....

v1: https://lore.kernel.org/r/$MESSAGE_ID
---
diffstat
....

> +struct compat_extended_robust_list_head {
> +	struct compat_robust_list_head list_head;
> +	compat_long_t			futex_offset2;

Please align the names of the members properly. list head should be
separated from the type by a tab not by a space.

>  #ifdef CONFIG_FUTEX
> +    /*
> +     * bottom two bits are masked
> +     * 0: struct extended_robust_list_head
> +     * 1: struct robust_list_head
> +     * same for compat_robust_list

What? That's not what the code does. It merely sets bit 0.

> +/*
> + * Extensible per-thread list head:
> + *
> + * As locks are shared between processes, the futex_offset field
> + * has ABI lock-stepping issues, which the original robust_list_head
> + * structure did not anticipate. (And which prevents 32-bit/64-bit
> + * interoperability, as well as shrinking of mutex structures).
> + * This new extended_robust_list_head allows multiple
> + * concurrent futex_offset values, chosen using the bottom 2 bits of the
> + * robust_list *next pointer, which are now masked in BOTH the old and
> + * new ABI.
> + *
> + * Note: this structure is part of the syscall ABI like
> + * robust_list_head above, and must have a different size than
> + * robust_list_head.

Versioning an ABI by different sizes is a horrible idea.

> + *
> + */
> +struct extended_robust_list_head {
> +	struct robust_list_head list_head;
> +
> +	/*
> +	 * These relative offsets are set by user-space. They give the kernel
> +	 * the relative position of the futex field to examine, based on the
> +	 * bit 1 *next pointer.

What on earth is a 'bit 1 *next pointer'?

> +	 * The original version was insufficiently flexible. Locks are held
> +	 * in shared memory between processes, and a process might want to hold
> +	 * locks of two ABIs at the same time.
> +	 */
> +	long futex_offset2;

Where has *list_op_pending gone and how is that supposed to work? Are you
now enqueueing the next lock into list_head list right away?

Please provide proper design information about this.

> +};
> +
>  /*
>   * Are there any waiters for this robust futex:
>   */
>  #define FUTEX_WAITERS		0x80000000
>  
> diff --git a/kernel/futex.c b/kernel/futex.c
> index 6d50728ef2e7..3a17d2d63178 100644
> --- a/kernel/futex.c
> +++ b/kernel/futex.c
> @@ -3396,17 +3396,20 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
>  SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
>  		size_t, len)
>  {
>  	if (!futex_cmpxchg_enabled)
>  		return -ENOSYS;
> -	/*
> -	 * The kernel knows only one size for now:
> -	 */
> -	if (unlikely(len != sizeof(*head)))
> +
> +	if (unlikely(len != sizeof(struct robust_list_head) &&
> +		     len != sizeof(struct extended_robust_list_head)))
>  		return -EINVAL;
>  
> -	current->robust_list = head;
> +	current->robust_list = head & 0b11UL;

What? This results in:

	0 <= current->robust_list <=3

How is that supposed to work and why would the 'head' argument ever have
any of those bits set in the first place?

>  void exit_robust_list(struct task_struct *curr)
>  {
> -	struct robust_list_head __user *head = curr->robust_list;
> -	struct robust_list __user *entry, *next_entry, *pending;
> -	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
> -	unsigned int uninitialized_var(next_pi);
> -	unsigned long futex_offset;
> +	struct robust_list_head __user *head_ptr = curr->robust_list & ~1UL;
> +	unsigned int is_extended_list = (curr->robust_list & 1) == 0;
> +	struct extended_robust_list_head head;
> +	struct robust_list __user *entry = &head->list_head.list.next,
> +					   *next_entry, *pending;
> +	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip, second_abi,
> +			     second_abip;
> +	unsigned int uninitialized_var(next_pi),
> +		     uninitialized_var(next_second_abi);
>  	int rc;
>  
>  	if (!futex_cmpxchg_enabled)
>  		return;
>  
>  	/*
> -	 * Fetch the list head (which was registered earlier, via
> -	 * sys_set_robust_list()):
> +	 * fetch_robust_entry code is duplicated here to avoid excessive calls
> +	 * to get_user()
>  	 */
> -	if (fetch_robust_entry(&entry, &head->list.next, &pi))
> -		return;
> -	/*
> -	 * Fetch the relative futex offset:
> -	 */
> -	if (get_user(futex_offset, &head->futex_offset))
> -		return;
> -	/*
> -	 * Fetch any possibly pending lock-add first, and handle it
> -	 * if it exists:
> -	 */
> -	if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
> -		return;
> +	if (is_extended_list) {
> +		if (get_user(head, (struct extended_robust_list_head *)
> +			     head_ptr))
> +			return;
> +	} else {
> +		if (get_user(head.list_head, head_ptr))
> +			return;
> +	}
> +
> +	pi = head.list_head.list.next & 1;
> +	second_abi = head.list_head.list.next & 0b10;

That's truly consistent. For current->robust_list you use bit 0 to
determine whether it is the extended one, but in user space you use bit 1
because bit 0 is already occupied for PI.

> +	head.list_head.list.next &= ~0b11UL;
> +	pip = head.list_head.list_op_pending & 1;
> +	second_abip = head.list_head.list_op_pending & 0b10;
> +	head.list_head.list_op_pending &= ~0b11UL;

So you are seriously trying to mix both ABIs in one robust list? Why on
earth?

One thread can hardly use two different libraries accessing the robust list
with two different versions of the ABI. That's just a recipe for
disaster. Futexes are complicated enough and the robust list is a fragile
beast to begin with.

To be honest. The result of this approach is just unreadable garbage and
this combo patch does too many things in one go to be reviewable at all.

Please provide a proper design and explain how this should work together
with existing robust list using code first.

Thanks,

	tglx


* Re: [RFC v2 PATCH] futex: extend set_robust_list to allow 2 locking ABIs at the same time.
  2019-11-04 15:37 ` Thomas Gleixner
@ 2019-11-05  0:10   ` Thomas Gleixner
  0 siblings, 0 replies; 29+ messages in thread
From: Thomas Gleixner @ 2019-11-05  0:10 UTC (permalink / raw)
  To: Shawn Landden
  Cc: libc-alpha, linux-api, LKML, Arnd Bergmann, Deepa Dinamani,
	Oleg Nesterov, Andrew Morton, Catalin Marinas, Keith Packard,
	Ingo Molnar, Peter Zijlstra, Darren Hart

On Mon, 4 Nov 2019, Thomas Gleixner wrote:
> Please provide a proper design and explain how this should work together
> with existing robust list using code first.

Just for clarification:

Any change to that interface needs to be discussed with the *libc people
first. Your way of overriding the robust list interface with some ad hoc
solution will break existing users which is not acceptable.

So either the problem is solved with the *libc people in a way which allows
them to build upon or if your intention is solely to solve the problem
Keith described then none of this is required at all as user space can
handle the different layouts for that magic fence implementation perfectly
fine.

Thanks,

	tglx



* Re: [RFC v2 PATCH] futex: extend set_robust_list to allow 2 locking ABIs at the same time.
  2019-11-04  0:29 [RFC v2 PATCH] futex: extend set_robust_list to allow 2 locking ABIs at the same time Shawn Landden
  2019-11-04  0:51 ` Shawn Landden
  2019-11-04 15:37 ` Thomas Gleixner
@ 2019-11-05  9:48 ` Florian Weimer
  2019-11-05  9:59   ` Thomas Gleixner
  2 siblings, 1 reply; 29+ messages in thread
From: Florian Weimer @ 2019-11-05  9:48 UTC (permalink / raw)
  To: Shawn Landden
  Cc: Thomas Gleixner, libc-alpha, linux-api, linux-kernel,
	Arnd Bergmann, Deepa Dinamani, Oleg Nesterov, Andrew Morton,
	Catalin Marinas, Keith Packard

* Shawn Landden:

> If this new ABI is used, then bit 1 of the *next pointer of the
> user-space robust_list indicates that the futex_offset2 value should
> be used in place of the existing futex_offset.

The futex interface currently has some races which can only be fixed by
API changes.  I'm concerned that we sacrifice the last bit for some
rather obscure feature.  What if we need that bit for fixing the
correctness issues?

Thanks,
Florian



* Re: [RFC v2 PATCH] futex: extend set_robust_list to allow 2 locking ABIs at the same time.
  2019-11-05  9:48 ` Florian Weimer
@ 2019-11-05  9:59   ` Thomas Gleixner
  2019-11-05 10:06     ` Florian Weimer
  2019-11-05 15:27     ` handle_exit_race && PF_EXITING Oleg Nesterov
  0 siblings, 2 replies; 29+ messages in thread
From: Thomas Gleixner @ 2019-11-05  9:59 UTC (permalink / raw)
  To: Florian Weimer
  Cc: Shawn Landden, libc-alpha, linux-api, LKML, Arnd Bergmann,
	Deepa Dinamani, Oleg Nesterov, Andrew Morton, Catalin Marinas,
	Keith Packard, Peter Zijlstra

On Tue, 5 Nov 2019, Florian Weimer wrote:
> * Shawn Landden:
> > If this new ABI is used, then bit 1 of the *next pointer of the
> > user-space robust_list indicates that the futex_offset2 value should
> > be used in place of the existing futex_offset.
> 
> The futex interface currently has some races which can only be fixed by
> API changes.  I'm concerned that we sacrifice the last bit for some
> rather obscure feature.  What if we need that bit for fixing the
> correctness issues?

That current approach is going nowhere and if we change the ABI ever then
this needs to happen with all *libc folks involved and agreeing.

Out of curiosity, what's the race issue vs. robust list which you are
trying to solve?

Thanks,

	tglx


* Re: [RFC v2 PATCH] futex: extend set_robust_list to allow 2 locking ABIs at the same time.
  2019-11-05  9:59   ` Thomas Gleixner
@ 2019-11-05 10:06     ` Florian Weimer
  2019-11-05 11:56       ` Thomas Gleixner
  2019-11-05 15:27     ` handle_exit_race && PF_EXITING Oleg Nesterov
  1 sibling, 1 reply; 29+ messages in thread
From: Florian Weimer @ 2019-11-05 10:06 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Shawn Landden, libc-alpha, linux-api, LKML, Arnd Bergmann,
	Deepa Dinamani, Oleg Nesterov, Andrew Morton, Catalin Marinas,
	Keith Packard, Peter Zijlstra

* Thomas Gleixner:

> On Tue, 5 Nov 2019, Florian Weimer wrote:
>> * Shawn Landden:
>> > If this new ABI is used, then bit 1 of the *next pointer of the
>> > user-space robust_list indicates that the futex_offset2 value should
>> > be used in place of the existing futex_offset.
>> 
>> The futex interface currently has some races which can only be fixed by
>> API changes.  I'm concerned that we sacrifice the last bit for some
>> rather obscure feature.  What if we need that bit for fixing the
>> correctness issues?
>
> That current approach is going nowhere and if we change the ABI ever then
> this needs to happen with all *libc folks involved and agreeing.
>
> Out of curiosity, what's the race issue vs. robust list which you are
> trying to solve?

Sadly I'm not trying to solve them.  Here's one of the issues:

  <https://sourceware.org/bugzilla/show_bug.cgi?id=14485>

I think there are others, but I can't find a reference to them.  If
anyone wants to work on this, I can dig out the details and ask some
folks who have looked at this in the past.

Thanks,
Florian



* Re: [RFC v2 PATCH] futex: extend set_robust_list to allow 2 locking ABIs at the same time.
  2019-11-05 10:06     ` Florian Weimer
@ 2019-11-05 11:56       ` Thomas Gleixner
  2019-11-05 14:10         ` Carlos O'Donell
  0 siblings, 1 reply; 29+ messages in thread
From: Thomas Gleixner @ 2019-11-05 11:56 UTC (permalink / raw)
  To: Florian Weimer
  Cc: Shawn Landden, libc-alpha, linux-api, LKML, Arnd Bergmann,
	Deepa Dinamani, Oleg Nesterov, Andrew Morton, Catalin Marinas,
	Keith Packard, Peter Zijlstra

On Tue, 5 Nov 2019, Florian Weimer wrote:
> * Thomas Gleixner:
> > On Tue, 5 Nov 2019, Florian Weimer wrote:
> >> * Shawn Landden:
> >> > If this new ABI is used, then bit 1 of the *next pointer of the
> >> > user-space robust_list indicates that the futex_offset2 value should
> >> > be used in place of the existing futex_offset.
> >> 
> >> The futex interface currently has some races which can only be fixed by
> >> API changes.  I'm concerned that we sacrifice the last bit for some
> >> rather obscure feature.  What if we need that bit for fixing the
> >> correctness issues?
> >
> > That current approach is going nowhere and if we change the ABI ever then
> > this needs to happen with all *libc folks involved and agreeing.
> >
> > Out of curiosity, what's the race issue vs. robust list which you are
> > trying to solve?
> 
> Sadly I'm not trying to solve them.  Here's one of the issues:
> 
>   <https://sourceware.org/bugzilla/show_bug.cgi?id=14485>

That one seems more of a lifetime problem, i.e. the mutex is destroyed,
memory freed and map address reused while another thread was not yet out of
the mutex_unlock() call. Nasty.
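
A rough sketch of that window (a hypothetical interleaving following the
description above, not literal glibc internals):

	#include <pthread.h>
	#include <sys/mman.h>

	static pthread_mutex_t *m;	/* lives in a shared mapping */

	static void *thread_a(void *unused)
	{
		pthread_mutex_unlock(m);
		/* The implementation may still touch *m (e.g. the final
		 * FUTEX_WAKE) after the atomic release inside unlock, so
		 * thread B can already observe the mutex as free here. */
		return NULL;
	}

	static void *thread_b(void *unused)
	{
		pthread_mutex_lock(m);		/* succeeds once A releases */
		pthread_mutex_unlock(m);
		pthread_mutex_destroy(m);
		munmap(m, sizeof(*m));		/* memory freed ... */
		/* ... and if a new mapping reuses that address, A's late
		 * access lands on unrelated data. */
		return NULL;
	}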

Thanks,

	tglx

 


* Re: [RFC v2 PATCH] futex: extend set_robust_list to allow 2 locking ABIs at the same time.
  2019-11-05 11:56       ` Thomas Gleixner
@ 2019-11-05 14:10         ` Carlos O'Donell
  2019-11-05 14:27           ` Florian Weimer
  2019-11-05 14:27           ` Thomas Gleixner
  0 siblings, 2 replies; 29+ messages in thread
From: Carlos O'Donell @ 2019-11-05 14:10 UTC (permalink / raw)
  To: Thomas Gleixner, Florian Weimer
  Cc: Shawn Landden, libc-alpha, linux-api, LKML, Arnd Bergmann,
	Deepa Dinamani, Oleg Nesterov, Andrew Morton, Catalin Marinas,
	Keith Packard, Peter Zijlstra

On 11/5/19 6:56 AM, Thomas Gleixner wrote:
> On Tue, 5 Nov 2019, Florian Weimer wrote:
>> * Thomas Gleixner:
>>> On Tue, 5 Nov 2019, Florian Weimer wrote:
>>>> * Shawn Landden:
>>>>> If this new ABI is used, then bit 1 of the *next pointer of the
>>>>> user-space robust_list indicates that the futex_offset2 value should
>>>>> be used in place of the existing futex_offset.
>>>>
>>>> The futex interface currently has some races which can only be fixed by
>>>> API changes.  I'm concerned that we sacrifice the last bit for some
>>>> rather obscure feature.  What if we need that bit for fixing the
>>>> correctness issues?
>>>
>>> That current approach is going nowhere and if we change the ABI ever then
>>> this needs to happen with all *libc folks involved and agreeing.
>>>
>>> Out of curiosity, what's the race issue vs. robust list which you are
>>> trying to solve?
>>
>> Sadly I'm not trying to solve them.  Here's one of the issues:
>>
>>   <https://sourceware.org/bugzilla/show_bug.cgi?id=14485>
> 
> That one seems more of a lifetime problem, i.e. the mutex is destroyed,
> memory freed and map address reused while another thread was not yet out of
> the mutex_unlock() call. Nasty.

It is difficult to fix.

The other issue is this:

"Robust mutexes do not take ROBUST_LIST_LIMIT into account"
https://sourceware.org/bugzilla/show_bug.cgi?id=19089

-- 
Cheers,
Carlos.


* Re: [RFC v2 PATCH] futex: extend set_robust_list to allow 2 locking ABIs at the same time.
  2019-11-05 14:10         ` Carlos O'Donell
@ 2019-11-05 14:27           ` Florian Weimer
  2019-11-05 14:53             ` Thomas Gleixner
  2019-11-05 14:27           ` Thomas Gleixner
  1 sibling, 1 reply; 29+ messages in thread
From: Florian Weimer @ 2019-11-05 14:27 UTC (permalink / raw)
  To: Carlos O'Donell
  Cc: Thomas Gleixner, Shawn Landden, libc-alpha, linux-api, LKML,
	Arnd Bergmann, Deepa Dinamani, Oleg Nesterov, Andrew Morton,
	Catalin Marinas, Keith Packard, Peter Zijlstra

* Carlos O'Donell:

> On 11/5/19 6:56 AM, Thomas Gleixner wrote:
>> On Tue, 5 Nov 2019, Florian Weimer wrote:
>>> * Thomas Gleixner:
>>>> On Tue, 5 Nov 2019, Florian Weimer wrote:
>>>>> * Shawn Landden:
>>>>>> If this new ABI is used, then bit 1 of the *next pointer of the
>>>>>> user-space robust_list indicates that the futex_offset2 value should
>>>>>> be used in place of the existing futex_offset.
>>>>>
>>>>> The futex interface currently has some races which can only be fixed by
>>>>> API changes.  I'm concerned that we sacrifice the last bit for some
>>>>> rather obscure feature.  What if we need that bit for fixing the
>>>>> correctness issues?
>>>>
>>>> That current approach is going nowhere and if we change the ABI ever then
>>>> this needs to happen with all *libc folks involved and agreeing.
>>>>
>>>> Out of curiosity, what's the race issue vs. robust list which you are
>>>> trying to solve?
>>>
>>> Sadly I'm not trying to solve them.  Here's one of the issues:
>>>
>>>   <https://sourceware.org/bugzilla/show_bug.cgi?id=14485>
>> 
>> That one seems more of a lifetime problem, i.e. the mutex is destroyed,
>> memory freed and map address reused while another thread was not yet out of
>> the mutex_unlock() call. Nasty.
>
> It is difficult to fix.
>
> The other issue is this:
>
> "Robust mutexes do not take ROBUST_LIST_LIMIT into account"
> https://sourceware.org/bugzilla/show_bug.cgi?id=19089

That's just a missing check in our implementation and something that few
applications will encounter, if any.  There is this one here:

  <https://sourceware.org/bugzilla/show_bug.cgi?id=19004>

It contains a kernel patch.

I thought that there were more issues in the current implementation, but
I can't find a record of them. 8-(

Thanks,
Florian



* Re: [RFC v2 PATCH] futex: extend set_robust_list to allow 2 locking ABIs at the same time.
  2019-11-05 14:10         ` Carlos O'Donell
  2019-11-05 14:27           ` Florian Weimer
@ 2019-11-05 14:27           ` Thomas Gleixner
  2019-11-05 14:33             ` Florian Weimer
  2019-11-06 14:00             ` Zack Weinberg
  1 sibling, 2 replies; 29+ messages in thread
From: Thomas Gleixner @ 2019-11-05 14:27 UTC (permalink / raw)
  To: Carlos O'Donell
  Cc: Florian Weimer, Shawn Landden, libc-alpha, linux-api, LKML,
	Arnd Bergmann, Deepa Dinamani, Oleg Nesterov, Andrew Morton,
	Catalin Marinas, Keith Packard, Peter Zijlstra

On Tue, 5 Nov 2019, Carlos O'Donell wrote:
> On 11/5/19 6:56 AM, Thomas Gleixner wrote:
> The other issue is this:
> 
> "Robust mutexes do not take ROBUST_LIST_LIMIT into account"
> https://sourceware.org/bugzilla/show_bug.cgi?id=19089

  "The kernel limits the length of the robust mutex list to 2048 entries.
   This constant does not seem to be exported to user space."

FWIW, the constant is defined in the UAPI futex header.
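
From memory it reads as follows (worth double-checking against the tree):

	#define ROBUST_LIST_LIMIT	2048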

The main concern here is not the actual number of futexes held by a task.

The real issue is that the robust list could be circular by incident or
malice and there is no way for the kernel to figure that out. That would
prevent the task from exiting and make it iterate over the list until
doomsday, i.e. a nice unprivileged DoS.

So I fear the kernel cannot really help with this one.
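
The exit-time walk copes by capping how many entries it visits; a cycle
is never detected, only cut off at ROBUST_LIST_LIMIT. A tiny
self-contained illustration of that bounded-walk idea (plain user-space
C; the names and the demo list are made up, nothing here is actual
kernel code):

	#include <stdio.h>

	#define LIMIT 2048			/* stand-in for ROBUST_LIST_LIMIT */

	struct node { struct node *next; };

	/* Bounded walk: a circular list is not detected, only cut off. */
	static int walk(struct node *head)
	{
		struct node *entry = head->next;
		unsigned int limit = LIMIT;
		int handled = 0;

		while (entry != head) {
			handled++;		/* stands in for handle_futex_death() */
			entry = entry->next;
			if (!--limit)
				break;		/* circular or overlong list */
		}
		return handled;
	}

	int main(void)
	{
		struct node h, a, b;

		h.next = &a;
		a.next = &b;
		b.next = &a;			/* malicious cycle, never returns to head */
		printf("handled %d entries\n", walk(&h));
		return 0;
	}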

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [RFC v2 PATCH] futex: extend set_robust_list to allow 2 locking ABIs at the same time.
  2019-11-05 14:27           ` Thomas Gleixner
@ 2019-11-05 14:33             ` Florian Weimer
  2019-11-05 14:48               ` Thomas Gleixner
  2019-11-06 14:00             ` Zack Weinberg
  1 sibling, 1 reply; 29+ messages in thread
From: Florian Weimer @ 2019-11-05 14:33 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Carlos O'Donell, Shawn Landden, libc-alpha, linux-api, LKML,
	Arnd Bergmann, Deepa Dinamani, Oleg Nesterov, Andrew Morton,
	Catalin Marinas, Keith Packard, Peter Zijlstra

* Thomas Gleixner:

> On Tue, 5 Nov 2019, Carlos O'Donell wrote:
>> On 11/5/19 6:56 AM, Thomas Gleixner wrote:
>> The other issue is this:
>> 
>> "Robust mutexes do not take ROBUST_LIST_LIMIT into account"
>> https://sourceware.org/bugzilla/show_bug.cgi?id=19089
>
>   "The kernel limits the length of the robust mutex list to 2048 entries.
>    This constant does not seem to be exported to user space."
>
> FWIW, the constant is defined in the UAPI futex header.
>
> The main concern here is not the actual number of futexes held by a task.
>
> The real issue is that the robust list could be circular by incident or
> malice and there is no way for the kernel to figure that out. That would
> prevent the task from exiting and make it iterate over the list until
> doomsday, i.e. a nice unprivileged DoS.
>
> So I fear the kernel cannot really help with this one.

I'm actually fine with treating ROBUST_LIST_LIMIT as an ABI constant.
It's just not clear to me if the constant has this status today.  I
suspect it was just split from the implementation headers at one point.
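
If it is declared ABI, the missing check from bug 19089 is simple to add
on our side: refuse to acquire more robust mutexes than the kernel will
walk at exit. Roughly this kind of accounting (hypothetical names, a
sketch, not actual glibc code):

	#include <errno.h>

	#define ROBUST_LIST_LIMIT 2048	/* from <linux/futex.h> */

	/* Hypothetical per-thread count of robust mutexes currently
	 * enqueued on this thread's robust list. */
	static __thread unsigned int robust_held;

	static int robust_enqueue_check(void)
	{
		/* Past the limit the kernel silently stops walking the
		 * list at exit and robustness is lost; fail loudly. */
		if (robust_held >= ROBUST_LIST_LIMIT)
			return EAGAIN;
		robust_held++;
		return 0;
	}

	static void robust_dequeue(void)
	{
		robust_held--;
	}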

Thanks,
Florian


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [RFC v2 PATCH] futex: extend set_robust_list to allow 2 locking ABIs at the same time.
  2019-11-05 14:33             ` Florian Weimer
@ 2019-11-05 14:48               ` Thomas Gleixner
  0 siblings, 0 replies; 29+ messages in thread
From: Thomas Gleixner @ 2019-11-05 14:48 UTC (permalink / raw)
  To: Florian Weimer
  Cc: Carlos O'Donell, Shawn Landden, libc-alpha, linux-api, LKML,
	Arnd Bergmann, Deepa Dinamani, Oleg Nesterov, Andrew Morton,
	Catalin Marinas, Keith Packard, Peter Zijlstra

On Tue, 5 Nov 2019, Florian Weimer wrote:
> * Thomas Gleixner:
> 
> > On Tue, 5 Nov 2019, Carlos O'Donell wrote:
> >> On 11/5/19 6:56 AM, Thomas Gleixner wrote:
> >> The other issue is this:
> >> 
> >> "Robust mutexes do not take ROBUST_LIST_LIMIT into account"
> >> https://sourceware.org/bugzilla/show_bug.cgi?id=19089
> >
> >   "The kernel limits the length of the robust mutex list to 2048 entries.
> >    This constant does not seem to be exported to user space."
> >
> > FWIW, the constant is defined in the UAPI futex header.
> >
> > The main concern here is not the actual number of futexes held by a task.
> >
> > The real issue is that the robust list could be circular by incident or
> > malice and there is no way for the kernel to figure that out. That would
> > prevent the task from exiting and make it iterate over the list until
> > doomsday, i.e. a nice unprivileged DoS.
> >
> > So I fear the kernel cannot really help with this one.
> 
> I'm actually fine with treating ROBUST_LIST_LIMIT as an ABI constant.
> It's just not clear to me if the constant has this status today.  I
> suspect it was just split from the implementation headers at one point.

Yes, but we really can declare it as an ABI constant.

I think the limit is reasonably sized. But I'm not familiar with the lock
nesting expectations of insanely big enterprise applications.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [RFC v2 PATCH] futex: extend set_robust_list to allow 2 locking ABIs at the same time.
  2019-11-05 14:27           ` Florian Weimer
@ 2019-11-05 14:53             ` Thomas Gleixner
  0 siblings, 0 replies; 29+ messages in thread
From: Thomas Gleixner @ 2019-11-05 14:53 UTC (permalink / raw)
  To: Florian Weimer
  Cc: Carlos O'Donell, Shawn Landden, libc-alpha, linux-api, LKML,
	Arnd Bergmann, Deepa Dinamani, Oleg Nesterov, Andrew Morton,
	Catalin Marinas, Keith Packard, Peter Zijlstra

On Tue, 5 Nov 2019, Florian Weimer wrote:
> * Carlos O'Donell:
> > "Robust mutexes do not take ROBUST_LIST_LIMIT into account"
> > https://sourceware.org/bugzilla/show_bug.cgi?id=19089
> 
> That's just a missing check in our implementation and something that few
> applications will encounter, if any.  There is this one here:
> 
>   <https://sourceware.org/bugzilla/show_bug.cgi?id=19004>
> 
> It contains a kernel patch.
> 
> I thought that there were more issues in the current implementation, but
> I can't find a record of them. 8-(

There is a nasty one in my inbox with a kernel patch fixing it, which I
still need to review with all futex brain cells activated:

  https://lore.kernel.org/r/1572573789-16557-1-git-send-email-wang.yi59@zte.com.cn

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 29+ messages in thread

* handle_exit_race && PF_EXITING
  2019-11-05  9:59   ` Thomas Gleixner
  2019-11-05 10:06     ` Florian Weimer
@ 2019-11-05 15:27     ` Oleg Nesterov
  2019-11-05 17:28       ` Thomas Gleixner
  1 sibling, 1 reply; 29+ messages in thread
From: Oleg Nesterov @ 2019-11-05 15:27 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Florian Weimer, Shawn Landden, libc-alpha, linux-api, LKML,
	Arnd Bergmann, Deepa Dinamani, Andrew Morton, Catalin Marinas,
	Keith Packard, Peter Zijlstra

On 11/05, Thomas Gleixner wrote:
>
> Out of curiosity, what's the race issue vs. robust list which you are
> trying to solve?

Off-topic, but this reminds me...

	#include <sched.h>
	#include <assert.h>
	#include <unistd.h>
	#include <syscall.h>

	#define FUTEX_LOCK_PI		6

	int main(void)
	{
		struct sched_param sp = {};

		sp.sched_priority = 2;
		assert(sched_setscheduler(0, SCHED_FIFO, &sp) == 0);

		int lock = vfork();	/* parent resumes with lock == child TID */
		if (!lock) {
			/* child: drop below the parent's priority, then exit */
			sp.sched_priority = 1;
			assert(sched_setscheduler(0, SCHED_FIFO, &sp) == 0);
			_exit(0);
		}

		/* the futex word now holds the TID of the exiting child */
		syscall(__NR_futex, &lock, FUTEX_LOCK_PI, 0,0,0);
		return 0;
	}

this creates the unkillable RT process spinning in futex_lock_pi() on
a single CPU machine (or you can use taskset).
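
(To reproduce, saved as, say, repro.c, something like:

	$ gcc repro.c -o repro
	$ sudo taskset -c 0 ./repro

assuming root for SCHED_FIFO.)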

Probably the patch below makes sense anyway, but of course it doesn't
solve the real problem: futex_lock_pi() should not spin in this case.

It seems to me I even sent a fix long ago, but I can't recall what
exactly it did. Probably the PF_EXITING check in attach_to_pi_owner()
must simply die, I'll try to recall...

Oleg.

--- x/kernel/futex.c
+++ x/kernel/futex.c
@@ -2842,10 +2842,12 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
 			 *   exit to complete.
 			 * - The user space value changed.
 			 */
-			queue_unlock(hb);
-			put_futex_key(&q.key);
-			cond_resched();
-			goto retry;
+			if (!fatal_signal_pending(current)) {
+				queue_unlock(hb);
+				put_futex_key(&q.key);
+				cond_resched();
+				goto retry;
+			}
 		default:
 			goto out_unlock_put_key;
 		}


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: handle_exit_race && PF_EXITING
  2019-11-05 15:27     ` handle_exit_race && PF_EXITING Oleg Nesterov
@ 2019-11-05 17:28       ` Thomas Gleixner
  2019-11-05 17:59         ` Thomas Gleixner
  0 siblings, 1 reply; 29+ messages in thread
From: Thomas Gleixner @ 2019-11-05 17:28 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Florian Weimer, Shawn Landden, libc-alpha, linux-api, LKML,
	Arnd Bergmann, Deepa Dinamani, Andrew Morton, Catalin Marinas,
	Keith Packard, Peter Zijlstra

On Tue, 5 Nov 2019, Oleg Nesterov wrote:
> On 11/05, Thomas Gleixner wrote:
> >
> > Out of curiosity, what's the race issue vs. robust list which you are
> > trying to solve?
> 
> Off-topic, but this reminds me...
> 
> 	#include <sched.h>
> 	#include <assert.h>
> 	#include <unistd.h>
> 	#include <syscall.h>
> 
> 	#define FUTEX_LOCK_PI		6
> 
> 	int main(void)
> 	{
> 		struct sched_param sp = {};
> 
> 		sp.sched_priority = 2;
> 		assert(sched_setscheduler(0, SCHED_FIFO, &sp) == 0);
> 
> 		int lock = vfork();
> 		if (!lock) {
> 			sp.sched_priority = 1;
> 			assert(sched_setscheduler(0, SCHED_FIFO, &sp) == 0);
> 			_exit(0);
> 		}
> 
> 		syscall(__NR_futex, &lock, FUTEX_LOCK_PI, 0,0,0);
> 		return 0;
> 	}
> 
> this creates the unkillable RT process spinning in futex_lock_pi() on
> a single CPU machine (or you can use taskset).

Uuurgh.

> Probably the patch below makes sense anyway, but of course it doesn't
> solve the real problem: futex_lock_pi() should not spin in this case.

Obviously not.

> It seems to me I even sent the fix a long ago, but I can't recall what
> exactly it did. Probably the PF_EXITING check in attach_to_pi_owner()
> must simply die, I'll try to recall...

We can't do that. The task might have released the futex already, so the
waiter would operate on inconsistent state :(

The problem with that exit race is that there is no form of serialization
which might be used to address that. We can't abuse any of the task locks
for this.

I was working on a patch set some time ago to fix another more esoteric
futex exit issue. Let me resurrect that.

Vs. the signal pending check. That makes sense on its own, but we should
not restrict it to fatal signals. Futexes are interruptible by definition.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: handle_exit_race && PF_EXITING
  2019-11-05 17:28       ` Thomas Gleixner
@ 2019-11-05 17:59         ` Thomas Gleixner
  2019-11-05 18:56           ` Thomas Gleixner
  0 siblings, 1 reply; 29+ messages in thread
From: Thomas Gleixner @ 2019-11-05 17:59 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Florian Weimer, Shawn Landden, libc-alpha, linux-api, LKML,
	Arnd Bergmann, Deepa Dinamani, Andrew Morton, Catalin Marinas,
	Keith Packard, Peter Zijlstra

On Tue, 5 Nov 2019, Thomas Gleixner wrote:

> On Tue, 5 Nov 2019, Oleg Nesterov wrote:
> > On 11/05, Thomas Gleixner wrote:
> > >
> > > Out of curiosity, what's the race issue vs. robust list which you are
> > > trying to solve?
> > 
> > Off-topic, but this reminds me...
> > 
> > 	#include <sched.h>
> > 	#include <assert.h>
> > 	#include <unistd.h>
> > 	#include <syscall.h>
> > 
> > 	#define FUTEX_LOCK_PI		6
> > 
> > 	int main(void)
> > 	{
> > 		struct sched_param sp = {};
> > 
> > 		sp.sched_priority = 2;
> > 		assert(sched_setscheduler(0, SCHED_FIFO, &sp) == 0);
> > 
> > 		int lock = vfork();
> > 		if (!lock) {
> > 			sp.sched_priority = 1;
> > 			assert(sched_setscheduler(0, SCHED_FIFO, &sp) == 0);
> > 			_exit(0);
> > 		}
> > 
> > 		syscall(__NR_futex, &lock, FUTEX_LOCK_PI, 0,0,0);
> > 		return 0;
> > 	}
> > 
> > this creates the unkillable RT process spinning in futex_lock_pi() on
> > a single CPU machine (or you can use taskset).
> 
> Uuurgh.

But staring more at it. That's a scheduler bug.

parent	    	    	child

 set FIFO prio 2

 fork()	         ->	set FIFO prio 1
 		 	 sched_setscheduler(...)
			   return from syscall		<= BUG

 		 	_exit()

When the child lowers its priority from 2 to 1, then the parent _must_
preempt the child simply because the parent is now the top priority task on
that CPU. Child should never reach exit before the parent blocks on the
futex.

Peter?

What's even more disturbing is that even with that bug happening the child
is able to set PF_EXITING, but not PF_EXITPIDONE. That doesn't make sense.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: handle_exit_race && PF_EXITING
  2019-11-05 17:59         ` Thomas Gleixner
@ 2019-11-05 18:56           ` Thomas Gleixner
  2019-11-05 19:19             ` Thomas Gleixner
  0 siblings, 1 reply; 29+ messages in thread
From: Thomas Gleixner @ 2019-11-05 18:56 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Florian Weimer, Shawn Landden, libc-alpha, linux-api, LKML,
	Arnd Bergmann, Deepa Dinamani, Andrew Morton, Catalin Marinas,
	Keith Packard, Peter Zijlstra

On Tue, 5 Nov 2019, Thomas Gleixner wrote:
> On Tue, 5 Nov 2019, Thomas Gleixner wrote:
> > On Tue, 5 Nov 2019, Oleg Nesterov wrote:
> > > On 11/05, Thomas Gleixner wrote:
> > > >
> > > > Out of curiosity, what's the race issue vs. robust list which you are
> > > > trying to solve?
> > > 
> > > Off-topic, but this reminds me...
> > > 
> > > 	#include <sched.h>
> > > 	#include <assert.h>
> > > 	#include <unistd.h>
> > > 	#include <syscall.h>
> > > 
> > > 	#define FUTEX_LOCK_PI		6
> > > 
> > > 	int main(void)
> > > 	{
> > > 		struct sched_param sp = {};
> > > 
> > > 		sp.sched_priority = 2;
> > > 		assert(sched_setscheduler(0, SCHED_FIFO, &sp) == 0);
> > > 
> > > 		int lock = vfork();
> > > 		if (!lock) {
> > > 			sp.sched_priority = 1;
> > > 			assert(sched_setscheduler(0, SCHED_FIFO, &sp) == 0);
> > > 			_exit(0);
> > > 		}
> > > 
> > > 		syscall(__NR_futex, &lock, FUTEX_LOCK_PI, 0,0,0);
> > > 		return 0;
> > > 	}
> > > 
> > > this creates the unkillable RT process spinning in futex_lock_pi() on
> > > a single CPU machine (or you can use taskset).
> > 
> > Uuurgh.
> 
> But staring more at it. That's a scheduler bug.
> 
> parent	    	    	child
> 
>  set FIFO prio 2
> 
>  fork()	         ->	set FIFO prio 1
>  		 	 sched_setscheduler(...)
> 			   return from syscall		<= BUG
> 
>  		 	_exit()
> 
> When the child lowers its priority from 2 to 1, then the parent _must_
> preempt the child simply because the parent is now the top priority task on
> that CPU. Child should never reach exit before the parent blocks on the
> futex.

I'm a moron. It's vfork() not fork() so the behaviour is expected.

Staring more at the trace which shows me where this goes down the drain.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: handle_exit_race && PF_EXITING
  2019-11-05 18:56           ` Thomas Gleixner
@ 2019-11-05 19:19             ` Thomas Gleixner
  2019-11-06  8:55               ` Oleg Nesterov
  0 siblings, 1 reply; 29+ messages in thread
From: Thomas Gleixner @ 2019-11-05 19:19 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Florian Weimer, Shawn Landden, libc-alpha, linux-api, LKML,
	Arnd Bergmann, Deepa Dinamani, Andrew Morton, Catalin Marinas,
	Keith Packard, Peter Zijlstra

On Tue, 5 Nov 2019, Thomas Gleixner wrote:
> 
> I'm a moron. It's vfork() not fork() so the behaviour is expected.
> 
> Staring more at the trace which shows me where this goes down the drain.

 parent	    	    	child
 
  set FIFO prio 2
 
  vfork()			->	set FIFO prio 1
   implies wait_for_child()	 	sched_setscheduler(...)
 			   		exit()
					do_exit()
					tsk->flags |= PF_EXITING;
 					....
					mm_release()
					  exit_futex(); (NOOP in this case)
					  complete() --> wakes parent
 sys_futex()
    loop infinite because
    	 PF_EXITING is set,
	 but PF_EXITPIDONE not

So the obvious question is why PF_EXITPIDONE is set way after the futex
exit cleanup has run, but moving this right after exit_futex() would not
solve the exit race completely because the code after setting PF_EXITING is
preemptible. So the same crap could happen just by preemption:

  task holds futex
  ...
  do_exit()
    tsk->flags |= PF_EXITING;

preemption (unrelated wakeup of some other higher prio task, e.g. timer)

  switch_to(other_task)

  return to user
  sys_futex()
	loop infinite as above

And just for the fun of it the futex exit cleanup could trigger the wakeup
itself before PF_EXITPIDONE is set.

There is some other issue which I need to look up again. That's a slightly
different problem but related to futex exit race conditions.

The way we can deal with that is:

    do_exit()
    tsk->flags |= PF_EXITING;
    ...
    mutex_lock(&tsk->futex_exit_mutex);
    futex_exit();
    tsk->flags |= PF_EXITPIDONE;
    mutex_unlock(&tsk->futex_exit_mutex);
    
and on the futex lock_pi side:

    if (!(tsk->flags & PF_EXITING))
    	return 0;		<- All good

    if (tsk->flags & PF_EXITPIDONE)
        return -EOWNERDEAD;	<- Locker can take over

    mutex_lock(&tsk->futex_exit_mutex);
    if (tsk->flags & PF_EXITPIDONE) {
        mutex_unlock(&tsk->futex_exit_mutex);
        return -EOWNERDEAD;	<- Locker can take over
    }

    queue_futex();
    mutex_unlock(&tsk->futex_exit_mutex);

Not that I think it's pretty, but it plugs all holes AFAICT.
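
For illustration, the same check/lock/recheck pattern rendered as
self-contained user-space C (made-up names, pthreads standing in for the
kernel-side mutex; a sketch of the idea, not the eventual implementation):

	#include <pthread.h>
	#include <stdatomic.h>

	#define EXITING		0x1
	#define EXITPIDONE	0x2

	struct task {
		atomic_int flags;
		pthread_mutex_t exit_mutex;	/* PTHREAD_MUTEX_INITIALIZER */
	};

	static void task_exit(struct task *t)
	{
		atomic_fetch_or(&t->flags, EXITING);
		/* ... other exit work, possibly preempted here ... */
		pthread_mutex_lock(&t->exit_mutex);
		/* futex_exit(t): clean up held pi futexes */
		atomic_fetch_or(&t->flags, EXITPIDONE);
		pthread_mutex_unlock(&t->exit_mutex);
	}

	/* 0: owner alive, attach.  -1: cleanup done, caller takes over.
	 * 1: owner mid-exit, we queued and will be woken, no spinning. */
	static int attach_to_owner(struct task *t)
	{
		int flags = atomic_load(&t->flags);

		if (!(flags & EXITING))
			return 0;
		if (flags & EXITPIDONE)
			return -1;

		/* Block on the mutex until the exit cleanup finishes,
		 * instead of busy-looping on -EAGAIN. */
		pthread_mutex_lock(&t->exit_mutex);
		if (atomic_load(&t->flags) & EXITPIDONE) {
			pthread_mutex_unlock(&t->exit_mutex);
			return -1;
		}
		/* queue_futex(): cleanup has not started, it will see us */
		pthread_mutex_unlock(&t->exit_mutex);
		return 1;
	}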

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: handle_exit_race && PF_EXITING
  2019-11-05 19:19             ` Thomas Gleixner
@ 2019-11-06  8:55               ` Oleg Nesterov
  2019-11-06  9:53                 ` Thomas Gleixner
  0 siblings, 1 reply; 29+ messages in thread
From: Oleg Nesterov @ 2019-11-06  8:55 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Florian Weimer, Shawn Landden, libc-alpha, linux-api, LKML,
	Arnd Bergmann, Deepa Dinamani, Andrew Morton, Catalin Marinas,
	Keith Packard, Peter Zijlstra

On 11/05, Thomas Gleixner wrote:
>
>  sys_futex()
>     loop infinite because
>     	 PF_EXITING is set,
> 	 but PF_EXITPIDONE not

Yes.

IOW, the problem is very simple. RT task preempts the exiting lock owner
after it sets PF_EXITING but before it sets PF_EXITPIDONE; if they run on
the same CPU, futex_lock_pi() will spin forever.

> So the obvious question is why PF_EXITPIDONE is set way after the futex
> exit cleanup has run,

Another obvious question is why this code checks PF_EXITING. I still think
it should not.

> The way we can deal with that is:
>
>     do_exit()
>     tsk->flags |= PF_EXITING;
>     ...
>     mutex_lock(&tsk->futex_exit_mutex);
>     futex_exit();
>     tsk->flags |= PF_EXITPIDONE;
>     mutex_unlock(&tsk->futex_exit_mutex);
>
> and on the futex lock_pi side:
>
>     if (!(tsk->flags & PF_EXITING))
>     	return 0;		<- All good
>
>     if (tsk->flags & PF_EXITPIDONE)
>         return -EOWNERDEAD;	<- Locker can take over
>
>     mutex_lock(&tsk->futex_exit_mutex);
>     if (tsk->flags & PF_EXITPIDONE) {
>         mutex_unlock(&tsk->futex_exit_mutex);
>         return -EOWNERDEAD;	<- Locker can take over
>     }
>
>     queue_futex();
>     mutex_unlock(&tsk->futex_exit_mutex);
>
> Not that I think it's pretty, but it plugs all holes AFAICT.

I have found the fix I sent in 2015, attached below. I forgot everything
I knew about futex.c, so I need some time to adapt it to the current code.

But I think it is clear what this patch tries to do, do you see any hole?

Oleg.

[PATCH] futex: don't spin waiting for PF_EXITING -> PF_EXITPIDONE transition

It is absolutely not clear why attach_to_pi_owner() returns -EAGAIN which
triggers "retry" if the lock owner is PF_EXITING but not PF_EXITPIDONE.
This burns CPU for no reason and this can even livelock if the rt_task()
caller preempts a PF_EXITING owner.

Remove the PF_EXITING check altogether. We do not care if it is exiting,
all we need to know is whether we can rely on exit_pi_state_list() or not. So we
also need to set PF_EXITPIDONE before we flush ->pi_state_list and call
exit_pi_state_list() unconditionally.

Perhaps we can add the fast-path list_empty() check in mm_release() back,
but let's fix the problem first. Besides, in theory this check is already
not correct, at least it should be list_empty_careful() to avoid the race
with free_pi_state() in progress.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 kernel/exit.c  |   22 +---------------------
 kernel/fork.c  |    3 +--
 kernel/futex.c |   40 ++++++++++------------------------------
 3 files changed, 12 insertions(+), 53 deletions(-)

diff --git a/kernel/exit.c b/kernel/exit.c
index 6806c55..bc969ed 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -683,27 +683,13 @@ void do_exit(long code)
 	 */
 	if (unlikely(tsk->flags & PF_EXITING)) {
 		pr_alert("Fixing recursive fault but reboot is needed!\n");
-		/*
-		 * We can do this unlocked here. The futex code uses
-		 * this flag just to verify whether the pi state
-		 * cleanup has been done or not. In the worst case it
-		 * loops once more. We pretend that the cleanup was
-		 * done as there is no way to return. Either the
-		 * OWNER_DIED bit is set by now or we push the blocked
-		 * task into the wait for ever nirwana as well.
-		 */
+		/* Avoid the new additions to ->pi_state_list at least */
 		tsk->flags |= PF_EXITPIDONE;
 		set_current_state(TASK_UNINTERRUPTIBLE);
 		schedule();
 	}
 
 	exit_signals(tsk);  /* sets PF_EXITING */
-	/*
-	 * tsk->flags are checked in the futex code to protect against
-	 * an exiting task cleaning up the robust pi futexes.
-	 */
-	smp_mb();
-	raw_spin_unlock_wait(&tsk->pi_lock);
 
 	if (unlikely(in_atomic()))
 		pr_info("note: %s[%d] exited with preempt_count %d\n",
@@ -779,12 +765,6 @@ void do_exit(long code)
 	 * Make sure we are holding no locks:
 	 */
 	debug_check_no_locks_held();
-	/*
-	 * We can do this unlocked here. The futex code uses this flag
-	 * just to verify whether the pi state cleanup has been done
-	 * or not. In the worst case it loops once more.
-	 */
-	tsk->flags |= PF_EXITPIDONE;
 
 	if (tsk->io_context)
 		exit_io_context(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index 4dc2dda..ec3208e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -803,8 +803,7 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
 		tsk->compat_robust_list = NULL;
 	}
 #endif
-	if (unlikely(!list_empty(&tsk->pi_state_list)))
-		exit_pi_state_list(tsk);
+	exit_pi_state_list(tsk);
 #endif
 
 	uprobe_free_utask(tsk);
diff --git a/kernel/futex.c b/kernel/futex.c
index b101381..c1104a8 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -716,11 +716,13 @@ void exit_pi_state_list(struct task_struct *curr)
 
 	if (!futex_cmpxchg_enabled)
 		return;
+
 	/*
-	 * We are a ZOMBIE and nobody can enqueue itself on
-	 * pi_state_list anymore, but we have to be careful
-	 * versus waiters unqueueing themselves:
+	 * attach_to_pi_owner() can no longer add the new entry. But
+	 * we have to be careful versus waiters unqueueing themselves.
 	 */
+	curr->flags |= PF_EXITPIDONE;
+
 	raw_spin_lock_irq(&curr->pi_lock);
 	while (!list_empty(head)) {
 
@@ -905,24 +907,12 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
 		return -EPERM;
 	}
 
-	/*
-	 * We need to look at the task state flags to figure out,
-	 * whether the task is exiting. To protect against the do_exit
-	 * change of the task flags, we do this protected by
-	 * p->pi_lock:
-	 */
 	raw_spin_lock_irq(&p->pi_lock);
-	if (unlikely(p->flags & PF_EXITING)) {
-		/*
-		 * The task is on the way out. When PF_EXITPIDONE is
-		 * set, we know that the task has finished the
-		 * cleanup:
-		 */
-		int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN;
-
+	if (unlikely(p->flags & PF_EXITPIDONE)) {
+		/* exit_pi_state_list() was already called */
 		raw_spin_unlock_irq(&p->pi_lock);
 		put_task_struct(p);
-		return ret;
+		return -ESRCH;
 	}
 
 	/*
@@ -1633,12 +1623,7 @@ retry_private:
 				goto retry;
 			goto out;
 		case -EAGAIN:
-			/*
-			 * Two reasons for this:
-			 * - Owner is exiting and we just wait for the
-			 *   exit to complete.
-			 * - The user space value changed.
-			 */
+			/* The user space value changed. */
 			free_pi_state(pi_state);
 			pi_state = NULL;
 			double_unlock_hb(hb1, hb2);
@@ -2295,12 +2280,7 @@ retry_private:
 		case -EFAULT:
 			goto uaddr_faulted;
 		case -EAGAIN:
-			/*
-			 * Two reasons for this:
-			 * - Task is exiting and we just wait for the
-			 *   exit to complete.
-			 * - The user space value changed.
-			 */
+			/* The user space value changed. */
 			queue_unlock(hb);
 			put_futex_key(&q.key);
 			cond_resched();
-- 
1.5.5.1




^ permalink raw reply related	[flat|nested] 29+ messages in thread

* Re: handle_exit_race && PF_EXITING
  2019-11-06  8:55               ` Oleg Nesterov
@ 2019-11-06  9:53                 ` Thomas Gleixner
  2019-11-06 10:35                   ` Oleg Nesterov
  0 siblings, 1 reply; 29+ messages in thread
From: Thomas Gleixner @ 2019-11-06  9:53 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Florian Weimer, Shawn Landden, libc-alpha, linux-api, LKML,
	Arnd Bergmann, Deepa Dinamani, Andrew Morton, Catalin Marinas,
	Keith Packard, Peter Zijlstra

Oleg,

On Wed, 6 Nov 2019, Oleg Nesterov wrote:
> I have found the fix I sent in 2015, attached below. I forgot everything
> I knew about futex.c, so I need some time to adapt it to the current code.
> 
> But I think it is clear what this patch tries to do, do you see any hole?

> @@ -716,11 +716,13 @@ void exit_pi_state_list(struct task_struct *curr)
>  
>  	if (!futex_cmpxchg_enabled)
>  		return;
> +
>  	/*
> -	 * We are a ZOMBIE and nobody can enqueue itself on
> -	 * pi_state_list anymore, but we have to be careful
> -	 * versus waiters unqueueing themselves:
> +	 * attach_to_pi_owner() can no longer add the new entry. But
> +	 * we have to be careful versus waiters unqueueing themselves.
>  	 */
> +	curr->flags |= PF_EXITPIDONE;

This obviously would need a barrier or would have to be moved inside of the
pi_lock region.

>  	raw_spin_lock_irq(&curr->pi_lock);
>  	while (!list_empty(head)) {
>  
> @@ -905,24 +907,12 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
>  		return -EPERM;
>  	}
>  
> -	/*
> -	 * We need to look at the task state flags to figure out,
> -	 * whether the task is exiting. To protect against the do_exit
> -	 * change of the task flags, we do this protected by
> -	 * p->pi_lock:
> -	 */
>  	raw_spin_lock_irq(&p->pi_lock);
> -	if (unlikely(p->flags & PF_EXITING)) {
> -		/*
> -		 * The task is on the way out. When PF_EXITPIDONE is
> -		 * set, we know that the task has finished the
> -		 * cleanup:
> -		 */
> -		int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN;
> -
> +	if (unlikely(p->flags & PF_EXITPIDONE)) {
> +		/* exit_pi_state_list() was already called */
>  		raw_spin_unlock_irq(&p->pi_lock);
>  		put_task_struct(p);
> -		return ret;
> +		return -ESRCH;

But, this is incorrect because we'd return -ESRCH to user space while the
futex value still has the TID of the exiting task set which will
subsequently cleanout the futex and set the owner died bit.

The result is inconsistent state and will trigger the asserts in the futex
test suite and in the pthread_mutex implementation.

The only reason why -ESRCH can be returned is when the user space value of
the futex contains garbage. But in this case it does not contain garbage
and returning -ESRCH violates the implicit robustness guarantee of PI
futexes and causes unexpected havoc.

See da791a667536 ("futex: Cure exit race") for example.

The futex PI contract between kernel and user space relies on consistent
state. Guess why that code has more corner case handling than actual
functionality. :)

Thanks,

	tglx


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: handle_exit_race && PF_EXITING
  2019-11-06  9:53                 ` Thomas Gleixner
@ 2019-11-06 10:35                   ` Oleg Nesterov
  2019-11-06 11:07                     ` Thomas Gleixner
  0 siblings, 1 reply; 29+ messages in thread
From: Oleg Nesterov @ 2019-11-06 10:35 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Florian Weimer, Shawn Landden, libc-alpha, linux-api, LKML,
	Arnd Bergmann, Deepa Dinamani, Andrew Morton, Catalin Marinas,
	Keith Packard, Peter Zijlstra

On 11/06, Thomas Gleixner wrote:
>
> > @@ -716,11 +716,13 @@ void exit_pi_state_list(struct task_struct *curr)
> >
> >  	if (!futex_cmpxchg_enabled)
> >  		return;
> > +
> >  	/*
> > -	 * We are a ZOMBIE and nobody can enqueue itself on
> > -	 * pi_state_list anymore, but we have to be careful
> > -	 * versus waiters unqueueing themselves:
> > +	 * attach_to_pi_owner() can no longer add the new entry. But
> > +	 * we have to be careful versus waiters unqueueing themselves.
> >  	 */
> > +	curr->flags |= PF_EXITPIDONE;
>
> This obviously would need a barrier or would have to be moved inside of the
> pi_lock region.

probably yes,

> > +	if (unlikely(p->flags & PF_EXITPIDONE)) {
> > +		/* exit_pi_state_list() was already called */
> >  		raw_spin_unlock_irq(&p->pi_lock);
> >  		put_task_struct(p);
> > -		return ret;
> > +		return -ESRCH;
>
> But, this is incorrect because we'd return -ESRCH to user space while the
> futex value still has the TID of the exiting task set which will
> subsequently cleanout the futex and set the owner died bit.

Heh. Of course this is not correct. As I said, this patch should be adapted
to the current code. See below.

> See da791a667536 ("futex: Cure exit race") for example.

Thomas, I simply can't resist ;)

I reported this race when I sent this patch in 2015,

https://lore.kernel.org/lkml/20150205181014.GA20244@redhat.com/

but somehow that discussion died with no result.

> Guess why that code has more corner case handling than actual
> functionality. :)

I know why. To confuse me!

Oleg.


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: handle_exit_race && PF_EXITING
  2019-11-06 10:35                   ` Oleg Nesterov
@ 2019-11-06 11:07                     ` Thomas Gleixner
  2019-11-06 12:11                       ` Oleg Nesterov
  0 siblings, 1 reply; 29+ messages in thread
From: Thomas Gleixner @ 2019-11-06 11:07 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Florian Weimer, Shawn Landden, libc-alpha, linux-api, LKML,
	Arnd Bergmann, Deepa Dinamani, Andrew Morton, Catalin Marinas,
	Keith Packard, Peter Zijlstra

On Wed, 6 Nov 2019, Oleg Nesterov wrote:
> On 11/06, Thomas Gleixner wrote:
> > > +	if (unlikely(p->flags & PF_EXITPIDONE)) {
> > > +		/* exit_pi_state_list() was already called */
> > >  		raw_spin_unlock_irq(&p->pi_lock);
> > >  		put_task_struct(p);
> > > -		return ret;
> > > +		return -ESRCH;
> >
> > But, this is incorrect because we'd return -ESRCH to user space while the
> > futex value still has the TID of the exiting task set which will
> > subsequently cleanout the futex and set the owner died bit.
> 
> Heh. Of course this is not correct. As I said, this patch should be adapted
> to the current code. See below.
> 
> > See da791a667536 ("futex: Cure exit race") for example.
> 
> Thomas, I simply can't resist ;)
> 
> I reported this race when I sent this patch in 2015,
> 
> https://lore.kernel.org/lkml/20150205181014.GA20244@redhat.com/
> 
> but somehow that discussion died with no result.

Yes. I was not paying attention for some reason. Don't ask me what happened
in Feb. 2015 :)

But even if we adapt that patch to the current code it won't solve the
-ESRCH issue I described above.

> > Guess why that code has more corner case handling than actual
> > functionality. :)
> 
> I know why. To confuse me!

Of course. As Rusty said: "Futexes are also cursed"

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: handle_exit_race && PF_EXITING
  2019-11-06 11:07                     ` Thomas Gleixner
@ 2019-11-06 12:11                       ` Oleg Nesterov
  2019-11-06 13:38                         ` Thomas Gleixner
  2019-11-06 17:42                         ` Thomas Gleixner
  0 siblings, 2 replies; 29+ messages in thread
From: Oleg Nesterov @ 2019-11-06 12:11 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Florian Weimer, Shawn Landden, libc-alpha, linux-api, LKML,
	Arnd Bergmann, Deepa Dinamani, Andrew Morton, Catalin Marinas,
	Keith Packard, Peter Zijlstra

On 11/06, Thomas Gleixner wrote:
>
> But even if we adapt that patch to the current code it won't solve the
> -ESRCH issue I described above.

Hmm. I must have missed something. Why?

Please see the unfinished/untested patch below.

I think that (with or without this fix) handle_exit_race() logic needs
cleanups, there is no reason for get_futex_value_locked(), we can drop
->pi_lock right after we see PF_EXITPIDONE. Let's discuss this later.

But why can we not rely on handle_exit_race() without the PF_EXITING check?

Yes, exit_robust_list() is called before exit_pi_state_list() which (with
this patch) sets PF_EXITPIDONE. But this is fine? handle_futex_death()
doesn't wake up pi futexes, exit_pi_state_list() does this.

Could you explain?

Oleg.


diff --git a/kernel/exit.c b/kernel/exit.c
index a46a50d..a6c788c 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -761,17 +761,6 @@ void __noreturn do_exit(long code)
 	}
 
 	exit_signals(tsk);  /* sets PF_EXITING */
-	/*
-	 * Ensure that all new tsk->pi_lock acquisitions must observe
-	 * PF_EXITING. Serializes against futex.c:attach_to_pi_owner().
-	 */
-	smp_mb();
-	/*
-	 * Ensure that we must observe the pi_state in exit_mm() ->
-	 * mm_release() -> exit_pi_state_list().
-	 */
-	raw_spin_lock_irq(&tsk->pi_lock);
-	raw_spin_unlock_irq(&tsk->pi_lock);
 
 	if (unlikely(in_atomic())) {
 		pr_info("note: %s[%d] exited with preempt_count %d\n",
@@ -846,12 +835,6 @@ void __noreturn do_exit(long code)
 	 * Make sure we are holding no locks:
 	 */
 	debug_check_no_locks_held();
-	/*
-	 * We can do this unlocked here. The futex code uses this flag
-	 * just to verify whether the pi state cleanup has been done
-	 * or not. In the worst case it loops once more.
-	 */
-	tsk->flags |= PF_EXITPIDONE;
 
 	if (tsk->io_context)
 		exit_io_context(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index bcdf531..0d8edcf 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1297,8 +1297,7 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
 		tsk->compat_robust_list = NULL;
 	}
 #endif
-	if (unlikely(!list_empty(&tsk->pi_state_list)))
-		exit_pi_state_list(tsk);
+	exit_pi_state_list(tsk);
 #endif
 
 	uprobe_free_utask(tsk);
diff --git a/kernel/futex.c b/kernel/futex.c
index bd18f60..c3b5d33 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -905,6 +905,7 @@ void exit_pi_state_list(struct task_struct *curr)
 	 * versus waiters unqueueing themselves:
 	 */
 	raw_spin_lock_irq(&curr->pi_lock);
+	curr->flags |= PF_EXITPIDONE;
 	while (!list_empty(head)) {
 		next = head->next;
 		pi_state = list_entry(next, struct futex_pi_state, list);
@@ -1169,18 +1170,11 @@ static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
 	return ret;
 }
 
-static int handle_exit_race(u32 __user *uaddr, u32 uval,
-			    struct task_struct *tsk)
+static int handle_exit_race(u32 __user *uaddr, u32 uval)
 {
 	u32 uval2;
 
 	/*
-	 * If PF_EXITPIDONE is not yet set, then try again.
-	 */
-	if (tsk && !(tsk->flags & PF_EXITPIDONE))
-		return -EAGAIN;
-
-	/*
 	 * Reread the user space value to handle the following situation:
 	 *
 	 * CPU0				CPU1
@@ -1245,7 +1239,7 @@ static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
 		return -EAGAIN;
 	p = find_get_task_by_vpid(pid);
 	if (!p)
-		return handle_exit_race(uaddr, uval, NULL);
+		return handle_exit_race(uaddr, uval);
 
 	if (unlikely(p->flags & PF_KTHREAD)) {
 		put_task_struct(p);
@@ -1259,13 +1253,13 @@ static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
 	 * p->pi_lock:
 	 */
 	raw_spin_lock_irq(&p->pi_lock);
-	if (unlikely(p->flags & PF_EXITING)) {
+	if (unlikely(p->flags & PF_EXITPIDONE)) {
 		/*
 		 * The task is on the way out. When PF_EXITPIDONE is
 		 * set, we know that the task has finished the
 		 * cleanup:
 		 */
-		int ret = handle_exit_race(uaddr, uval, p);
+		int ret = handle_exit_race(uaddr, uval);
 
 		raw_spin_unlock_irq(&p->pi_lock);
 		put_task_struct(p);


^ permalink raw reply related	[flat|nested] 29+ messages in thread

* Re: handle_exit_race && PF_EXITING
  2019-11-06 12:11                       ` Oleg Nesterov
@ 2019-11-06 13:38                         ` Thomas Gleixner
  2019-11-06 17:42                         ` Thomas Gleixner
  1 sibling, 0 replies; 29+ messages in thread
From: Thomas Gleixner @ 2019-11-06 13:38 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Florian Weimer, Shawn Landden, libc-alpha, linux-api, LKML,
	Arnd Bergmann, Deepa Dinamani, Andrew Morton, Catalin Marinas,
	Keith Packard, Peter Zijlstra

On Wed, 6 Nov 2019, Oleg Nesterov wrote:
> But why we can not rely on handle_exit_race() without PF_EXITING check?
> 
> Yes, exit_robust_list() is called before exit_pi_state_list() which (with
> this patch) sets PF_EXITPIDONE. But this is fine? handle_futex_death()
> doesn't wakeup pi futexes, exit_pi_state_list() does this.

I know. You still create inconsistent state because of this:

>  	raw_spin_lock_irq(&p->pi_lock);
> -	if (unlikely(p->flags & PF_EXITING)) {
> +	if (unlikely(p->flags & PF_EXITPIDONE)) {
>  		/*
>  		 * The task is on the way out. When PF_EXITPIDONE is
>  		 * set, we know that the task has finished the
>  		 * cleanup:
>  		 */
> -		int ret = handle_exit_race(uaddr, uval, p);
> +		int ret = handle_exit_race(uaddr, uval);
>  
>  		raw_spin_unlock_irq(&p->pi_lock);
>  		put_task_struct(p);

Same explanation as before just not prosa this time:

exit()					lock_pi(futex2)
  exit_pi_state_list()
   lock(tsk->pi_lock)
   tsk->flags |= PF_EXITPIDONE;		 attach_to_pi_owner()
					  ...
  // Loop unrolled for clarity
  while(!list_empty())			  lock(tsk->pi_lock);
     cleanup(futex1)
       unlock(tsk->pi_lock)
     ...			          if (tsk->flags & PF_EXITPIDONE)
					     ret = handle_exit_race()
					       if (uval != orig_uval)
					           return -EAGAIN;
					       return -ESRCH;

    cleanup(futex2)			  return to userspace err = -ESRCH
      update futex2->uval
      with new owner TID and set
      OWNER_DIED

    					 userspace handles -ESRCH

					 but futex2->uval has a valid TID
					 and OWNER_DIED set.

That's inconsistent state: the futex became a SNAFUtex and user space
cannot recover from that. At least not existing user space and we cannot
break that, right?

If the kernel returns -ESRCH then the futex value must be preserved (except
for the waiters bit being set, but that's not part of the problem at hand).

You cannot just look at the kernel state with futexes. We have to guarantee
consistent state between kernel and user space no matter what. And of
course we have to be careful about user space creating inconsistent state
for stupid or malicious reasons. See the whole state table above
attach_to_pi_state().

The only way to fix this live lock issue is to gracefully wait for the
cleanup to complete instead of busy looping.

Yes, it sucks, but futexes suck by definition.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [RFC v2 PATCH] futex: extend set_robust_list to allow 2 locking ABIs at the same time.
  2019-11-05 14:27           ` Thomas Gleixner
  2019-11-05 14:33             ` Florian Weimer
@ 2019-11-06 14:00             ` Zack Weinberg
  2019-11-06 14:04               ` Florian Weimer
  1 sibling, 1 reply; 29+ messages in thread
From: Zack Weinberg @ 2019-11-06 14:00 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Carlos O'Donell, Florian Weimer, Shawn Landden,
	GNU C Library, linux-api, LKML, Arnd Bergmann, Deepa Dinamani,
	Oleg Nesterov, Andrew Morton, Catalin Marinas, Keith Packard,
	Peter Zijlstra

On Tue, Nov 5, 2019 at 9:28 AM Thomas Gleixner <tglx@linutronix.de> wrote:
>
> The real issue is that the robust list could be circular by incident or
> malice and there is no way for the kernel to figure that out. That would
> prevent the task from exiting and make it iterate over the list until
> doomsday, i.e. a nice unpriviledged DoS.

Why can't the kernel use the standard tortoise-and-hare algorithm for
detecting circular linked lists here?

zw

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [RFC v2 PATCH] futex: extend set_robust_list to allow 2 locking ABIs at the same time.
  2019-11-06 14:00             ` Zack Weinberg
@ 2019-11-06 14:04               ` Florian Weimer
  0 siblings, 0 replies; 29+ messages in thread
From: Florian Weimer @ 2019-11-06 14:04 UTC (permalink / raw)
  To: Zack Weinberg
  Cc: Thomas Gleixner, Carlos O'Donell, Shawn Landden,
	GNU C Library, linux-api, LKML, Arnd Bergmann, Deepa Dinamani,
	Oleg Nesterov, Andrew Morton, Catalin Marinas, Keith Packard,
	Peter Zijlstra

* Zack Weinberg:

> On Tue, Nov 5, 2019 at 9:28 AM Thomas Gleixner <tglx@linutronix.de> wrote:
>>
>> The real issue is that the robust list could be circular by incident or
>> malice and there is no way for the kernel to figure that out. That would
>> prevent the task from exiting and make it iterate over the list until
>> doomsday, i.e. a nice unprivileged DoS.
>
> Why can't the kernel use the standard tortoise-and-hare algorithm for
> detecting circular linked lists here?

It's not guaranteed to terminate if the list is in shared memory.
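
For reference, the classic form of the algorithm, and why it does not
help here: Floyd's invariant only holds if the pointers are stable
during the walk, and with the list in shared memory another process can
keep re-linking nodes under the kernel, so neither termination nor a
correct answer is guaranteed. A self-contained sketch:

	#include <stdbool.h>

	struct node { struct node *next; };

	/* Floyd's tortoise-and-hare: O(n) time, O(1) space cycle
	 * detection -- assuming the list is immutable while walking. */
	static bool has_cycle(struct node *head)
	{
		struct node *slow = head, *fast = head;

		while (fast && fast->next) {
			slow = slow->next;		/* one step */
			fast = fast->next->next;	/* two steps */
			if (slow == fast)
				return true;		/* met inside a cycle */
		}
		return false;				/* reached the end */
	}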

Thanks,
Florian


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: handle_exit_race && PF_EXITING
  2019-11-06 12:11                       ` Oleg Nesterov
  2019-11-06 13:38                         ` Thomas Gleixner
@ 2019-11-06 17:42                         ` Thomas Gleixner
  2019-11-07 15:51                           ` Oleg Nesterov
  1 sibling, 1 reply; 29+ messages in thread
From: Thomas Gleixner @ 2019-11-06 17:42 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Florian Weimer, Shawn Landden, libc-alpha, linux-api, LKML,
	Arnd Bergmann, Deepa Dinamani, Andrew Morton, Catalin Marinas,
	Keith Packard, Peter Zijlstra

On Wed, 6 Nov 2019, Oleg Nesterov wrote:
> 
> I think that (with or without this fix) handle_exit_race() logic needs
> cleanups, there is no reason for get_futex_value_locked(), we can drop
> ->pi_lock right after we see PF_EXITPIDONE. Lets discuss this later.

Which still runs in atomic context because the hash bucket lock is held, ergo
get_futex_value_locked() needs to stay for now.

So the only thing we could do is to reduce the pi_lock held section a bit.

Thanks,

	tglx


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: handle_exit_race && PF_EXITING
  2019-11-06 17:42                         ` Thomas Gleixner
@ 2019-11-07 15:51                           ` Oleg Nesterov
  0 siblings, 0 replies; 29+ messages in thread
From: Oleg Nesterov @ 2019-11-07 15:51 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Florian Weimer, Shawn Landden, libc-alpha, linux-api, LKML,
	Arnd Bergmann, Deepa Dinamani, Andrew Morton, Catalin Marinas,
	Keith Packard, Peter Zijlstra

On 11/06, Thomas Gleixner wrote:
>
> On Wed, 6 Nov 2019, Oleg Nesterov wrote:
> >
> > I think that (with or without this fix) handle_exit_race() logic needs
> > cleanups, there is no reason for get_futex_value_locked(), we can drop
> > ->pi_lock right after we see PF_EXITPIDONE. Let's discuss this later.
>
> Which still is in atomic because the hash bucket lock is held, ergo
> get_futex_value_locked() needs to stay for now.

Indeed, you are right.

> Same explanation as before just not prosa this time:
>
> exit()					lock_pi(futex2)
>   exit_pi_state_list()
>    lock(tsk->pi_lock)
>    tsk->flags |= PF_EXITPIDONE;		 attach_to_pi_owner()
> 					  ...
>   // Loop unrolled for clarity
>   while(!list_empty())			  lock(tsk->pi_lock);
>      cleanup(futex1)
>        unlock(tsk->pi_lock)
         ^^^^^^^^^^^^^^^^^^^^
Ah! Thanks.


Hmm. In particular, exit_pi_state() drops pi_lock if refcount_inc_not_zero() fails.

Isn't this another potential source of livelock ?

Suppose that a realtime lock owner X sleeps somewhere, another task T
calls put_pi_state(), refcount_dec_and_test() succeeds.

What if, say, X is killed right after that and preempts T on the same
CPU?

Oleg.


^ permalink raw reply	[flat|nested] 29+ messages in thread

end of thread, other threads:[~2019-11-07 15:51 UTC | newest]

Thread overview: 29+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-11-04  0:29 [RFC v2 PATCH] futex: extend set_robust_list to allow 2 locking ABIs at the same time Shawn Landden
2019-11-04  0:51 ` Shawn Landden
2019-11-04 15:37 ` Thomas Gleixner
2019-11-05  0:10   ` Thomas Gleixner
2019-11-05  9:48 ` Florian Weimer
2019-11-05  9:59   ` Thomas Gleixner
2019-11-05 10:06     ` Florian Weimer
2019-11-05 11:56       ` Thomas Gleixner
2019-11-05 14:10         ` Carlos O'Donell
2019-11-05 14:27           ` Florian Weimer
2019-11-05 14:53             ` Thomas Gleixner
2019-11-05 14:27           ` Thomas Gleixner
2019-11-05 14:33             ` Florian Weimer
2019-11-05 14:48               ` Thomas Gleixner
2019-11-06 14:00             ` Zack Weinberg
2019-11-06 14:04               ` Florian Weimer
2019-11-05 15:27     ` handle_exit_race && PF_EXITING Oleg Nesterov
2019-11-05 17:28       ` Thomas Gleixner
2019-11-05 17:59         ` Thomas Gleixner
2019-11-05 18:56           ` Thomas Gleixner
2019-11-05 19:19             ` Thomas Gleixner
2019-11-06  8:55               ` Oleg Nesterov
2019-11-06  9:53                 ` Thomas Gleixner
2019-11-06 10:35                   ` Oleg Nesterov
2019-11-06 11:07                     ` Thomas Gleixner
2019-11-06 12:11                       ` Oleg Nesterov
2019-11-06 13:38                         ` Thomas Gleixner
2019-11-06 17:42                         ` Thomas Gleixner
2019-11-07 15:51                           ` Oleg Nesterov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).