linux-api.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v10 1/3] x86/syscalls: Check address limit on user-mode return
@ 2017-06-15  1:12 Thomas Garnier
  2017-06-15  1:12 ` [PATCH v10 2/3] arm/syscalls: " Thomas Garnier
                   ` (2 more replies)
  0 siblings, 3 replies; 23+ messages in thread
From: Thomas Garnier @ 2017-06-15  1:12 UTC (permalink / raw)
  To: Thomas Gleixner, Ingo Molnar, H . Peter Anvin, Andy Lutomirski,
	Paolo Bonzini, Rik van Riel, Oleg Nesterov, Josh Poimboeuf,
	Thomas Garnier, Petr Mladek, Miroslav Benes, Kees Cook, Al Viro,
	Arnd Bergmann, Dave Hansen, David Howells, Russell King,
	Andy Lutomirski, Will Drewry, Will Deacon, Catalin Marinas,
	Mark Rutland, Pratyush Anand, Chris Metcalf
  Cc: x86, linux-kernel, linux-api, linux-arm-kernel, kernel-hardening

Ensure the address limit is a user-mode segment before returning to
user-mode. Otherwise a process can corrupt kernel-mode memory and elevate
privileges [1].

The set_fs function sets the TIF_FSCHECK flag to force a slow path on
return. In the slow path, the address limit is checked to be USER_DS if
needed.

The addr_limit_user_check function is added as a cross-architecture
function to check the address limit.

[1] https://bugs.chromium.org/p/project-zero/issues/detail?id=990

Signed-off-by: Thomas Garnier <thgarnie@google.com>
---
v10 redesigns the change to use work flags on set_fs as recommended by
Linus and agreed by others.

Based on next-20170609
---
 arch/x86/entry/common.c            |  3 +++
 arch/x86/include/asm/thread_info.h |  5 ++++-
 arch/x86/include/asm/uaccess.h     |  7 ++++++-
 include/linux/syscalls.h           | 16 ++++++++++++++++
 4 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index cdefcfdd9e63..03505ffbe1b6 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -23,6 +23,7 @@
 #include <linux/user-return-notifier.h>
 #include <linux/uprobes.h>
 #include <linux/livepatch.h>
+#include <linux/syscalls.h>
 
 #include <asm/desc.h>
 #include <asm/traps.h>
@@ -183,6 +184,8 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
 	struct thread_info *ti = current_thread_info();
 	u32 cached_flags;
 
+	addr_limit_user_check();
+
 	if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled()))
 		local_irq_disable();
 
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index e00e1bd6e7b3..5161da1a0fa0 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -98,6 +98,7 @@ struct thread_info {
 #define TIF_SYSCALL_TRACEPOINT	28	/* syscall tracepoint instrumentation */
 #define TIF_ADDR32		29	/* 32-bit address space on 64 bits */
 #define TIF_X32			30	/* 32-bit native x86-64 binary */
+#define TIF_FSCHECK		31	/* Check FS is USER_DS on return */
 
 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
@@ -122,6 +123,7 @@ struct thread_info {
 #define _TIF_SYSCALL_TRACEPOINT	(1 << TIF_SYSCALL_TRACEPOINT)
 #define _TIF_ADDR32		(1 << TIF_ADDR32)
 #define _TIF_X32		(1 << TIF_X32)
+#define _TIF_FSCHECK		(1 << TIF_FSCHECK)
 
 /*
  * work to do in syscall_trace_enter().  Also includes TIF_NOHZ for
@@ -137,7 +139,8 @@ struct thread_info {
 	(_TIF_SYSCALL_TRACE | _TIF_NOTIFY_RESUME | _TIF_SIGPENDING |	\
 	 _TIF_NEED_RESCHED | _TIF_SINGLESTEP | _TIF_SYSCALL_EMU |	\
 	 _TIF_SYSCALL_AUDIT | _TIF_USER_RETURN_NOTIFY | _TIF_UPROBE |	\
-	 _TIF_PATCH_PENDING | _TIF_NOHZ | _TIF_SYSCALL_TRACEPOINT)
+	 _TIF_PATCH_PENDING | _TIF_NOHZ | _TIF_SYSCALL_TRACEPOINT |	\
+	 _TIF_FSCHECK)
 
 /* flags to check in __switch_to() */
 #define _TIF_WORK_CTXSW							\
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index a059aac9e937..11433f9018e2 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -26,7 +26,12 @@
 
 #define get_ds()	(KERNEL_DS)
 #define get_fs()	(current->thread.addr_limit)
-#define set_fs(x)	(current->thread.addr_limit = (x))
+static inline void set_fs(mm_segment_t fs)
+{
+	current->thread.addr_limit = fs;
+	/* On user-mode return, check fs is correct */
+	set_thread_flag(TIF_FSCHECK);
+}
 
 #define segment_eq(a, b)	((a).seg == (b).seg)
 
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 980c3c9b06f8..ac0cf6fb25d6 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -206,6 +206,22 @@ extern struct trace_event_functions exit_syscall_print_funcs;
 	}								\
 	static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__))
 
+#ifdef TIF_FSCHECK
+/*
+ * Called before returning to user-mode. Returning to user-mode with an
+ * address limit other than USER_DS can allow kernel memory to be overwritten.
+ */
+static inline void addr_limit_user_check(void)
+{
+
+	if (!test_thread_flag(TIF_FSCHECK))
+		return;
+
+	BUG_ON(!segment_eq(get_fs(), USER_DS));
+	clear_thread_flag(TIF_FSCHECK);
+}
+#endif
+
 asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special,
 			       qid_t id, void __user *addr);
 asmlinkage long sys_time(time_t __user *tloc);
-- 
2.13.1.518.g3df882009-goog

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH v10 2/3] arm/syscalls: Check address limit on user-mode return
  2017-06-15  1:12 [PATCH v10 1/3] x86/syscalls: Check address limit on user-mode return Thomas Garnier
@ 2017-06-15  1:12 ` Thomas Garnier
       [not found]   ` <20170615011203.144108-2-thgarnie-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
  2017-06-15  1:12 ` [PATCH v10 3/3] arm64/syscalls: " Thomas Garnier
       [not found] ` <20170615011203.144108-1-thgarnie-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
  2 siblings, 1 reply; 23+ messages in thread
From: Thomas Garnier @ 2017-06-15  1:12 UTC (permalink / raw)
  To: Thomas Gleixner, Ingo Molnar, H . Peter Anvin, Andy Lutomirski,
	Paolo Bonzini, Rik van Riel, Oleg Nesterov, Josh Poimboeuf,
	Thomas Garnier, Petr Mladek, Miroslav Benes, Kees Cook, Al Viro,
	Arnd Bergmann, Dave Hansen, David Howells, Russell King,
	Andy Lutomirski, Will Drewry, Will Deacon, Catalin Marinas,
	Mark Rutland, Pratyush Anand, Chris Metcalf
  Cc: x86, linux-kernel, linux-api, linux-arm-kernel, kernel-hardening

Ensure the address limit is a user-mode segment before returning to
user-mode. Otherwise a process can corrupt kernel-mode memory and
elevate privileges [1].

The set_fs function sets the TIF_FSCHECK flag to force a slow path on
return. In the slow path, the address limit is checked to be USER_DS if
needed.

The TIF_FSCHECK flag is added to _TIF_WORK_MASK, shifting _TIF_SYSCALL_WORK
for arm instruction immediate support. The global work mask is too big
to be used in a single instruction, so adapt ret_fast_syscall.

[1] https://bugs.chromium.org/p/project-zero/issues/detail?id=990

Signed-off-by: Thomas Garnier <thgarnie@google.com>
---
v10 redesigns the change to use work flags on set_fs as recommended by
Linus and agreed by others.

Based on next-20170609
---
 arch/arm/include/asm/thread_info.h | 15 +++++++++------
 arch/arm/include/asm/uaccess.h     |  2 ++
 arch/arm/kernel/entry-common.S     |  9 +++++++--
 arch/arm/kernel/signal.c           |  5 +++++
 4 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h
index 776757d1604a..1d468b527b7b 100644
--- a/arch/arm/include/asm/thread_info.h
+++ b/arch/arm/include/asm/thread_info.h
@@ -139,10 +139,11 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
 #define TIF_NEED_RESCHED	1	/* rescheduling necessary */
 #define TIF_NOTIFY_RESUME	2	/* callback before returning to user */
 #define TIF_UPROBE		3	/* breakpointed or singlestepping */
-#define TIF_SYSCALL_TRACE	4	/* syscall trace active */
-#define TIF_SYSCALL_AUDIT	5	/* syscall auditing active */
-#define TIF_SYSCALL_TRACEPOINT	6	/* syscall tracepoint instrumentation */
-#define TIF_SECCOMP		7	/* seccomp syscall filtering active */
+#define TIF_FSCHECK		4	/* Check FS is USER_DS on return */
+#define TIF_SYSCALL_TRACE	5	/* syscall trace active */
+#define TIF_SYSCALL_AUDIT	6	/* syscall auditing active */
+#define TIF_SYSCALL_TRACEPOINT	7	/* syscall tracepoint instrumentation */
+#define TIF_SECCOMP		8	/* seccomp syscall filtering active */
 
 #define TIF_NOHZ		12	/* in adaptive nohz mode */
 #define TIF_USING_IWMMXT	17
@@ -153,6 +154,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
 #define _TIF_NEED_RESCHED	(1 << TIF_NEED_RESCHED)
 #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
 #define _TIF_UPROBE		(1 << TIF_UPROBE)
+#define _TIF_FSCHECK		(1 << TIF_FSCHECK)
 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
 #define _TIF_SYSCALL_AUDIT	(1 << TIF_SYSCALL_AUDIT)
 #define _TIF_SYSCALL_TRACEPOINT	(1 << TIF_SYSCALL_TRACEPOINT)
@@ -166,8 +168,9 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
 /*
  * Change these and you break ASM code in entry-common.S
  */
-#define _TIF_WORK_MASK		(_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
-				 _TIF_NOTIFY_RESUME | _TIF_UPROBE)
+#define _TIF_WORK_MASK		(_TIF_NEED_RESCHED | _TIF_SIGPENDING |	\
+				 _TIF_NOTIFY_RESUME | _TIF_UPROBE |	\
+				 _TIF_FSCHECK)
 
 #endif /* __KERNEL__ */
 #endif /* __ASM_ARM_THREAD_INFO_H */
diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h
index 2577405d082d..6cc882223e34 100644
--- a/arch/arm/include/asm/uaccess.h
+++ b/arch/arm/include/asm/uaccess.h
@@ -77,6 +77,8 @@ static inline void set_fs(mm_segment_t fs)
 {
 	current_thread_info()->addr_limit = fs;
 	modify_domain(DOMAIN_KERNEL, fs ? DOMAIN_CLIENT : DOMAIN_MANAGER);
+	/* On user-mode return, check fs is correct */
+	set_thread_flag(TIF_FSCHECK);
 }
 
 #define segment_eq(a, b)	((a) == (b))
diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
index eb5cd77bf1d8..e33c32d56193 100644
--- a/arch/arm/kernel/entry-common.S
+++ b/arch/arm/kernel/entry-common.S
@@ -41,7 +41,9 @@ ret_fast_syscall:
  UNWIND(.cantunwind	)
 	disable_irq_notrace			@ disable interrupts
 	ldr	r1, [tsk, #TI_FLAGS]		@ re-check for syscall tracing
-	tst	r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
+	tst	r1, #_TIF_SYSCALL_WORK
+	bne	fast_work_pending
+	tst	r1, #_TIF_WORK_MASK
 	bne	fast_work_pending
 
 	/* perform architecture specific actions before user return */
@@ -67,12 +69,15 @@ ret_fast_syscall:
 	str	r0, [sp, #S_R0 + S_OFF]!	@ save returned r0
 	disable_irq_notrace			@ disable interrupts
 	ldr	r1, [tsk, #TI_FLAGS]		@ re-check for syscall tracing
-	tst	r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
+	tst	r1, #_TIF_SYSCALL_WORK
+	bne	fast_work_pending
+	tst	r1, #_TIF_WORK_MASK
 	beq	no_work_pending
  UNWIND(.fnend		)
 ENDPROC(ret_fast_syscall)
 
 	/* Slower path - fall through to work_pending */
+fast_work_pending:
 #endif
 
 	tst	r1, #_TIF_SYSCALL_WORK
diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
index 7b8f2141427b..3a48b54c6405 100644
--- a/arch/arm/kernel/signal.c
+++ b/arch/arm/kernel/signal.c
@@ -14,6 +14,7 @@
 #include <linux/uaccess.h>
 #include <linux/tracehook.h>
 #include <linux/uprobes.h>
+#include <linux/syscalls.h>
 
 #include <asm/elf.h>
 #include <asm/cacheflush.h>
@@ -571,6 +572,10 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
 	 * Update the trace code with the current status.
 	 */
 	trace_hardirqs_off();
+
+	/* Check valid user FS if needed */
+	addr_limit_user_check();
+
 	do {
 		if (likely(thread_flags & _TIF_NEED_RESCHED)) {
 			schedule();
-- 
2.13.1.518.g3df882009-goog

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH v10 3/3] arm64/syscalls: Check address limit on user-mode return
  2017-06-15  1:12 [PATCH v10 1/3] x86/syscalls: Check address limit on user-mode return Thomas Garnier
  2017-06-15  1:12 ` [PATCH v10 2/3] arm/syscalls: " Thomas Garnier
@ 2017-06-15  1:12 ` Thomas Garnier
       [not found]   ` <20170615011203.144108-3-thgarnie-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
       [not found] ` <20170615011203.144108-1-thgarnie-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
  2 siblings, 1 reply; 23+ messages in thread
From: Thomas Garnier @ 2017-06-15  1:12 UTC (permalink / raw)
  To: Thomas Gleixner, Ingo Molnar, H . Peter Anvin, Andy Lutomirski,
	Paolo Bonzini, Rik van Riel, Oleg Nesterov, Josh Poimboeuf,
	Thomas Garnier, Petr Mladek, Miroslav Benes, Kees Cook, Al Viro,
	Arnd Bergmann, Dave Hansen, David Howells, Russell King,
	Andy Lutomirski, Will Drewry, Will Deacon, Catalin Marinas,
	Mark Rutland, Pratyush Anand, Chris Metcalf
  Cc: x86, linux-kernel, linux-api, linux-arm-kernel, kernel-hardening

Ensure the address limit is a user-mode segment before returning to
user-mode. Otherwise a process can corrupt kernel-mode memory and
elevate privileges [1].

The set_fs function sets the TIF_FSCHECK flag to force a slow path on
return. In the slow path, the address limit is checked to be USER_DS if
needed.

[1] https://bugs.chromium.org/p/project-zero/issues/detail?id=990

Signed-off-by: Thomas Garnier <thgarnie@google.com>
---
v10 redesigns the change to use work flags on set_fs as recommended by
Linus and agreed by others.

Based on next-20170609
---
 arch/arm64/include/asm/thread_info.h | 4 +++-
 arch/arm64/include/asm/uaccess.h     | 3 +++
 arch/arm64/kernel/signal.c           | 5 +++++
 3 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
index 46c3b93cf865..c5ba565544ee 100644
--- a/arch/arm64/include/asm/thread_info.h
+++ b/arch/arm64/include/asm/thread_info.h
@@ -86,6 +86,7 @@ struct thread_info {
 #define TIF_NOTIFY_RESUME	2	/* callback before returning to user */
 #define TIF_FOREIGN_FPSTATE	3	/* CPU's FP state is not current's */
 #define TIF_UPROBE		4	/* uprobe breakpoint or singlestep */
+#define TIF_FSCHECK		5	/* Check FS is USER_DS on return */
 #define TIF_NOHZ		7
 #define TIF_SYSCALL_TRACE	8
 #define TIF_SYSCALL_AUDIT	9
@@ -107,11 +108,12 @@ struct thread_info {
 #define _TIF_SYSCALL_TRACEPOINT	(1 << TIF_SYSCALL_TRACEPOINT)
 #define _TIF_SECCOMP		(1 << TIF_SECCOMP)
 #define _TIF_UPROBE		(1 << TIF_UPROBE)
+#define _TIF_FSCHECK		(1 << TIF_FSCHECK)
 #define _TIF_32BIT		(1 << TIF_32BIT)
 
 #define _TIF_WORK_MASK		(_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
 				 _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \
-				 _TIF_UPROBE)
+				 _TIF_UPROBE | _TIF_FSCHECK)
 
 #define _TIF_SYSCALL_WORK	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
 				 _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \
diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h
index 7b8a04789cef..ced7a7c2dd41 100644
--- a/arch/arm64/include/asm/uaccess.h
+++ b/arch/arm64/include/asm/uaccess.h
@@ -45,6 +45,9 @@ static inline void set_fs(mm_segment_t fs)
 {
 	current_thread_info()->addr_limit = fs;
 
+	/* On user-mode return, check fs is correct */
+	set_thread_flag(TIF_FSCHECK);
+
 	/*
 	 * Enable/disable UAO so that copy_to_user() etc can access
 	 * kernel memory with the unprivileged instructions.
diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
index c7b6de62f9d3..0f0279148bdc 100644
--- a/arch/arm64/kernel/signal.c
+++ b/arch/arm64/kernel/signal.c
@@ -25,6 +25,7 @@
 #include <linux/uaccess.h>
 #include <linux/tracehook.h>
 #include <linux/ratelimit.h>
+#include <linux/syscalls.h>
 
 #include <asm/debug-monitors.h>
 #include <asm/elf.h>
@@ -408,6 +409,10 @@ asmlinkage void do_notify_resume(struct pt_regs *regs,
 	 * Update the trace code with the current status.
 	 */
 	trace_hardirqs_off();
+
+	/* Check valid user FS if needed */
+	addr_limit_user_check();
+
 	do {
 		if (thread_flags & _TIF_NEED_RESCHED) {
 			schedule();
-- 
2.13.1.518.g3df882009-goog

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [PATCH v10 2/3] arm/syscalls: Check address limit on user-mode return
       [not found]   ` <20170615011203.144108-2-thgarnie-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
@ 2017-06-20 20:18     ` Kees Cook
       [not found]       ` <CAGXu5jLR7io8u-M8tqbYW22C+sb2a2wSYLRBqJ_dguT4x+1tsQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  2017-07-18 14:36     ` Leonard Crestez
  1 sibling, 1 reply; 23+ messages in thread
From: Kees Cook @ 2017-06-20 20:18 UTC (permalink / raw)
  To: Thomas Garnier
  Cc: Thomas Gleixner, Ingo Molnar, H . Peter Anvin, Andy Lutomirski,
	Paolo Bonzini, Rik van Riel, Oleg Nesterov, Josh Poimboeuf,
	Petr Mladek, Miroslav Benes, Al Viro, Arnd Bergmann, Dave Hansen,
	David Howells, Russell King, Andy Lutomirski, Will Drewry,
	Will Deacon, Catalin Marinas, Mark Rutland, Pratyush Anand

On Wed, Jun 14, 2017 at 6:12 PM, Thomas Garnier <thgarnie-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> wrote:
> Ensure the address limit is a user-mode segment before returning to
> user-mode. Otherwise a process can corrupt kernel-mode memory and
> elevate privileges [1].
>
> The set_fs function sets the TIF_SETFS flag to force a slow path on
> return. In the slow path, the address limit is checked to be USER_DS if
> needed.
>
> The TIF_SETFS flag is added to _TIF_WORK_MASK shifting _TIF_SYSCALL_WORK
> for arm instruction immediate support. The global work mask is too big
> to used on a single instruction so adapt ret_fast_syscall.
>
> [1] https://bugs.chromium.org/p/project-zero/issues/detail?id=990
>
> Signed-off-by: Thomas Garnier <thgarnie-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
> ---
> v10 redesigns the change to use work flags on set_fs as recommended by
> Linus and agreed by others.
>
> Based on next-20170609
> ---
>  arch/arm/include/asm/thread_info.h | 15 +++++++++------
>  arch/arm/include/asm/uaccess.h     |  2 ++
>  arch/arm/kernel/entry-common.S     |  9 +++++++--
>  arch/arm/kernel/signal.c           |  5 +++++
>  4 files changed, 23 insertions(+), 8 deletions(-)
>
> diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h
> index 776757d1604a..1d468b527b7b 100644
> --- a/arch/arm/include/asm/thread_info.h
> +++ b/arch/arm/include/asm/thread_info.h
> @@ -139,10 +139,11 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
>  #define TIF_NEED_RESCHED       1       /* rescheduling necessary */
>  #define TIF_NOTIFY_RESUME      2       /* callback before returning to user */
>  #define TIF_UPROBE             3       /* breakpointed or singlestepping */
> -#define TIF_SYSCALL_TRACE      4       /* syscall trace active */
> -#define TIF_SYSCALL_AUDIT      5       /* syscall auditing active */
> -#define TIF_SYSCALL_TRACEPOINT 6       /* syscall tracepoint instrumentation */
> -#define TIF_SECCOMP            7       /* seccomp syscall filtering active */
> +#define TIF_FSCHECK            4       /* Check FS is USER_DS on return */
> +#define TIF_SYSCALL_TRACE      5       /* syscall trace active */
> +#define TIF_SYSCALL_AUDIT      6       /* syscall auditing active */
> +#define TIF_SYSCALL_TRACEPOINT 7       /* syscall tracepoint instrumentation */
> +#define TIF_SECCOMP            8       /* seccomp syscall filtering active */
>
>  #define TIF_NOHZ               12      /* in adaptive nohz mode */
>  #define TIF_USING_IWMMXT       17
> @@ -153,6 +154,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
>  #define _TIF_NEED_RESCHED      (1 << TIF_NEED_RESCHED)
>  #define _TIF_NOTIFY_RESUME     (1 << TIF_NOTIFY_RESUME)
>  #define _TIF_UPROBE            (1 << TIF_UPROBE)
> +#define _TIF_FSCHECK           (1 << TIF_FSCHECK)
>  #define _TIF_SYSCALL_TRACE     (1 << TIF_SYSCALL_TRACE)
>  #define _TIF_SYSCALL_AUDIT     (1 << TIF_SYSCALL_AUDIT)
>  #define _TIF_SYSCALL_TRACEPOINT        (1 << TIF_SYSCALL_TRACEPOINT)
> @@ -166,8 +168,9 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
>  /*
>   * Change these and you break ASM code in entry-common.S
>   */
> -#define _TIF_WORK_MASK         (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
> -                                _TIF_NOTIFY_RESUME | _TIF_UPROBE)
> +#define _TIF_WORK_MASK         (_TIF_NEED_RESCHED | _TIF_SIGPENDING |  \
> +                                _TIF_NOTIFY_RESUME | _TIF_UPROBE |     \
> +                                _TIF_FSCHECK)
>
>  #endif /* __KERNEL__ */
>  #endif /* __ASM_ARM_THREAD_INFO_H */
> diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h
> index 2577405d082d..6cc882223e34 100644
> --- a/arch/arm/include/asm/uaccess.h
> +++ b/arch/arm/include/asm/uaccess.h
> @@ -77,6 +77,8 @@ static inline void set_fs(mm_segment_t fs)
>  {
>         current_thread_info()->addr_limit = fs;
>         modify_domain(DOMAIN_KERNEL, fs ? DOMAIN_CLIENT : DOMAIN_MANAGER);
> +       /* On user-mode return, check fs is correct */
> +       set_thread_flag(TIF_FSCHECK);
>  }
>
>  #define segment_eq(a, b)       ((a) == (b))
> diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
> index eb5cd77bf1d8..e33c32d56193 100644
> --- a/arch/arm/kernel/entry-common.S
> +++ b/arch/arm/kernel/entry-common.S
> @@ -41,7 +41,9 @@ ret_fast_syscall:
>   UNWIND(.cantunwind    )
>         disable_irq_notrace                     @ disable interrupts
>         ldr     r1, [tsk, #TI_FLAGS]            @ re-check for syscall tracing
> -       tst     r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
> +       tst     r1, #_TIF_SYSCALL_WORK
> +       bne     fast_work_pending
> +       tst     r1, #_TIF_WORK_MASK

(IIUC) MOV32 is 2 cycles (MOVW, MOVT), and each TST above is 1 cycle
and each BNE is 1 cycle (when not taken). So:

mov32 r2, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
tst r1, r2
bne fast_work_pending

is 4 cycles and tst, bne, tst, bne is also 4 cycles. Would mov32 be
more readable (since it keeps the flags together)?

-Kees

-- 
Kees Cook
Pixel Security

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v10 1/3] x86/syscalls: Check address limit on user-mode return
       [not found] ` <20170615011203.144108-1-thgarnie-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
@ 2017-06-20 20:24   ` Kees Cook
  2017-06-28 17:52     ` Kees Cook
  0 siblings, 1 reply; 23+ messages in thread
From: Kees Cook @ 2017-06-20 20:24 UTC (permalink / raw)
  To: Thomas Garnier
  Cc: Thomas Gleixner, Ingo Molnar, H . Peter Anvin, Andy Lutomirski,
	Paolo Bonzini, Rik van Riel, Oleg Nesterov, Josh Poimboeuf,
	Petr Mladek, Miroslav Benes, Al Viro, Arnd Bergmann, Dave Hansen,
	David Howells, Russell King, Andy Lutomirski, Will Drewry,
	Will Deacon, Catalin Marinas, Mark Rutland, Pratyush Anand

On Wed, Jun 14, 2017 at 6:12 PM, Thomas Garnier <thgarnie-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> wrote:
> Ensure the address limit is a user-mode segment before returning to
> user-mode. Otherwise a process can corrupt kernel-mode memory and elevate
> privileges [1].
>
> The set_fs function sets the TIF_SETFS flag to force a slow path on
> return. In the slow path, the address limit is checked to be USER_DS if
> needed.
>
> The addr_limit_user_check function is added as a cross-architecture
> function to check the address limit.
>
> [1] https://bugs.chromium.org/p/project-zero/issues/detail?id=990
>
> Signed-off-by: Thomas Garnier <thgarnie-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>

Thanks for reworking this series!

The bad state correctly BUGs under the LKDTM test:

[   21.171586] lkdtm: Performing direct entry CORRUPT_USER_DS
[   21.172791] lkdtm: setting bad task size limit
[   21.173742] ------------[ cut here ]------------
[   21.174641] kernel BUG at ./include/linux/syscalls.h:220!
...
[   21.193166] Call Trace:
[   21.193617]  ? trace_hardirqs_on_thunk+0x1a/0x1c
[   21.194443]  entry_SYSCALL64_slow_path+0x25/0x25


Tested-by: Kees Cook <keescook-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org>

-Kees

-- 
Kees Cook
Pixel Security

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v10 2/3] arm/syscalls: Check address limit on user-mode return
       [not found]       ` <CAGXu5jLR7io8u-M8tqbYW22C+sb2a2wSYLRBqJ_dguT4x+1tsQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2017-06-20 20:31         ` Thomas Garnier
  2017-06-21  9:08           ` Will Deacon
  0 siblings, 1 reply; 23+ messages in thread
From: Thomas Garnier @ 2017-06-20 20:31 UTC (permalink / raw)
  To: Kees Cook
  Cc: Thomas Gleixner, Ingo Molnar, H . Peter Anvin, Andy Lutomirski,
	Paolo Bonzini, Rik van Riel, Oleg Nesterov, Josh Poimboeuf,
	Petr Mladek, Miroslav Benes, Al Viro, Arnd Bergmann, Dave Hansen,
	David Howells, Russell King, Andy Lutomirski, Will Drewry,
	Will Deacon, Catalin Marinas, Mark Rutland, Pratyush Anand

On Tue, Jun 20, 2017 at 1:18 PM, Kees Cook <keescook-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org> wrote:
> On Wed, Jun 14, 2017 at 6:12 PM, Thomas Garnier <thgarnie-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> wrote:
>> Ensure the address limit is a user-mode segment before returning to
>> user-mode. Otherwise a process can corrupt kernel-mode memory and
>> elevate privileges [1].
>>
>> The set_fs function sets the TIF_SETFS flag to force a slow path on
>> return. In the slow path, the address limit is checked to be USER_DS if
>> needed.
>>
>> The TIF_SETFS flag is added to _TIF_WORK_MASK shifting _TIF_SYSCALL_WORK
>> for arm instruction immediate support. The global work mask is too big
>> to used on a single instruction so adapt ret_fast_syscall.
>>
>> [1] https://bugs.chromium.org/p/project-zero/issues/detail?id=990
>>
>> Signed-off-by: Thomas Garnier <thgarnie-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
>> ---
>> v10 redesigns the change to use work flags on set_fs as recommended by
>> Linus and agreed by others.
>>
>> Based on next-20170609
>> ---
>>  arch/arm/include/asm/thread_info.h | 15 +++++++++------
>>  arch/arm/include/asm/uaccess.h     |  2 ++
>>  arch/arm/kernel/entry-common.S     |  9 +++++++--
>>  arch/arm/kernel/signal.c           |  5 +++++
>>  4 files changed, 23 insertions(+), 8 deletions(-)
>>
>> diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h
>> index 776757d1604a..1d468b527b7b 100644
>> --- a/arch/arm/include/asm/thread_info.h
>> +++ b/arch/arm/include/asm/thread_info.h
>> @@ -139,10 +139,11 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
>>  #define TIF_NEED_RESCHED       1       /* rescheduling necessary */
>>  #define TIF_NOTIFY_RESUME      2       /* callback before returning to user */
>>  #define TIF_UPROBE             3       /* breakpointed or singlestepping */
>> -#define TIF_SYSCALL_TRACE      4       /* syscall trace active */
>> -#define TIF_SYSCALL_AUDIT      5       /* syscall auditing active */
>> -#define TIF_SYSCALL_TRACEPOINT 6       /* syscall tracepoint instrumentation */
>> -#define TIF_SECCOMP            7       /* seccomp syscall filtering active */
>> +#define TIF_FSCHECK            4       /* Check FS is USER_DS on return */
>> +#define TIF_SYSCALL_TRACE      5       /* syscall trace active */
>> +#define TIF_SYSCALL_AUDIT      6       /* syscall auditing active */
>> +#define TIF_SYSCALL_TRACEPOINT 7       /* syscall tracepoint instrumentation */
>> +#define TIF_SECCOMP            8       /* seccomp syscall filtering active */
>>
>>  #define TIF_NOHZ               12      /* in adaptive nohz mode */
>>  #define TIF_USING_IWMMXT       17
>> @@ -153,6 +154,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
>>  #define _TIF_NEED_RESCHED      (1 << TIF_NEED_RESCHED)
>>  #define _TIF_NOTIFY_RESUME     (1 << TIF_NOTIFY_RESUME)
>>  #define _TIF_UPROBE            (1 << TIF_UPROBE)
>> +#define _TIF_FSCHECK           (1 << TIF_FSCHECK)
>>  #define _TIF_SYSCALL_TRACE     (1 << TIF_SYSCALL_TRACE)
>>  #define _TIF_SYSCALL_AUDIT     (1 << TIF_SYSCALL_AUDIT)
>>  #define _TIF_SYSCALL_TRACEPOINT        (1 << TIF_SYSCALL_TRACEPOINT)
>> @@ -166,8 +168,9 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
>>  /*
>>   * Change these and you break ASM code in entry-common.S
>>   */
>> -#define _TIF_WORK_MASK         (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
>> -                                _TIF_NOTIFY_RESUME | _TIF_UPROBE)
>> +#define _TIF_WORK_MASK         (_TIF_NEED_RESCHED | _TIF_SIGPENDING |  \
>> +                                _TIF_NOTIFY_RESUME | _TIF_UPROBE |     \
>> +                                _TIF_FSCHECK)
>>
>>  #endif /* __KERNEL__ */
>>  #endif /* __ASM_ARM_THREAD_INFO_H */
>> diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h
>> index 2577405d082d..6cc882223e34 100644
>> --- a/arch/arm/include/asm/uaccess.h
>> +++ b/arch/arm/include/asm/uaccess.h
>> @@ -77,6 +77,8 @@ static inline void set_fs(mm_segment_t fs)
>>  {
>>         current_thread_info()->addr_limit = fs;
>>         modify_domain(DOMAIN_KERNEL, fs ? DOMAIN_CLIENT : DOMAIN_MANAGER);
>> +       /* On user-mode return, check fs is correct */
>> +       set_thread_flag(TIF_FSCHECK);
>>  }
>>
>>  #define segment_eq(a, b)       ((a) == (b))
>> diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
>> index eb5cd77bf1d8..e33c32d56193 100644
>> --- a/arch/arm/kernel/entry-common.S
>> +++ b/arch/arm/kernel/entry-common.S
>> @@ -41,7 +41,9 @@ ret_fast_syscall:
>>   UNWIND(.cantunwind    )
>>         disable_irq_notrace                     @ disable interrupts
>>         ldr     r1, [tsk, #TI_FLAGS]            @ re-check for syscall tracing
>> -       tst     r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
>> +       tst     r1, #_TIF_SYSCALL_WORK
>> +       bne     fast_work_pending
>> +       tst     r1, #_TIF_WORK_MASK
>
> (IIUC) MOV32 is 2 cycles (MOVW, MOVT), and each TST above is 1 cycle
> and each BNE is 1 cycle (when not taken). So:
>
> mov32 r2, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
> tst r1, r2
> bne fast_work_pending
>
> is 4 cycles and tst, bne, tst, bne is also 4 cycles. Would mov32 be
> more readable (since it keeps the flags together)?

I guess it would be more readable. Any opinion from the arm folks?

>
> -Kees
>
> --
> Kees Cook
> Pixel Security



-- 
Thomas

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v10 3/3] arm64/syscalls: Check address limit on user-mode return
       [not found]   ` <20170615011203.144108-3-thgarnie-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
@ 2017-06-21  8:16     ` Catalin Marinas
  2017-06-21 13:57       ` Thomas Garnier
  0 siblings, 1 reply; 23+ messages in thread
From: Catalin Marinas @ 2017-06-21  8:16 UTC (permalink / raw)
  To: Thomas Garnier
  Cc: Thomas Gleixner, Ingo Molnar, H . Peter Anvin, Andy Lutomirski,
	Paolo Bonzini, Rik van Riel, Oleg Nesterov, Josh Poimboeuf,
	Petr Mladek, Miroslav Benes, Kees Cook, Al Viro, Arnd Bergmann,
	Dave Hansen, David Howells, Russell King, Andy Lutomirski,
	Will Drewry, Will Deacon, Mark Rutland, Pratyush Anand

On Wed, Jun 14, 2017 at 06:12:03PM -0700, Thomas Garnier wrote:
> Ensure the address limit is a user-mode segment before returning to
> user-mode. Otherwise a process can corrupt kernel-mode memory and
> elevate privileges [1].
> 
> The set_fs function sets the TIF_SETFS flag to force a slow path on
> return. In the slow path, the address limit is checked to be USER_DS if
> needed.
> 
> [1] https://bugs.chromium.org/p/project-zero/issues/detail?id=990
> 
> Signed-off-by: Thomas Garnier <thgarnie-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
> ---
> v10 redesigns the change to use work flags on set_fs as recommended by
> Linus and agreed by others.
> 
> Based on next-20170609
> ---
>  arch/arm64/include/asm/thread_info.h | 4 +++-
>  arch/arm64/include/asm/uaccess.h     | 3 +++
>  arch/arm64/kernel/signal.c           | 5 +++++
>  3 files changed, 11 insertions(+), 1 deletion(-)

For arm64:

Reviewed-by: Catalin Marinas <catalin.marinas-5wv7dgnIgG8@public.gmane.org>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v10 2/3] arm/syscalls: Check address limit on user-mode return
  2017-06-20 20:31         ` Thomas Garnier
@ 2017-06-21  9:08           ` Will Deacon
  0 siblings, 0 replies; 23+ messages in thread
From: Will Deacon @ 2017-06-21  9:08 UTC (permalink / raw)
  To: Thomas Garnier
  Cc: Kees Cook, Thomas Gleixner, Ingo Molnar, H . Peter Anvin,
	Andy Lutomirski, Paolo Bonzini, Rik van Riel, Oleg Nesterov,
	Josh Poimboeuf, Petr Mladek, Miroslav Benes, Al Viro,
	Arnd Bergmann, Dave Hansen, David Howells, Russell King,
	Andy Lutomirski, Will Drewry, Catalin Marinas, Mark Rutland,
	Pratyush Anand, Chris Metcalf, x86, LKML, Linux API

On Tue, Jun 20, 2017 at 01:31:14PM -0700, Thomas Garnier wrote:
> On Tue, Jun 20, 2017 at 1:18 PM, Kees Cook <keescook@chromium.org> wrote:
> > On Wed, Jun 14, 2017 at 6:12 PM, Thomas Garnier <thgarnie@google.com> wrote:
> >> diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
> >> index eb5cd77bf1d8..e33c32d56193 100644
> >> --- a/arch/arm/kernel/entry-common.S
> >> +++ b/arch/arm/kernel/entry-common.S
> >> @@ -41,7 +41,9 @@ ret_fast_syscall:
> >>   UNWIND(.cantunwind    )
> >>         disable_irq_notrace                     @ disable interrupts
> >>         ldr     r1, [tsk, #TI_FLAGS]            @ re-check for syscall tracing
> >> -       tst     r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
> >> +       tst     r1, #_TIF_SYSCALL_WORK
> >> +       bne     fast_work_pending
> >> +       tst     r1, #_TIF_WORK_MASK
> >
> > (IIUC) MOV32 is 2 cycles (MOVW, MOVT), and each TST above is 1 cycle
> > and each BNE is 1 cycle (when not taken). So:
> >
> > mov32 r2, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
> > tst r1, r2
> > bne fast_work_pending
> >
> > is 4 cycles and tst, bne, tst, bne is also 4 cycles. Would mov32 be
> > more readable (since it keeps the flags together)?
> 
> I guess it would be more readable. Any opinion from the arm folks?

The mov32 sequence is probably better, but statically attributing cycles
on a per instruction basis is pretty futile on modern CPUs.

Will

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v10 3/3] arm64/syscalls: Check address limit on user-mode return
  2017-06-21  8:16     ` Catalin Marinas
@ 2017-06-21 13:57       ` Thomas Garnier
  0 siblings, 0 replies; 23+ messages in thread
From: Thomas Garnier @ 2017-06-21 13:57 UTC (permalink / raw)
  To: Catalin Marinas
  Cc: Thomas Gleixner, Ingo Molnar, H . Peter Anvin, Andy Lutomirski,
	Paolo Bonzini, Rik van Riel, Oleg Nesterov, Josh Poimboeuf,
	Petr Mladek, Miroslav Benes, Kees Cook, Al Viro, Arnd Bergmann,
	Dave Hansen, David Howells, Russell King, Andy Lutomirski,
	Will Drewry, Will Deacon, Mark Rutland, Pratyush Anand

On Wed, Jun 21, 2017 at 1:16 AM, Catalin Marinas
<catalin.marinas@arm.com> wrote:
> On Wed, Jun 14, 2017 at 06:12:03PM -0700, Thomas Garnier wrote:
>> Ensure the address limit is a user-mode segment before returning to
>> user-mode. Otherwise a process can corrupt kernel-mode memory and
>> elevate privileges [1].
>>
>> The set_fs function sets the TIF_SETFS flag to force a slow path on
>> return. In the slow path, the address limit is checked to be USER_DS if
>> needed.
>>
>> [1] https://bugs.chromium.org/p/project-zero/issues/detail?id=990
>>
>> Signed-off-by: Thomas Garnier <thgarnie@google.com>
>> ---
>> v10 redesigns the change to use work flags on set_fs as recommended by
>> Linus and agreed by others.
>>
>> Based on next-20170609
>> ---
>>  arch/arm64/include/asm/thread_info.h | 4 +++-
>>  arch/arm64/include/asm/uaccess.h     | 3 +++
>>  arch/arm64/kernel/signal.c           | 5 +++++
>>  3 files changed, 11 insertions(+), 1 deletion(-)
>
> For arm64:
>
> Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>

Thanks Catalin

-- 
Thomas

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v10 1/3] x86/syscalls: Check address limit on user-mode return
  2017-06-20 20:24   ` [PATCH v10 1/3] x86/syscalls: " Kees Cook
@ 2017-06-28 17:52     ` Kees Cook
       [not found]       ` <CAGXu5jKrJv0y70e5JiafKGcGzWoJPZM_HruZ=Y0rM1m0J4tZAA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  0 siblings, 1 reply; 23+ messages in thread
From: Kees Cook @ 2017-06-28 17:52 UTC (permalink / raw)
  To: Thomas Garnier, Ingo Molnar
  Cc: Thomas Gleixner, H . Peter Anvin, Andy Lutomirski, Paolo Bonzini,
	Rik van Riel, Oleg Nesterov, Josh Poimboeuf, Petr Mladek,
	Miroslav Benes, Al Viro, Arnd Bergmann, Dave Hansen,
	David Howells, Russell King, Andy Lutomirski, Will Drewry,
	Will Deacon, Catalin Marinas, Mark Rutland, Pratyush Anand,
	Chris Metcalf, x86, LKML, Linux API, linux-arm-

On Tue, Jun 20, 2017 at 1:24 PM, Kees Cook <keescook@chromium.org> wrote:
> On Wed, Jun 14, 2017 at 6:12 PM, Thomas Garnier <thgarnie@google.com> wrote:
>> Ensure the address limit is a user-mode segment before returning to
>> user-mode. Otherwise a process can corrupt kernel-mode memory and elevate
>> privileges [1].
>>
>> The set_fs function sets the TIF_SETFS flag to force a slow path on
>> return. In the slow path, the address limit is checked to be USER_DS if
>> needed.
>>
>> The addr_limit_user_check function is added as a cross-architecture
>> function to check the address limit.
>>
>> [1] https://bugs.chromium.org/p/project-zero/issues/detail?id=990
>>
>> Signed-off-by: Thomas Garnier <thgarnie@google.com>
>
> Thanks for reworking this series!
>
> The bad state correctly BUGs under the LKDTM test:
>
> [   21.171586] lkdtm: Performing direct entry CORRUPT_USER_DS
> [   21.172791] lkdtm: setting bad task size limit
> [   21.173742] ------------[ cut here ]------------
> [   21.174641] kernel BUG at ./include/linux/syscalls.h:220!
> ...
> [   21.193166] Call Trace:
> [   21.193617]  ? trace_hardirqs_on_thunk+0x1a/0x1c
> [   21.194443]  entry_SYSCALL64_slow_path+0x25/0x25
>
>
> Tested-by: Kees Cook <keescook@chromium.org>

Is everyone happy with this patch for x86? Does this need anything
more/different?

Thanks!

-Kees

-- 
Kees Cook
Pixel Security

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v10 1/3] x86/syscalls: Check address limit on user-mode return
       [not found]       ` <CAGXu5jKrJv0y70e5JiafKGcGzWoJPZM_HruZ=Y0rM1m0J4tZAA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2017-07-06 20:38         ` Thomas Garnier
       [not found]           ` <CAJcbSZE6Og4gwhFwhy_-Jaq6GovwN3y1B6O89JmkpXHtVfDLBA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  0 siblings, 1 reply; 23+ messages in thread
From: Thomas Garnier @ 2017-07-06 20:38 UTC (permalink / raw)
  To: Kees Cook
  Cc: Ingo Molnar, Thomas Gleixner, H . Peter Anvin, Andy Lutomirski,
	Paolo Bonzini, Rik van Riel, Oleg Nesterov, Josh Poimboeuf,
	Petr Mladek, Miroslav Benes, Al Viro, Arnd Bergmann, Dave Hansen,
	David Howells, Russell King, Andy Lutomirski, Will Drewry,
	Will Deacon, Catalin Marinas, Mark Rutland, Pratyush Anand

On Wed, Jun 28, 2017 at 10:52 AM, Kees Cook <keescook-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org> wrote:
>
> On Tue, Jun 20, 2017 at 1:24 PM, Kees Cook <keescook-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org> wrote:
> > On Wed, Jun 14, 2017 at 6:12 PM, Thomas Garnier <thgarnie-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> wrote:
> >> Ensure the address limit is a user-mode segment before returning to
> >> user-mode. Otherwise a process can corrupt kernel-mode memory and elevate
> >> privileges [1].
> >>
> >> The set_fs function sets the TIF_SETFS flag to force a slow path on
> >> return. In the slow path, the address limit is checked to be USER_DS if
> >> needed.
> >>
> >> The addr_limit_user_check function is added as a cross-architecture
> >> function to check the address limit.
> >>
> >> [1] https://bugs.chromium.org/p/project-zero/issues/detail?id=990
> >>
> >> Signed-off-by: Thomas Garnier <thgarnie-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
> >
> > Thanks for reworking this series!
> >
> > The bad state correctly BUGs under the LKDTM test:
> >
> > [   21.171586] lkdtm: Performing direct entry CORRUPT_USER_DS
> > [   21.172791] lkdtm: setting bad task size limit
> > [   21.173742] ------------[ cut here ]------------
> > [   21.174641] kernel BUG at ./include/linux/syscalls.h:220!
> > ...
> > [   21.193166] Call Trace:
> > [   21.193617]  ? trace_hardirqs_on_thunk+0x1a/0x1c
> > [   21.194443]  entry_SYSCALL64_slow_path+0x25/0x25
> >
> >
> > Tested-by: Kees Cook <keescook-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org>
>
> Is everyone happy with this patch for x86? Does this need anything
> more/different?

Asking again: any additional feedback? Does anyone want to pick it up?

>
> Thanks!
>
> -Kees
>
> --
> Kees Cook
> Pixel Security




-- 
Thomas

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v10 1/3] x86/syscalls: Check address limit on user-mode return
       [not found]           ` <CAJcbSZE6Og4gwhFwhy_-Jaq6GovwN3y1B6O89JmkpXHtVfDLBA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2017-07-06 20:48             ` Thomas Gleixner
  2017-07-06 20:52               ` Thomas Garnier
  0 siblings, 1 reply; 23+ messages in thread
From: Thomas Gleixner @ 2017-07-06 20:48 UTC (permalink / raw)
  To: Thomas Garnier
  Cc: Kees Cook, Ingo Molnar, H . Peter Anvin, Andy Lutomirski,
	Paolo Bonzini, Rik van Riel, Oleg Nesterov, Josh Poimboeuf,
	Petr Mladek, Miroslav Benes, Al Viro, Arnd Bergmann, Dave Hansen,
	David Howells, Russell King, Andy Lutomirski, Will Drewry,
	Will Deacon, Catalin Marinas, Mark Rutland, Pratyush Anand

On Thu, 6 Jul 2017, Thomas Garnier wrote:
> On Wed, Jun 28, 2017 at 10:52 AM, Kees Cook <keescook-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org> wrote:
> >
> > On Tue, Jun 20, 2017 at 1:24 PM, Kees Cook <keescook-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org> wrote:
> > > On Wed, Jun 14, 2017 at 6:12 PM, Thomas Garnier <thgarnie-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> wrote:
> > >> Ensure the address limit is a user-mode segment before returning to
> > >> user-mode. Otherwise a process can corrupt kernel-mode memory and elevate
> > >> privileges [1].
> > >>
> > >> The set_fs function sets the TIF_SETFS flag to force a slow path on
> > >> return. In the slow path, the address limit is checked to be USER_DS if
> > >> needed.
> > >>
> > >> The addr_limit_user_check function is added as a cross-architecture
> > >> function to check the address limit.
> > >>
> > >> [1] https://bugs.chromium.org/p/project-zero/issues/detail?id=990
> > >>
> > >> Signed-off-by: Thomas Garnier <thgarnie-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
> > >
> > > Thanks for reworking this series!
> > >
> > > The bad state correctly BUGs under the LKDTM test:
> > >
> > > [   21.171586] lkdtm: Performing direct entry CORRUPT_USER_DS
> > > [   21.172791] lkdtm: setting bad task size limit
> > > [   21.173742] ------------[ cut here ]------------
> > > [   21.174641] kernel BUG at ./include/linux/syscalls.h:220!
> > > ...
> > > [   21.193166] Call Trace:
> > > [   21.193617]  ? trace_hardirqs_on_thunk+0x1a/0x1c
> > > [   21.194443]  entry_SYSCALL64_slow_path+0x25/0x25
> > >
> > >
> > > Tested-by: Kees Cook <keescook-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org>
> >
> > Is everyone happy with this patch for x86? Does this need anything
> > more/different?
> 
> Asking again. Additional feedback? Anyone wants to pick-it up?

Can do. This needs to be a combo of all 3 I assume as the x86 one contains
the function used by all of them, right?

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v10 1/3] x86/syscalls: Check address limit on user-mode return
  2017-07-06 20:48             ` Thomas Gleixner
@ 2017-07-06 20:52               ` Thomas Garnier
  0 siblings, 0 replies; 23+ messages in thread
From: Thomas Garnier @ 2017-07-06 20:52 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Mark Rutland, kernel-hardening, Catalin Marinas, Will Deacon,
	LKML, David Howells, Dave Hansen, H . Peter Anvin,
	Miroslav Benes, Chris Metcalf, Pratyush Anand, x86, Russell King,
	Ingo Molnar, Petr Mladek, Rik van Riel, Kees Cook, Arnd Bergmann,
	Al Viro, Andy Lutomirski, Josh Poimboeuf, linux-arm-kernel

On Thu, Jul 6, 2017 at 1:48 PM, Thomas Gleixner <tglx@linutronix.de> wrote:
> On Thu, 6 Jul 2017, Thomas Garnier wrote:
>> On Wed, Jun 28, 2017 at 10:52 AM, Kees Cook <keescook@chromium.org> wrote:
>> >
>> > On Tue, Jun 20, 2017 at 1:24 PM, Kees Cook <keescook@chromium.org> wrote:
>> > > On Wed, Jun 14, 2017 at 6:12 PM, Thomas Garnier <thgarnie@google.com> wrote:
>> > >> Ensure the address limit is a user-mode segment before returning to
>> > >> user-mode. Otherwise a process can corrupt kernel-mode memory and elevate
>> > >> privileges [1].
>> > >>
>> > >> The set_fs function sets the TIF_SETFS flag to force a slow path on
>> > >> return. In the slow path, the address limit is checked to be USER_DS if
>> > >> needed.
>> > >>
>> > >> The addr_limit_user_check function is added as a cross-architecture
>> > >> function to check the address limit.
>> > >>
>> > >> [1] https://bugs.chromium.org/p/project-zero/issues/detail?id=990
>> > >>
>> > >> Signed-off-by: Thomas Garnier <thgarnie@google.com>
>> > >
>> > > Thanks for reworking this series!
>> > >
>> > > The bad state correctly BUGs under the LKDTM test:
>> > >
>> > > [   21.171586] lkdtm: Performing direct entry CORRUPT_USER_DS
>> > > [   21.172791] lkdtm: setting bad task size limit
>> > > [   21.173742] ------------[ cut here ]------------
>> > > [   21.174641] kernel BUG at ./include/linux/syscalls.h:220!
>> > > ...
>> > > [   21.193166] Call Trace:
>> > > [   21.193617]  ? trace_hardirqs_on_thunk+0x1a/0x1c
>> > > [   21.194443]  entry_SYSCALL64_slow_path+0x25/0x25
>> > >
>> > >
>> > > Tested-by: Kees Cook <keescook@chromium.org>
>> >
>> > Is everyone happy with this patch for x86? Does this need anything
>> > more/different?
>>
>> Asking again. Additional feedback? Anyone wants to pick-it up?
>
> Can do. This needs to be a combo of all 3 I assume as the x86 one contains
> the function used by all of them, right?

That is correct.

>
> Thanks,
>
>         tglx



-- 
Thomas

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v10 2/3] arm/syscalls: Check address limit on user-mode return
       [not found]   ` <20170615011203.144108-2-thgarnie-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
  2017-06-20 20:18     ` Kees Cook
@ 2017-07-18 14:36     ` Leonard Crestez
  2017-07-18 16:04       ` Thomas Garnier
  1 sibling, 1 reply; 23+ messages in thread
From: Leonard Crestez @ 2017-07-18 14:36 UTC (permalink / raw)
  To: Thomas Garnier, Thomas Gleixner, Ingo Molnar, H . Peter Anvin,
	Andy Lutomirski, Paolo Bonzini, Rik van Riel, Oleg Nesterov,
	Josh Poimboeuf, Petr Mladek, Miroslav Benes, Kees Cook, Al Viro,
	Arnd Bergmann, Dave Hansen, David Howells, Russell King,
	Andy Lutomirski, Will Drewry, Will Deacon, Catalin Marinas
  Cc: linux-api-u79uwXL29TY76Z2rM5mHXA, x86-DgEjT+Ai2ygdnm+yROfE0A,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r,
	kernel-hardening-ZwoEplunGu1jrUoiu81ncdBPR1lH4CV8,
	Octavian Purdila

On Wed, 2017-06-14 at 18:12 -0700, Thomas Garnier wrote:
> Ensure the address limit is a user-mode segment before returning to
> user-mode. Otherwise a process can corrupt kernel-mode memory and
> elevate privileges [1].
> 
> The set_fs function sets the TIF_FSCHECK flag to force a slow path on
> return. In the slow path, the address limit is checked to be USER_DS if
> needed.
> 
> The TIF_FSCHECK flag is added to _TIF_WORK_MASK shifting _TIF_SYSCALL_WORK
> for arm instruction immediate support. The global work mask is too big
> to be used on a single instruction so adapt ret_fast_syscall.
> 
> [1] https://bugs.chromium.org/p/project-zero/issues/detail?id=990
> 
> Signed-off-by: Thomas Garnier <thgarnie-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
> ---
> v10 redesigns the change to use work flags on set_fs as recommended by
> Linus and agreed by others.
> 
> Based on next-20170609
> ---
>  arch/arm/include/asm/thread_info.h | 15 +++++++++------
>  arch/arm/include/asm/uaccess.h     |  2 ++
>  arch/arm/kernel/entry-common.S     |  9 +++++++--
>  arch/arm/kernel/signal.c           |  5 +++++
>  4 files changed, 23 insertions(+), 8 deletions(-)
> 
> diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h
> index 776757d1604a..1d468b527b7b 100644
> --- a/arch/arm/include/asm/thread_info.h
> +++ b/arch/arm/include/asm/thread_info.h
> @@ -139,10 +139,11 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
>  #define TIF_NEED_RESCHED	1	/* rescheduling necessary */
>  #define TIF_NOTIFY_RESUME	2	/* callback before returning to user */
>  #define TIF_UPROBE		3	/* breakpointed or singlestepping */
> -#define TIF_SYSCALL_TRACE	4	/* syscall trace active */
> -#define TIF_SYSCALL_AUDIT	5	/* syscall auditing active */
> -#define TIF_SYSCALL_TRACEPOINT	6	/* syscall tracepoint instrumentation */
> -#define TIF_SECCOMP		7	/* seccomp syscall filtering active */
> +#define TIF_FSCHECK		4	/* Check FS is USER_DS on return */
> +#define TIF_SYSCALL_TRACE	5	/* syscall trace active */
> +#define TIF_SYSCALL_AUDIT	6	/* syscall auditing active */
> +#define TIF_SYSCALL_TRACEPOINT	7	/* syscall tracepoint instrumentation */
> +#define TIF_SECCOMP		8	/* seccomp syscall filtering active */
>  
>  #define TIF_NOHZ		12	/* in adaptive nohz mode */
>  #define TIF_USING_IWMMXT	17
> @@ -153,6 +154,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
>  #define _TIF_NEED_RESCHED	(1 << TIF_NEED_RESCHED)
>  #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
>  #define _TIF_UPROBE		(1 << TIF_UPROBE)
> +#define _TIF_FSCHECK		(1 << TIF_FSCHECK)
>  #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
>  #define _TIF_SYSCALL_AUDIT	(1 << TIF_SYSCALL_AUDIT)
>  #define _TIF_SYSCALL_TRACEPOINT	(1 << TIF_SYSCALL_TRACEPOINT)
> @@ -166,8 +168,9 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
>  /*
>   * Change these and you break ASM code in entry-common.S
>   */
> -#define _TIF_WORK_MASK		(_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
> -				 _TIF_NOTIFY_RESUME | _TIF_UPROBE)
> +#define _TIF_WORK_MASK		(_TIF_NEED_RESCHED | _TIF_SIGPENDING |	\
> +				 _TIF_NOTIFY_RESUME | _TIF_UPROBE |	\
> +				 _TIF_FSCHECK)
>  
>  #endif /* __KERNEL__ */
>  #endif /* __ASM_ARM_THREAD_INFO_H */
> diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h
> index 2577405d082d..6cc882223e34 100644
> --- a/arch/arm/include/asm/uaccess.h
> +++ b/arch/arm/include/asm/uaccess.h
> @@ -77,6 +77,8 @@ static inline void set_fs(mm_segment_t fs)
>  {
>  	current_thread_info()->addr_limit = fs;
>  	modify_domain(DOMAIN_KERNEL, fs ? DOMAIN_CLIENT : DOMAIN_MANAGER);
> +	/* On user-mode return, check fs is correct */
> +	set_thread_flag(TIF_FSCHECK);
>  }
>  
>  #define segment_eq(a, b)	((a) == (b))
> diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
> index eb5cd77bf1d8..e33c32d56193 100644
> --- a/arch/arm/kernel/entry-common.S
> +++ b/arch/arm/kernel/entry-common.S
> @@ -41,7 +41,9 @@ ret_fast_syscall:
>   UNWIND(.cantunwind	)
>  	disable_irq_notrace			@ disable interrupts
>  	ldr	r1, [tsk, #TI_FLAGS]		@ re-check for syscall tracing
> -	tst	r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
> +	tst	r1, #_TIF_SYSCALL_WORK
> +	bne	fast_work_pending
> +	tst	r1, #_TIF_WORK_MASK
>  	bne	fast_work_pending
>  
>  	/* perform architecture specific actions before user return */
> @@ -67,12 +69,15 @@ ret_fast_syscall:
>  	str	r0, [sp, #S_R0 + S_OFF]!	@ save returned r0
>  	disable_irq_notrace			@ disable interrupts
>  	ldr	r1, [tsk, #TI_FLAGS]		@ re-check for syscall tracing
> -	tst	r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
> +	tst	r1, #_TIF_SYSCALL_WORK
> +	bne	fast_work_pending
> +	tst	r1, #_TIF_WORK_MASK
>  	beq	no_work_pending
>   UNWIND(.fnend		)
>  ENDPROC(ret_fast_syscall)
>  
>  	/* Slower path - fall through to work_pending */
> +fast_work_pending:
>  #endif
>  
>  	tst	r1, #_TIF_SYSCALL_WORK
> diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
> index 7b8f2141427b..3a48b54c6405 100644
> --- a/arch/arm/kernel/signal.c
> +++ b/arch/arm/kernel/signal.c
> @@ -14,6 +14,7 @@
>  #include 
>  #include 
>  #include 
> +#include <linux/syscalls.h>
>  
>  #include 
>  #include 
> @@ -571,6 +572,10 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
>  	 * Update the trace code with the current status.
>  	 */
>  	trace_hardirqs_off();
> +
> +	/* Check valid user FS if needed */
> +	addr_limit_user_check();
> +
>  	do {
>  		if (likely(thread_flags & _TIF_NEED_RESCHED)) {
>  			schedule();

This patch made its way into linux-next next-20170717 and it seems to
cause hangs when booting some boards over NFS (found via bisection). I
don't know exactly what determines the issue but I can reproduce hangs
even if I just boot with init=/bin/bash and do stuff like

# sleep 1 & sleep 1 & sleep 1 & wait; wait; wait; echo done!

When this happens sysrq-t shows a sleep task hung in the 'R' state
spinning in do_work_pending, so maybe there is a potential infinite
loop here?

The addr_limit_user_check at the start of do_work_pending will check
for TIF_FSCHECK once and clear it but the function loops while
(thread_flags & _TIF_WORK_MASK), so if TIF_FSCHECK is set again then
the loop will never terminate. Does this make sense?

I added some instrumentation to check if TIF_FSCHECK can show up during
the do_work_pending loop and the answer seems to be yes. I also tried
to get a stack with a set_fs call from inside do_work_pending and got
the following:

[  227.582402] CPU: 0 PID: 829 Comm: sleep Not tainted 4.12.0-01057-g93af8f7-dirty #332
[  227.590171] Hardware name: Freescale i.MX6 SoloLite (Device Tree)
[  227.596275] Backtrace: 
[  227.598754] [<c010cbb4>] (dump_backtrace) from [<c010ce60>] (show_stack+0x18/0x1c)
[  227.606339]  r7:00000000 r6:60070113 r5:00000000 r4:c105a958
[  227.612016] [<c010ce48>] (show_stack) from [<c0493498>] (dump_stack+0xb4/0xe8)
[  227.619258] [<c04933e4>] (dump_stack) from [<c010c350>] (mydbg_set_fs+0x40/0x48)
[  227.626671]  r9:c08cf35c r8:ee1cda7c r7:ee1e3dce r6:bf000000 r5:00000000 r4:ffffe000
[  227.634433] [<c010c310>] (mydbg_set_fs) from [<c021f0b8>] (__probe_kernel_read+0x44/0xd0)
[  227.642629] [<c021f074>] (__probe_kernel_read) from [<c011b8d8>] (do_alignment+0x8c/0x75c)
[  227.650909]  r10:ef085000 r9:c08cf35c r8:00000001 r7:ee1e3dce r6:c011b84c r5:ee1cdbe0
[  227.658748]  r4:00000000 r3:00000000
[  227.662338] [<c011b84c>] (do_alignment) from [<c0101394>] (do_DataAbort+0x40/0xc0)
[  227.669921]  r10:ef085000 r9:ee1cc000 r8:ee1cdbe0 r7:ee1e3dce r6:c011b84c r5:00000001
[  227.677760]  r4:c100dd3c
[  227.680308] [<c0101354>] (do_DataAbort) from [<c010da44>] (__dabt_svc+0x64/0xa0)
[  227.687714] Exception stack(0xee1cdbe0 to 0xee1cdc28)
[  227.692780] dbe0: 9064a8c0 ee1e3de2 d82727d8 00000000 ee1b20c0 ee1e3dce 00000000 ef08572c
[  227.700971] dc00: c0bb2034 c10c75ea ef085000 ee1cdc74 ee1cdc00 ee1cdc30 c01761a8 c08cf35c
[  227.709158] dc20: 40070113 ffffffff
[  227.712661]  r8:c0bb2034 r7:ee1cdc14 r6:ffffffff r5:40070113 r4:c08cf35c
[  227.719382] [<c08cf16c>] (inet_gro_receive) from [<c084a8ec>] (dev_gro_receive+0x2f0/0x618)
[  227.727746]  r10:ef085000 r9:00000001 r8:00000000 r7:ef085710 r6:c1008b88 r5:ee1b20c0
[  227.735585]  r4:c1009f78
[  227.738132] [<c084a5fc>] (dev_gro_receive) from [<c084ac8c>] (napi_gro_receive+0x78/0x1f4)
[  227.746410]  r10:ef085000 r9:00000001 r8:c10d15ec r7:c100792c r6:ef085710 r5:c10c744e
[  227.754249]  r4:ee1b20c0
[  227.756801] [<c084ac14>] (napi_gro_receive) from [<c06a2784>] (fec_enet_rx_napi+0x39c/0x988)
[  227.765253]  r9:00000001 r8:f0c8a960 r7:00000000 r6:00000000 r5:ef086000 r4:ee1b20c0
[  227.773010] [<c06a23e8>] (fec_enet_rx_napi) from [<c084a3a4>] (net_rx_action+0x21c/0x474)
[  227.781201]  r10:ee1cdd78 r9:c0fa7b80 r8:ef7dab80 r7:0000012c r6:00000040 r5:00000001
[  227.789039]  r4:ef085710
[  227.791593] [<c084a188>] (net_rx_action) from [<c012f2d4>] (__do_softirq+0x158/0x534)
[  227.799437]  r10:00000008 r9:ee1cc000 r8:c10ce568 r7:c100792c r6:c10247bd r5:00000003
[  227.807275]  r4:c100208c
[  227.809824] [<c012f17c>] (__do_softirq) from [<c012fa68>] (irq_exit+0xec/0x168)
[  227.817147]  r10:c1007ea0 r9:ef010400 r8:00000001 r7:00000000 r6:c1007d3c r5:00000000
[  227.824984]  r4:c0fa534c
[  227.827534] [<c012f97c>] (irq_exit) from [<c01883f4>] (__handle_domain_irq+0x74/0xe8)
[  227.835377] [<c0188380>] (__handle_domain_irq) from [<c01015fc>] (gic_handle_irq+0x58/0xbc)
[  227.843742]  r9:f080b100 r8:c105ae80 r7:ee1cde80 r6:000003ff r5:000003eb r4:f080b10c
[  227.851498] [<c01015a4>] (gic_handle_irq) from [<c010daf0>] (__irq_svc+0x70/0x98)
[  227.858990] Exception stack(0xee1cde80 to 0xee1cdec8)
[  227.864056] de80: ee7a1140 00000001 00000000 000012a9 ee7a1140 ee9d9f10 ee76edc0 ee9d9f60
[  227.872248] dea0: 00000000 ee9d9f10 00000010 ee1cdeec ee1cdeb8 ee1cded0 c038a77c c0389688
[  227.880434] dec0: 60070013 ffffffff
[  227.883937]  r10:00000010 r9:ee1cc000 r8:00000000 r7:ee1cdeb4 r6:ffffffff r5:60070013
[  227.891775]  r4:c0389688
[  227.894327] [<c038a6f8>] (nfs_file_clear_open_context) from [<c03860e8>] (nfs_file_release+0x54/0x60)
[  227.903558]  r7:ee9a78a0 r6:ee68f010 r5:ee9d9f10 r4:ee76edc0
[  227.909235] [<c0386094>] (nfs_file_release) from [<c0276cb4>] (__fput+0x94/0x1e0)
[  227.916734] [<c0276c20>] (__fput) from [<c0276e60>] (____fput+0x10/0x14)
[  227.923448]  r10:c10d4298 r9:00000000 r8:00000000 r7:ef2ed780 r6:ef2edc00 r5:c10d5180
[  227.931286]  r4:ef2edbd4
[  227.933839] [<c0276e50>] (____fput) from [<c014c534>] (task_work_run+0xc8/0xec)
[  227.941166] [<c014c46c>] (task_work_run) from [<c010c484>] (do_work_pending+0x12c/0x1c4)
[  227.949271]  r9:ee1cdfb0 r8:00000000 r7:00000000 r6:ee1cc000 r5:00000000 r4:00000000
[  227.957029] [<c010c358>] (do_work_pending) from [<c0107c90>] (slow_work_pending+0xc/0x20)
[  227.965219]  r10:00000000 r9:ee1cc000 r8:c0107e24 r7:0000005b r6:b6f76568 r5:b6f741f0
[  227.973058]  r4:b6f76904

Maybe the reason this reproduces easily in this particular setup is
that ethernet causes lots of alignment faults?

--
Regards,
Leonard

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v10 2/3] arm/syscalls: Check address limit on user-mode return
  2017-07-18 14:36     ` Leonard Crestez
@ 2017-07-18 16:04       ` Thomas Garnier
       [not found]         ` <CAJcbSZEr8HPBwH1oVaHqPzAY4MS_=yqMoqPhcauuKu3cikB3uQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  0 siblings, 1 reply; 23+ messages in thread
From: Thomas Garnier @ 2017-07-18 16:04 UTC (permalink / raw)
  To: Leonard Crestez
  Cc: Thomas Gleixner, Ingo Molnar, H . Peter Anvin, Andy Lutomirski,
	Paolo Bonzini, Rik van Riel, Oleg Nesterov, Josh Poimboeuf,
	Petr Mladek, Miroslav Benes, Kees Cook, Al Viro, Arnd Bergmann,
	Dave Hansen, David Howells, Russell King, Andy Lutomirski,
	Will Drewry, Will Deacon, Catalin Marinas, Mark Rutland

On Tue, Jul 18, 2017 at 7:36 AM, Leonard Crestez
<leonard.crestez@nxp.com> wrote:
> On Wed, 2017-06-14 at 18:12 -0700, Thomas Garnier wrote:
>> Ensure the address limit is a user-mode segment before returning to
>> user-mode. Otherwise a process can corrupt kernel-mode memory and
>> elevate privileges [1].
>>
>> The set_fs function sets the TIF_SETFS flag to force a slow path on
>> return. In the slow path, the address limit is checked to be USER_DS if
>> needed.
>>
>> The TIF_SETFS flag is added to _TIF_WORK_MASK shifting _TIF_SYSCALL_WORK
>> for arm instruction immediate support. The global work mask is too big
>> to used on a single instruction so adapt ret_fast_syscall.
>>
>> [1] https://bugs.chromium.org/p/project-zero/issues/detail?id=990
>>
>> Signed-off-by: Thomas Garnier <thgarnie@google.com>
>> ---
>> v10 redesigns the change to use work flags on set_fs as recommended by
>> Linus and agreed by others.
>>
>> Based on next-20170609
>> ---
>>  arch/arm/include/asm/thread_info.h | 15 +++++++++------
>>  arch/arm/include/asm/uaccess.h     |  2 ++
>>  arch/arm/kernel/entry-common.S     |  9 +++++++--
>>  arch/arm/kernel/signal.c           |  5 +++++
>>  4 files changed, 23 insertions(+), 8 deletions(-)
>>
>> diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h
>> index 776757d1604a..1d468b527b7b 100644
>> --- a/arch/arm/include/asm/thread_info.h
>> +++ b/arch/arm/include/asm/thread_info.h
>> @@ -139,10 +139,11 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
>>  #define TIF_NEED_RESCHED     1       /* rescheduling necessary */
>>  #define TIF_NOTIFY_RESUME    2       /* callback before returning to user */
>>  #define TIF_UPROBE           3       /* breakpointed or singlestepping */
>> -#define TIF_SYSCALL_TRACE    4       /* syscall trace active */
>> -#define TIF_SYSCALL_AUDIT    5       /* syscall auditing active */
>> -#define TIF_SYSCALL_TRACEPOINT       6       /* syscall tracepoint instrumentation */
>> -#define TIF_SECCOMP          7       /* seccomp syscall filtering active */
>> +#define TIF_FSCHECK          4       /* Check FS is USER_DS on return */
>> +#define TIF_SYSCALL_TRACE    5       /* syscall trace active */
>> +#define TIF_SYSCALL_AUDIT    6       /* syscall auditing active */
>> +#define TIF_SYSCALL_TRACEPOINT       7       /* syscall tracepoint instrumentation */
>> +#define TIF_SECCOMP          8       /* seccomp syscall filtering active */
>>
>>  #define TIF_NOHZ             12      /* in adaptive nohz mode */
>>  #define TIF_USING_IWMMXT     17
>> @@ -153,6 +154,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
>>  #define _TIF_NEED_RESCHED    (1 << TIF_NEED_RESCHED)
>>  #define _TIF_NOTIFY_RESUME   (1 << TIF_NOTIFY_RESUME)
>>  #define _TIF_UPROBE          (1 << TIF_UPROBE)
>> +#define _TIF_FSCHECK         (1 << TIF_FSCHECK)
>>  #define _TIF_SYSCALL_TRACE   (1 << TIF_SYSCALL_TRACE)
>>  #define _TIF_SYSCALL_AUDIT   (1 << TIF_SYSCALL_AUDIT)
>>  #define _TIF_SYSCALL_TRACEPOINT      (1 << TIF_SYSCALL_TRACEPOINT)
>> @@ -166,8 +168,9 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *,
>>  /*
>>   * Change these and you break ASM code in entry-common.S
>>   */
>> -#define _TIF_WORK_MASK               (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
>> -                              _TIF_NOTIFY_RESUME | _TIF_UPROBE)
>> +#define _TIF_WORK_MASK               (_TIF_NEED_RESCHED | _TIF_SIGPENDING |  \
>> +                              _TIF_NOTIFY_RESUME | _TIF_UPROBE |     \
>> +                              _TIF_FSCHECK)
>>
>>  #endif /* __KERNEL__ */
>>  #endif /* __ASM_ARM_THREAD_INFO_H */
>> diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h
>> index 2577405d082d..6cc882223e34 100644
>> --- a/arch/arm/include/asm/uaccess.h
>> +++ b/arch/arm/include/asm/uaccess.h
>> @@ -77,6 +77,8 @@ static inline void set_fs(mm_segment_t fs)
>>  {
>>       current_thread_info()->addr_limit = fs;
>>       modify_domain(DOMAIN_KERNEL, fs ? DOMAIN_CLIENT : DOMAIN_MANAGER);
>> +     /* On user-mode return, check fs is correct */
>> +     set_thread_flag(TIF_FSCHECK);
>>  }
>>
>>  #define segment_eq(a, b)     ((a) == (b))
>> diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
>> index eb5cd77bf1d8..e33c32d56193 100644
>> --- a/arch/arm/kernel/entry-common.S
>> +++ b/arch/arm/kernel/entry-common.S
>> @@ -41,7 +41,9 @@ ret_fast_syscall:
>>   UNWIND(.cantunwind  )
>>       disable_irq_notrace                     @ disable interrupts
>>       ldr     r1, [tsk, #TI_FLAGS]            @ re-check for syscall tracing
>> -     tst     r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
>> +     tst     r1, #_TIF_SYSCALL_WORK
>> +     bne     fast_work_pending
>> +     tst     r1, #_TIF_WORK_MASK
>>       bne     fast_work_pending
>>
>>       /* perform architecture specific actions before user return */
>> @@ -67,12 +69,15 @@ ret_fast_syscall:
>>       str     r0, [sp, #S_R0 + S_OFF]!        @ save returned r0
>>       disable_irq_notrace                     @ disable interrupts
>>       ldr     r1, [tsk, #TI_FLAGS]            @ re-check for syscall tracing
>> -     tst     r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
>> +     tst     r1, #_TIF_SYSCALL_WORK
>> +     bne     fast_work_pending
>> +     tst     r1, #_TIF_WORK_MASK
>>       beq     no_work_pending
>>   UNWIND(.fnend               )
>>  ENDPROC(ret_fast_syscall)
>>
>>       /* Slower path - fall through to work_pending */
>> +fast_work_pending:
>>  #endif
>>
>>       tst     r1, #_TIF_SYSCALL_WORK
>> diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
>> index 7b8f2141427b..3a48b54c6405 100644
>> --- a/arch/arm/kernel/signal.c
>> +++ b/arch/arm/kernel/signal.c
>> @@ -14,6 +14,7 @@
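>>  #include <linux/uaccess.h>
>>  #include <linux/tracehook.h>
>>  #include <linux/uprobes.h>
>> +#include <linux/syscalls.h>
>>
>>  #include <asm/elf.h>
>>  #include <asm/cacheflush.h>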
>> @@ -571,6 +572,10 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
>>        * Update the trace code with the current status.
>>        */
>>       trace_hardirqs_off();
>> +
>> +     /* Check valid user FS if needed */
>> +     addr_limit_user_check();
>> +
>>       do {
>>               if (likely(thread_flags & _TIF_NEED_RESCHED)) {
>>                       schedule();
>
> This patch made it's way into linux-next next-20170717 and it seems to
> cause hangs when booting some boards over NFS (found via bisection). I
> don't know exactly what determines the issue but I can reproduce hangs
> if even if I just boot with init=/bin/bash and do stuff like
>
> # sleep 1 & sleep 1 & sleep 1 & wait; wait; wait; echo done!
>
> When this happens sysrq-t shows a sleep task hung in the 'R' state
> spinning in do_work_pending, so maybe there is a potential infinite
> loop here?
>
> The addr_limit_user_check at the start of do_work_pending will check
> for TIF_FSCHECK once and clear it but the function loops while
> (thread_flags & _TIF_WORK_MASK), so it if TIF_FSCHECK is set again then
> the loop will never terminate. Does this make sense?

Yes, it does. Thanks for looking into this.

>
> I added some instrumentation to check if TIF_FSCHECK can show up during
> the do_work_pending loop and the answer seems to be yes. I also tried
> to get a stack with a set_fs call from inside do_work_pending and got
> the following:
>
> [  227.582402] CPU: 0 PID: 829 Comm: sleep Not tainted 4.12.0-01057-g93af8f7-dirty #332
> [  227.590171] Hardware name: Freescale i.MX6 SoloLite (Device Tree)
> [  227.596275] Backtrace:
> [  227.598754] [<c010cbb4>] (dump_backtrace) from [<c010ce60>] (show_stack+0x18/0x1c)
> [  227.606339]  r7:00000000 r6:60070113 r5:00000000 r4:c105a958
> [  227.612016] [<c010ce48>] (show_stack) from [<c0493498>] (dump_stack+0xb4/0xe8)
> [  227.619258] [<c04933e4>] (dump_stack) from [<c010c350>] (mydbg_set_fs+0x40/0x48)
> [  227.626671]  r9:c08cf35c r8:ee1cda7c r7:ee1e3dce r6:bf000000 r5:00000000 r4:ffffe000
> [  227.634433] [<c010c310>] (mydbg_set_fs) from [<c021f0b8>] (__probe_kernel_read+0x44/0xd0)
> [  227.642629] [<c021f074>] (__probe_kernel_read) from [<c011b8d8>] (do_alignment+0x8c/0x75c)
> [  227.650909]  r10:ef085000 r9:c08cf35c r8:00000001 r7:ee1e3dce r6:c011b84c r5:ee1cdbe0
> [  227.658748]  r4:00000000 r3:00000000
> [  227.662338] [<c011b84c>] (do_alignment) from [<c0101394>] (do_DataAbort+0x40/0xc0)
> [  227.669921]  r10:ef085000 r9:ee1cc000 r8:ee1cdbe0 r7:ee1e3dce r6:c011b84c r5:00000001
> [  227.677760]  r4:c100dd3c
> [  227.680308] [<c0101354>] (do_DataAbort) from [<c010da44>] (__dabt_svc+0x64/0xa0)
> [  227.687714] Exception stack(0xee1cdbe0 to 0xee1cdc28)
> [  227.692780] dbe0: 9064a8c0 ee1e3de2 d82727d8 00000000 ee1b20c0 ee1e3dce 00000000 ef08572c
> [  227.700971] dc00: c0bb2034 c10c75ea ef085000 ee1cdc74 ee1cdc00 ee1cdc30 c01761a8 c08cf35c
> [  227.709158] dc20: 40070113 ffffffff
> [  227.712661]  r8:c0bb2034 r7:ee1cdc14 r6:ffffffff r5:40070113 r4:c08cf35c
> [  227.719382] [<c08cf16c>] (inet_gro_receive) from [<c084a8ec>] (dev_gro_receive+0x2f0/0x618)
> [  227.727746]  r10:ef085000 r9:00000001 r8:00000000 r7:ef085710 r6:c1008b88 r5:ee1b20c0
> [  227.735585]  r4:c1009f78
> [  227.738132] [<c084a5fc>] (dev_gro_receive) from [<c084ac8c>] (napi_gro_receive+0x78/0x1f4)
> [  227.746410]  r10:ef085000 r9:00000001 r8:c10d15ec r7:c100792c r6:ef085710 r5:c10c744e
> [  227.754249]  r4:ee1b20c0
> [  227.756801] [<c084ac14>] (napi_gro_receive) from [<c06a2784>] (fec_enet_rx_napi+0x39c/0x988)
> [  227.765253]  r9:00000001 r8:f0c8a960 r7:00000000 r6:00000000 r5:ef086000 r4:ee1b20c0
> [  227.773010] [<c06a23e8>] (fec_enet_rx_napi) from [<c084a3a4>] (net_rx_action+0x21c/0x474)
> [  227.781201]  r10:ee1cdd78 r9:c0fa7b80 r8:ef7dab80 r7:0000012c r6:00000040 r5:00000001
> [  227.789039]  r4:ef085710
> [  227.791593] [<c084a188>] (net_rx_action) from [<c012f2d4>] (__do_softirq+0x158/0x534)
> [  227.799437]  r10:00000008 r9:ee1cc000 r8:c10ce568 r7:c100792c r6:c10247bd r5:00000003
> [  227.807275]  r4:c100208c
> [  227.809824] [<c012f17c>] (__do_softirq) from [<c012fa68>] (irq_exit+0xec/0x168)
> [  227.817147]  r10:c1007ea0 r9:ef010400 r8:00000001 r7:00000000 r6:c1007d3c r5:00000000
> [  227.824984]  r4:c0fa534c
> [  227.827534] [<c012f97c>] (irq_exit) from [<c01883f4>] (__handle_domain_irq+0x74/0xe8)
> [  227.835377] [<c0188380>] (__handle_domain_irq) from [<c01015fc>] (gic_handle_irq+0x58/0xbc)
> [  227.843742]  r9:f080b100 r8:c105ae80 r7:ee1cde80 r6:000003ff r5:000003eb r4:f080b10c
> [  227.851498] [<c01015a4>] (gic_handle_irq) from [<c010daf0>] (__irq_svc+0x70/0x98)
> [  227.858990] Exception stack(0xee1cde80 to 0xee1cdec8)
> [  227.864056] de80: ee7a1140 00000001 00000000 000012a9 ee7a1140 ee9d9f10 ee76edc0 ee9d9f60
> [  227.872248] dea0: 00000000 ee9d9f10 00000010 ee1cdeec ee1cdeb8 ee1cded0 c038a77c c0389688
> [  227.880434] dec0: 60070013 ffffffff
> [  227.883937]  r10:00000010 r9:ee1cc000 r8:00000000 r7:ee1cdeb4 r6:ffffffff r5:60070013
> [  227.891775]  r4:c0389688
> [  227.894327] [<c038a6f8>] (nfs_file_clear_open_context) from [<c03860e8>] (nfs_file_release+0x54/0x60)
> [  227.903558]  r7:ee9a78a0 r6:ee68f010 r5:ee9d9f10 r4:ee76edc0
> [  227.909235] [<c0386094>] (nfs_file_release) from [<c0276cb4>] (__fput+0x94/0x1e0)
> [  227.916734] [<c0276c20>] (__fput) from [<c0276e60>] (____fput+0x10/0x14)
> [  227.923448]  r10:c10d4298 r9:00000000 r8:00000000 r7:ef2ed780 r6:ef2edc00 r5:c10d5180
> [  227.931286]  r4:ef2edbd4
> [  227.933839] [<c0276e50>] (____fput) from [<c014c534>] (task_work_run+0xc8/0xec)
> [  227.941166] [<c014c46c>] (task_work_run) from [<c010c484>] (do_work_pending+0x12c/0x1c4)
> [  227.949271]  r9:ee1cdfb0 r8:00000000 r7:00000000 r6:ee1cc000 r5:00000000 r4:00000000
> [  227.957029] [<c010c358>] (do_work_pending) from [<c0107c90>] (slow_work_pending+0xc/0x20)
> [  227.965219]  r10:00000000 r9:ee1cc000 r8:c0107e24 r7:0000005b r6:b6f76568 r5:b6f741f0
> [  227.973058]  r4:b6f76904
>
> Maybe the reason this reproduces easily in this particular setup is
> that ethernet causes lots of alignment faults?

Can you try this change?

diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
index 3a48b54c6405..bc6ad7789568 100644
--- a/arch/arm/kernel/signal.c
+++ b/arch/arm/kernel/signal.c
@@ -573,12 +573,11 @@ do_work_pending(struct pt_regs *regs, unsigned
int thread_flags, int syscall)
  */
  trace_hardirqs_off();

- /* Check valid user FS if needed */
- addr_limit_user_check();
-
  do {
  if (likely(thread_flags & _TIF_NEED_RESCHED)) {
  schedule();
+ } else if (thread_flags & _TIF_FSCHECK) {
+ addr_limit_user_check();
  } else {
  if (unlikely(!user_mode(regs)))
  return 0;

>
> --
> Regards,
> Leonard



-- 
Thomas

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [PATCH v10 2/3] arm/syscalls: Check address limit on user-mode return
       [not found]         ` <CAJcbSZEr8HPBwH1oVaHqPzAY4MS_=yqMoqPhcauuKu3cikB3uQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2017-07-18 17:18           ` Leonard Crestez
  2017-07-18 19:04             ` Thomas Garnier
  0 siblings, 1 reply; 23+ messages in thread
From: Leonard Crestez @ 2017-07-18 17:18 UTC (permalink / raw)
  To: Thomas Garnier
  Cc: Thomas Gleixner, Ingo Molnar, H . Peter Anvin, Andy Lutomirski,
	Paolo Bonzini, Rik van Riel, Oleg Nesterov, Josh Poimboeuf,
	Petr Mladek, Miroslav Benes, Kees Cook, Al Viro, Arnd Bergmann,
	Dave Hansen, David Howells, Russell King, Andy Lutomirski,
	Will Drewry, Will Deacon, Catalin Marinas, Mark Rutland

On Tue, 2017-07-18 at 09:04 -0700, Thomas Garnier wrote:
> On Tue, Jul 18, 2017 at 7:36 AM, Leonard Crestez <leonard.crestez-3arQi8VN3Tc@public.gmane.org> wrote:
> > 
> > On Wed, 2017-06-14 at 18:12 -0700, Thomas Garnier wrote:
> > > 
> > > Ensure the address limit is a user-mode segment before returning to
> > > user-mode. Otherwise a process can corrupt kernel-mode memory and
> > > elevate privileges [1].
> > > 
> > > The set_fs function sets the TIF_SETFS flag to force a slow path on
> > > return. In the slow path, the address limit is checked to be USER_DS if
> > > needed.
> > > 
> > > The TIF_SETFS flag is added to _TIF_WORK_MASK shifting _TIF_SYSCALL_WORK
> > > for arm instruction immediate support. The global work mask is too big
> > > to used on a single instruction so adapt ret_fast_syscall.
> > > 
> > > @@ -571,6 +572,10 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
> > >        * Update the trace code with the current status.
> > >        */
> > >       trace_hardirqs_off();
> > > +
> > > +     /* Check valid user FS if needed */
> > > +     addr_limit_user_check();
> > > +
> > >       do {
> > >               if (likely(thread_flags & _TIF_NEED_RESCHED)) {
> > >                       schedule();
> > This patch made it's way into linux-next next-20170717 and it seems to
> > cause hangs when booting some boards over NFS (found via bisection). I
> > don't know exactly what determines the issue but I can reproduce hangs
> > if even if I just boot with init=/bin/bash and do stuff like
> > 
> > # sleep 1 & sleep 1 & sleep 1 & wait; wait; wait; echo done!
> > 
> > When this happens sysrq-t shows a sleep task hung in the 'R' state
> > spinning in do_work_pending, so maybe there is a potential infinite
> > loop here?
> > 
> > The addr_limit_user_check at the start of do_work_pending will check
> > for TIF_FSCHECK once and clear it but the function loops while
> > (thread_flags & _TIF_WORK_MASK), so it if TIF_FSCHECK is set again then
> > the loop will never terminate. Does this make sense?

> Yes, it does. Thanks for looking into this.

> > I added some instrumentation to check if TIF_FSCHECK can show up during
> > the do_work_pending loop and the answer seems to be yes. I also tried
> > to get a stack with a set_fs call from inside do_work_pending and got
> > the following:
> > 
> > [  227.582402] CPU: 0 PID: 829 Comm: sleep Not tainted 4.12.0-01057-g93af8f7-dirty #332
> > [  227.590171] Hardware name: Freescale i.MX6 SoloLite (Device Tree)
> > [  227.596275] Backtrace:
> > [  227.598754] [] (dump_backtrace) from [] (show_stack+0x18/0x1c)
> > [  227.606339]  r7:00000000 r6:60070113 r5:00000000 r4:c105a958
> > [  227.612016] [] (show_stack) from [] (dump_stack+0xb4/0xe8)
> > [  227.619258] [] (dump_stack) from [] (mydbg_set_fs+0x40/0x48)
> > [  227.626671]  r9:c08cf35c r8:ee1cda7c r7:ee1e3dce r6:bf000000 r5:00000000 r4:ffffe000
> > [  227.634433] [] (mydbg_set_fs) from [] (__probe_kernel_read+0x44/0xd0)
> > [  227.642629] [] (__probe_kernel_read) from [] (do_alignment+0x8c/0x75c)
> > [  227.650909]  r10:ef085000 r9:c08cf35c r8:00000001 r7:ee1e3dce r6:c011b84c r5:ee1cdbe0
> > [  227.658748]  r4:00000000 r3:00000000
> > [  227.662338] [] (do_alignment) from [] (do_DataAbort+0x40/0xc0)
> > [  227.669921]  r10:ef085000 r9:ee1cc000 r8:ee1cdbe0 r7:ee1e3dce r6:c011b84c r5:00000001
> > [  227.677760]  r4:c100dd3c
> > [  227.680308] [] (do_DataAbort) from [] (__dabt_svc+0x64/0xa0)
> > [  227.687714] Exception stack(0xee1cdbe0 to 0xee1cdc28)
> > [  227.692780] dbe0: 9064a8c0 ee1e3de2 d82727d8 00000000 ee1b20c0 ee1e3dce 00000000 ef08572c
> > [  227.700971] dc00: c0bb2034 c10c75ea ef085000 ee1cdc74 ee1cdc00 ee1cdc30 c01761a8 c08cf35c
> > [  227.709158] dc20: 40070113 ffffffff
> > [  227.712661]  r8:c0bb2034 r7:ee1cdc14 r6:ffffffff r5:40070113 r4:c08cf35c
> > [  227.719382] [] (inet_gro_receive) from [] (dev_gro_receive+0x2f0/0x618)
> > [  227.727746]  r10:ef085000 r9:00000001 r8:00000000 r7:ef085710 r6:c1008b88 r5:ee1b20c0
> > [  227.735585]  r4:c1009f78
> > [  227.738132] [] (dev_gro_receive) from [] (napi_gro_receive+0x78/0x1f4)
> > [  227.746410]  r10:ef085000 r9:00000001 r8:c10d15ec r7:c100792c r6:ef085710 r5:c10c744e
> > [  227.754249]  r4:ee1b20c0
> > [  227.756801] [] (napi_gro_receive) from [] (fec_enet_rx_napi+0x39c/0x988)
> > [  227.765253]  r9:00000001 r8:f0c8a960 r7:00000000 r6:00000000 r5:ef086000 r4:ee1b20c0
> > [  227.773010] [] (fec_enet_rx_napi) from [] (net_rx_action+0x21c/0x474)
> > [  227.781201]  r10:ee1cdd78 r9:c0fa7b80 r8:ef7dab80 r7:0000012c r6:00000040 r5:00000001
> > [  227.789039]  r4:ef085710
> > [  227.791593] [] (net_rx_action) from [] (__do_softirq+0x158/0x534)
> > [  227.799437]  r10:00000008 r9:ee1cc000 r8:c10ce568 r7:c100792c r6:c10247bd r5:00000003
> > [  227.807275]  r4:c100208c
> > [  227.809824] [] (__do_softirq) from [] (irq_exit+0xec/0x168)
> > [  227.817147]  r10:c1007ea0 r9:ef010400 r8:00000001 r7:00000000 r6:c1007d3c r5:00000000
> > [  227.824984]  r4:c0fa534c
> > [  227.827534] [] (irq_exit) from [] (__handle_domain_irq+0x74/0xe8)
> > [  227.835377] [] (__handle_domain_irq) from [] (gic_handle_irq+0x58/0xbc)
> > [  227.843742]  r9:f080b100 r8:c105ae80 r7:ee1cde80 r6:000003ff r5:000003eb r4:f080b10c
> > [  227.851498] [] (gic_handle_irq) from [] (__irq_svc+0x70/0x98)
> > [  227.858990] Exception stack(0xee1cde80 to 0xee1cdec8)
> > [  227.864056] de80: ee7a1140 00000001 00000000 000012a9 ee7a1140 ee9d9f10 ee76edc0 ee9d9f60
> > [  227.872248] dea0: 00000000 ee9d9f10 00000010 ee1cdeec ee1cdeb8 ee1cded0 c038a77c c0389688
> > [  227.880434] dec0: 60070013 ffffffff
> > [  227.883937]  r10:00000010 r9:ee1cc000 r8:00000000 r7:ee1cdeb4 r6:ffffffff r5:60070013
> > [  227.891775]  r4:c0389688
> > [  227.894327] [] (nfs_file_clear_open_context) from [] (nfs_file_release+0x54/0x60)
> > [  227.903558]  r7:ee9a78a0 r6:ee68f010 r5:ee9d9f10 r4:ee76edc0
> > [  227.909235] [] (nfs_file_release) from [] (__fput+0x94/0x1e0)
> > [  227.916734] [] (__fput) from [] (____fput+0x10/0x14)
> > [  227.923448]  r10:c10d4298 r9:00000000 r8:00000000 r7:ef2ed780 r6:ef2edc00 r5:c10d5180
> > [  227.931286]  r4:ef2edbd4
> > [  227.933839] [] (____fput) from [] (task_work_run+0xc8/0xec)
> > [  227.941166] [] (task_work_run) from [] (do_work_pending+0x12c/0x1c4)
> > [  227.949271]  r9:ee1cdfb0 r8:00000000 r7:00000000 r6:ee1cc000 r5:00000000 r4:00000000
> > [  227.957029] [] (do_work_pending) from [] (slow_work_pending+0xc/0x20)
> > [  227.965219]  r10:00000000 r9:ee1cc000 r8:c0107e24 r7:0000005b r6:b6f76568 r5:b6f741f0
> > [  227.973058]  r4:b6f76904
> > 
> > Maybe the reason this reproduces easily in this particular setup is
> > that ethernet causes lots of alignment faults?
> Can you try this change?
> 
> diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
> index 3a48b54c6405..bc6ad7789568 100644
> --- a/arch/arm/kernel/signal.c
> +++ b/arch/arm/kernel/signal.c
> @@ -573,12 +573,11 @@ do_work_pending(struct pt_regs *regs, unsigned
> int thread_flags, int syscall)
>   */
>   trace_hardirqs_off();
> 
> - /* Check valid user FS if needed */
> - addr_limit_user_check();
> -
>   do {
>   if (likely(thread_flags & _TIF_NEED_RESCHED)) {
>   schedule();
> + } else if (thread_flags & _TIF_FSCHECK) {
> + addr_limit_user_check();
>   } else {
>   if (unlikely(!user_mode(regs)))
>   return 0;

This does seem to work, it no longer hangs on boot in my setup. This is
obviously only a very superficial test.

The new location of this check seems weird, it's not clear why it
should be on an else path. Perhaps it should be moved to right before
where current_thread_info()->flags is fetched again?

The issue seems like it would affect arm64 as well.

If the purpose is hardening against buggy kernel code doing bad set_fs
calls shouldn't this flag also be checked before looking at
TIF_NEED_RESCHED and calling schedule()?

--
Regards,
Leonard

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v10 2/3] arm/syscalls: Check address limit on user-mode return
  2017-07-18 17:18           ` Leonard Crestez
@ 2017-07-18 19:04             ` Thomas Garnier
       [not found]               ` <CAJcbSZFr9KJTfGfiZo2fThoDkAE-D1OFf2YtELq4P6jX8syesQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  0 siblings, 1 reply; 23+ messages in thread
From: Thomas Garnier @ 2017-07-18 19:04 UTC (permalink / raw)
  To: Leonard Crestez
  Cc: Thomas Gleixner, Ingo Molnar, H . Peter Anvin, Andy Lutomirski,
	Paolo Bonzini, Rik van Riel, Oleg Nesterov, Josh Poimboeuf,
	Petr Mladek, Miroslav Benes, Kees Cook, Al Viro, Arnd Bergmann,
	Dave Hansen, David Howells, Russell King, Andy Lutomirski,
	Will Drewry, Will Deacon, Catalin Marinas, Mark Rutland

On Tue, Jul 18, 2017 at 10:18 AM, Leonard Crestez
<leonard.crestez@nxp.com> wrote:
>
> On Tue, 2017-07-18 at 09:04 -0700, Thomas Garnier wrote:
> > On Tue, Jul 18, 2017 at 7:36 AM, Leonard Crestez <leonard.crestez@nxp.com> wrote:
> > >
> > > On Wed, 2017-06-14 at 18:12 -0700, Thomas Garnier wrote:
> > > >
> > > > Ensure the address limit is a user-mode segment before returning to
> > > > user-mode. Otherwise a process can corrupt kernel-mode memory and
> > > > elevate privileges [1].
> > > >
> > > > The set_fs function sets the TIF_SETFS flag to force a slow path on
> > > > return. In the slow path, the address limit is checked to be USER_DS if
> > > > needed.
> > > >
> > > > The TIF_SETFS flag is added to _TIF_WORK_MASK shifting _TIF_SYSCALL_WORK
> > > > for arm instruction immediate support. The global work mask is too big
> > > > to used on a single instruction so adapt ret_fast_syscall.
> > > >
> > > > @@ -571,6 +572,10 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
> > > >        * Update the trace code with the current status.
> > > >        */
> > > >       trace_hardirqs_off();
> > > > +
> > > > +     /* Check valid user FS if needed */
> > > > +     addr_limit_user_check();
> > > > +
> > > >       do {
> > > >               if (likely(thread_flags & _TIF_NEED_RESCHED)) {
> > > >                       schedule();
> > > This patch made it's way into linux-next next-20170717 and it seems to
> > > cause hangs when booting some boards over NFS (found via bisection). I
> > > don't know exactly what determines the issue but I can reproduce hangs
> > > if even if I just boot with init=/bin/bash and do stuff like
> > >
> > > # sleep 1 & sleep 1 & sleep 1 & wait; wait; wait; echo done!
> > >
> > > When this happens sysrq-t shows a sleep task hung in the 'R' state
> > > spinning in do_work_pending, so maybe there is a potential infinite
> > > loop here?
> > >
> > > The addr_limit_user_check at the start of do_work_pending will check
> > > for TIF_FSCHECK once and clear it but the function loops while
> > > (thread_flags & _TIF_WORK_MASK), so it if TIF_FSCHECK is set again then
> > > the loop will never terminate. Does this make sense?
>
> > Yes, it does. Thanks for looking into this.
>
> > > I added some instrumentation to check if TIF_FSCHECK can show up during
> > > the do_work_pending loop and the answer seems to be yes. I also tried
> > > to get a stack with a set_fs call from inside do_work_pending and got
> > > the following:
> > >
> > > [  227.582402] CPU: 0 PID: 829 Comm: sleep Not tainted 4.12.0-01057-g93af8f7-dirty #332
> > > [  227.590171] Hardware name: Freescale i.MX6 SoloLite (Device Tree)
> > > [  227.596275] Backtrace:
> > > [  227.598754] [] (dump_backtrace) from [] (show_stack+0x18/0x1c)
> > > [  227.606339]  r7:00000000 r6:60070113 r5:00000000 r4:c105a958
> > > [  227.612016] [] (show_stack) from [] (dump_stack+0xb4/0xe8)
> > > [  227.619258] [] (dump_stack) from [] (mydbg_set_fs+0x40/0x48)
> > > [  227.626671]  r9:c08cf35c r8:ee1cda7c r7:ee1e3dce r6:bf000000 r5:00000000 r4:ffffe000
> > > [  227.634433] [] (mydbg_set_fs) from [] (__probe_kernel_read+0x44/0xd0)
> > > [  227.642629] [] (__probe_kernel_read) from [] (do_alignment+0x8c/0x75c)
> > > [  227.650909]  r10:ef085000 r9:c08cf35c r8:00000001 r7:ee1e3dce r6:c011b84c r5:ee1cdbe0
> > > [  227.658748]  r4:00000000 r3:00000000
> > > [  227.662338] [] (do_alignment) from [] (do_DataAbort+0x40/0xc0)
> > > [  227.669921]  r10:ef085000 r9:ee1cc000 r8:ee1cdbe0 r7:ee1e3dce r6:c011b84c r5:00000001
> > > [  227.677760]  r4:c100dd3c
> > > [  227.680308] [] (do_DataAbort) from [] (__dabt_svc+0x64/0xa0)
> > > [  227.687714] Exception stack(0xee1cdbe0 to 0xee1cdc28)
> > > [  227.692780] dbe0: 9064a8c0 ee1e3de2 d82727d8 00000000 ee1b20c0 ee1e3dce 00000000 ef08572c
> > > [  227.700971] dc00: c0bb2034 c10c75ea ef085000 ee1cdc74 ee1cdc00 ee1cdc30 c01761a8 c08cf35c
> > > [  227.709158] dc20: 40070113 ffffffff
> > > [  227.712661]  r8:c0bb2034 r7:ee1cdc14 r6:ffffffff r5:40070113 r4:c08cf35c
> > > [  227.719382] [] (inet_gro_receive) from [] (dev_gro_receive+0x2f0/0x618)
> > > [  227.727746]  r10:ef085000 r9:00000001 r8:00000000 r7:ef085710 r6:c1008b88 r5:ee1b20c0
> > > [  227.735585]  r4:c1009f78
> > > [  227.738132] [] (dev_gro_receive) from [] (napi_gro_receive+0x78/0x1f4)
> > > [  227.746410]  r10:ef085000 r9:00000001 r8:c10d15ec r7:c100792c r6:ef085710 r5:c10c744e
> > > [  227.754249]  r4:ee1b20c0
> > > [  227.756801] [] (napi_gro_receive) from [] (fec_enet_rx_napi+0x39c/0x988)
> > > [  227.765253]  r9:00000001 r8:f0c8a960 r7:00000000 r6:00000000 r5:ef086000 r4:ee1b20c0
> > > [  227.773010] [] (fec_enet_rx_napi) from [] (net_rx_action+0x21c/0x474)
> > > [  227.781201]  r10:ee1cdd78 r9:c0fa7b80 r8:ef7dab80 r7:0000012c r6:00000040 r5:00000001
> > > [  227.789039]  r4:ef085710
> > > [  227.791593] [] (net_rx_action) from [] (__do_softirq+0x158/0x534)
> > > [  227.799437]  r10:00000008 r9:ee1cc000 r8:c10ce568 r7:c100792c r6:c10247bd r5:00000003
> > > [  227.807275]  r4:c100208c
> > > [  227.809824] [] (__do_softirq) from [] (irq_exit+0xec/0x168)
> > > [  227.817147]  r10:c1007ea0 r9:ef010400 r8:00000001 r7:00000000 r6:c1007d3c r5:00000000
> > > [  227.824984]  r4:c0fa534c
> > > [  227.827534] [] (irq_exit) from [] (__handle_domain_irq+0x74/0xe8)
> > > [  227.835377] [] (__handle_domain_irq) from [] (gic_handle_irq+0x58/0xbc)
> > > [  227.843742]  r9:f080b100 r8:c105ae80 r7:ee1cde80 r6:000003ff r5:000003eb r4:f080b10c
> > > [  227.851498] [] (gic_handle_irq) from [] (__irq_svc+0x70/0x98)
> > > [  227.858990] Exception stack(0xee1cde80 to 0xee1cdec8)
> > > [  227.864056] de80: ee7a1140 00000001 00000000 000012a9 ee7a1140 ee9d9f10 ee76edc0 ee9d9f60
> > > [  227.872248] dea0: 00000000 ee9d9f10 00000010 ee1cdeec ee1cdeb8 ee1cded0 c038a77c c0389688
> > > [  227.880434] dec0: 60070013 ffffffff
> > > [  227.883937]  r10:00000010 r9:ee1cc000 r8:00000000 r7:ee1cdeb4 r6:ffffffff r5:60070013
> > > [  227.891775]  r4:c0389688
> > > [  227.894327] [] (nfs_file_clear_open_context) from [] (nfs_file_release+0x54/0x60)
> > > [  227.903558]  r7:ee9a78a0 r6:ee68f010 r5:ee9d9f10 r4:ee76edc0
> > > [  227.909235] [] (nfs_file_release) from [] (__fput+0x94/0x1e0)
> > > [  227.916734] [] (__fput) from [] (____fput+0x10/0x14)
> > > [  227.923448]  r10:c10d4298 r9:00000000 r8:00000000 r7:ef2ed780 r6:ef2edc00 r5:c10d5180
> > > [  227.931286]  r4:ef2edbd4
> > > [  227.933839] [] (____fput) from [] (task_work_run+0xc8/0xec)
> > > [  227.941166] [] (task_work_run) from [] (do_work_pending+0x12c/0x1c4)
> > > [  227.949271]  r9:ee1cdfb0 r8:00000000 r7:00000000 r6:ee1cc000 r5:00000000 r4:00000000
> > > [  227.957029] [] (do_work_pending) from [] (slow_work_pending+0xc/0x20)
> > > [  227.965219]  r10:00000000 r9:ee1cc000 r8:c0107e24 r7:0000005b r6:b6f76568 r5:b6f741f0
> > > [  227.973058]  r4:b6f76904
> > >
> > > Maybe the reason this reproduces easily in this particular setup is
> > > that ethernet causes lots of alignment faults?
> > Can you try this change?
> >
> > diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
> > index 3a48b54c6405..bc6ad7789568 100644
> > --- a/arch/arm/kernel/signal.c
> > +++ b/arch/arm/kernel/signal.c
> > @@ -573,12 +573,11 @@ do_work_pending(struct pt_regs *regs, unsigned
> > int thread_flags, int syscall)
> >   */
> >   trace_hardirqs_off();
> >
> > - /* Check valid user FS if needed */
> > - addr_limit_user_check();
> > -
> >   do {
> >   if (likely(thread_flags & _TIF_NEED_RESCHED)) {
> >   schedule();
> > + } else if (thread_flags & _TIF_FSCHECK) {
> > + addr_limit_user_check();
> >   } else {
> >   if (unlikely(!user_mode(regs)))
> >   return 0;
>
> This does seem to work, it no longer hangs on boot in my setup. This is
> obviously only a very superficial test.
>
> The new location of this check seems weird, it's not clear why it
> should be on an else path. Perhaps it should be moved to right before
> where current_thread_info()->flags is fetched again?

I was hitting a bug when I tried that. I think that's because you
basically let the signal handler do pending work before you check the
flag, which is not a good idea.

>
> The issue seems like it would affect arm64 as well.

Yes, I will propose a fix on each architecture.

>
> If the purpose is hardening against buggy kernel code doing bad set_fs
> calls shouldn't this flag also be checked before looking at
> TIF_NEED_RESCHED and calling schedule()?

I am not sure, to be honest. I expected schedule() to only switch the
processor to another task, which would be fine given that only the current
task has a bogus fs. I will put it first in case there is an edge-case
scenario I missed.

What do you think? Let me know and I will look at changing all
architectures and testing them.

Thanks!

>
> --
> Regards,
> Leonard




-- 
Thomas

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v10 2/3] arm/syscalls: Check address limit on user-mode return
       [not found]               ` <CAJcbSZFr9KJTfGfiZo2fThoDkAE-D1OFf2YtELq4P6jX8syesQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2017-07-19 14:58                 ` Leonard Crestez
       [not found]                   ` <1500476300.22834.13.camel-3arQi8VN3Tc@public.gmane.org>
  0 siblings, 1 reply; 23+ messages in thread
From: Leonard Crestez @ 2017-07-19 14:58 UTC (permalink / raw)
  To: Thomas Garnier, Thomas Gleixner, Stephen Rothwell
  Cc: Ingo Molnar, H . Peter Anvin, Andy Lutomirski, Paolo Bonzini,
	Rik van Riel, Oleg Nesterov, Josh Poimboeuf, Petr Mladek,
	Miroslav Benes, Kees Cook, Al Viro, Arnd Bergmann, Dave Hansen,
	David Howells, Russell King, Andy Lutomirski, Will Drewry,
	Will Deacon, Catalin Marinas, Mark Rutland, Pratyush Anand

On Tue, 2017-07-18 at 12:04 -0700, Thomas Garnier wrote:
> On Tue, Jul 18, 2017 at 10:18 AM, Leonard Crestez <leonard.crestez-3arQi8VN3Tc@public.gmane.org> wrote:
> > On Tue, 2017-07-18 at 09:04 -0700, Thomas Garnier wrote:
> > > On Tue, Jul 18, 2017 at 7:36 AM, Leonard Crestez <leonard.crestez-3arQi8VN3Tc@public.gmane.org> wrote:
> > > > On Wed, 2017-06-14 at 18:12 -0700, Thomas Garnier wrote:
> > > > > 
> > > > > Ensure the address limit is a user-mode segment before returning to
> > > > > user-mode. Otherwise a process can corrupt kernel-mode memory and
> > > > > elevate privileges [1].
> > > > > 
> > > > > The set_fs function sets the TIF_SETFS flag to force a slow path on
> > > > > return. In the slow path, the address limit is checked to be USER_DS if
> > > > > needed.
> > > > > 
> > > > > The TIF_SETFS flag is added to _TIF_WORK_MASK shifting _TIF_SYSCALL_WORK
> > > > > for arm instruction immediate support. The global work mask is too big
> > > > > to used on a single instruction so adapt ret_fast_syscall.
> > > > > 
> > > > > @@ -571,6 +572,10 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
> > > > >        * Update the trace code with the current status.
> > > > >        */
> > > > >       trace_hardirqs_off();
> > > > > +
> > > > > +     /* Check valid user FS if needed */
> > > > > +     addr_limit_user_check();
> > > > > +
> > > > >       do {
> > > > >               if (likely(thread_flags & _TIF_NEED_RESCHED)) {
> > > > >                       schedule();
> > > > This patch made it's way into linux-next next-20170717 and it seems to
> > > > cause hangs when booting some boards over NFS (found via bisection). I
> > > > don't know exactly what determines the issue but I can reproduce hangs
> > > > if even if I just boot with init=/bin/bash and do stuff like
> > > > 
> > > > # sleep 1 & sleep 1 & sleep 1 & wait; wait; wait; echo done!
> > > > 
> > > > When this happens sysrq-t shows a sleep task hung in the 'R' state
> > > > spinning in do_work_pending, so maybe there is a potential infinite
> > > > loop here?
> > > > 
> > > > The addr_limit_user_check at the start of do_work_pending will check
> > > > for TIF_FSCHECK once and clear it but the function loops while
> > > > (thread_flags & _TIF_WORK_MASK), so it if TIF_FSCHECK is set again then
> > > > the loop will never terminate. Does this make sense?
> > > 
> > > Yes, it does. Thanks for looking into this.
> > > 
> > > Can you try this change?
> > > 
> > > diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
> > > index 3a48b54c6405..bc6ad7789568 100644
> > > --- a/arch/arm/kernel/signal.c
> > > +++ b/arch/arm/kernel/signal.c
> > > @@ -573,12 +573,11 @@ do_work_pending(struct pt_regs *regs, unsigned
> > > int thread_flags, int syscall)
> > >   */
> > >   trace_hardirqs_off();
> > > 
> > > - /* Check valid user FS if needed */
> > > - addr_limit_user_check();
> > > -
> > >   do {
> > >   if (likely(thread_flags & _TIF_NEED_RESCHED)) {
> > >   schedule();
> > > + } else if (thread_flags & _TIF_FSCHECK) {
> > > + addr_limit_user_check();
> > >   } else {
> > >   if (unlikely(!user_mode(regs)))
> > >   return 0;
> > This does seem to work, it no longer hangs on boot in my setup. This is
> > obviously only a very superficial test.
> > 
> > The new location of this check seems weird, it's not clear why it
> > should be on an else path. Perhaps it should be moved to right before
> > where current_thread_info()->flags is fetched again?

> I was hitting bug when I tried that.I think that's because you
> basically let the signal handler do pending work before you check the
> flag, that's not a good idea.

> > If the purpose is hardening against buggy kernel code doing bad set_fs
> > calls shouldn't this flag also be checked before looking at
> > TIF_NEED_RESCHED and calling schedule()?
> I am not sure to be honest. I expected schedule to only schedule the
> processor to another task which would be fine given only the current
> task have a bogus fs. I will put it first in case there is an edge
> case scenario I missed.
> 
> What do you think? Let me know and I will look at changes all
> architectures and testing them.

I don't know and I'd rather not guess on security issues. It's better
if someone else reviews the code.

Unless there is a very quick fix maybe this series should be removed or
reverted from linux-next? A diagnosis of "system calls can sometimes
hang on return" seems serious even for linux-next. Since it happens
very rarely in most setups I can easily imagine somebody spending a lot
of time digging at this.

--
Regards,
Leonard

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v10 2/3] arm/syscalls: Check address limit on user-mode return
       [not found]                   ` <1500476300.22834.13.camel-3arQi8VN3Tc@public.gmane.org>
@ 2017-07-19 16:51                     ` Thomas Garnier
  2017-07-19 17:06                     ` Russell King - ARM Linux
  1 sibling, 0 replies; 23+ messages in thread
From: Thomas Garnier @ 2017-07-19 16:51 UTC (permalink / raw)
  To: Leonard Crestez
  Cc: Thomas Gleixner, Stephen Rothwell, Ingo Molnar, H . Peter Anvin,
	Andy Lutomirski, Paolo Bonzini, Rik van Riel, Oleg Nesterov,
	Josh Poimboeuf, Petr Mladek, Miroslav Benes, Kees Cook, Al Viro,
	Arnd Bergmann, Dave Hansen, David Howells, Russell King,
	Andy Lutomirski, Will Drewry, Will Deacon, Catalin Marinas

On Wed, Jul 19, 2017 at 7:58 AM, Leonard Crestez
<leonard.crestez-3arQi8VN3Tc@public.gmane.org> wrote:
> On Tue, 2017-07-18 at 12:04 -0700, Thomas Garnier wrote:
>> On Tue, Jul 18, 2017 at 10:18 AM, Leonard Crestez <leonard.crestez-3arQi8VN3Tc@public.gmane.org> wrote:
>> > On Tue, 2017-07-18 at 09:04 -0700, Thomas Garnier wrote:
>> > > On Tue, Jul 18, 2017 at 7:36 AM, Leonard Crestez <leonard.crestez-3arQi8VN3Tc@public.gmane.org> wrote:
>> > > > On Wed, 2017-06-14 at 18:12 -0700, Thomas Garnier wrote:
>> > > > >
>> > > > > Ensure the address limit is a user-mode segment before returning to
>> > > > > user-mode. Otherwise a process can corrupt kernel-mode memory and
>> > > > > elevate privileges [1].
>> > > > >
>> > > > > The set_fs function sets the TIF_SETFS flag to force a slow path on
>> > > > > return. In the slow path, the address limit is checked to be USER_DS if
>> > > > > needed.
>> > > > >
>> > > > > The TIF_SETFS flag is added to _TIF_WORK_MASK shifting _TIF_SYSCALL_WORK
>> > > > > for arm instruction immediate support. The global work mask is too big
>> > > > > to used on a single instruction so adapt ret_fast_syscall.
>> > > > >
>> > > > > @@ -571,6 +572,10 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
>> > > > >        * Update the trace code with the current status.
>> > > > >        */
>> > > > >       trace_hardirqs_off();
>> > > > > +
>> > > > > +     /* Check valid user FS if needed */
>> > > > > +     addr_limit_user_check();
>> > > > > +
>> > > > >       do {
>> > > > >               if (likely(thread_flags & _TIF_NEED_RESCHED)) {
>> > > > >                       schedule();
>> > > > This patch made it's way into linux-next next-20170717 and it seems to
>> > > > cause hangs when booting some boards over NFS (found via bisection). I
>> > > > don't know exactly what determines the issue but I can reproduce hangs
>> > > > if even if I just boot with init=/bin/bash and do stuff like
>> > > >
>> > > > # sleep 1 & sleep 1 & sleep 1 & wait; wait; wait; echo done!
>> > > >
>> > > > When this happens sysrq-t shows a sleep task hung in the 'R' state
>> > > > spinning in do_work_pending, so maybe there is a potential infinite
>> > > > loop here?
>> > > >
>> > > > The addr_limit_user_check at the start of do_work_pending will check
>> > > > for TIF_FSCHECK once and clear it but the function loops while
>> > > > (thread_flags & _TIF_WORK_MASK), so it if TIF_FSCHECK is set again then
>> > > > the loop will never terminate. Does this make sense?
>> > >
>> > > Yes, it does. Thanks for looking into this.
>> > >
>> > > Can you try this change?
>> > >
>> > > diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
>> > > index 3a48b54c6405..bc6ad7789568 100644
>> > > --- a/arch/arm/kernel/signal.c
>> > > +++ b/arch/arm/kernel/signal.c
>> > > @@ -573,12 +573,11 @@ do_work_pending(struct pt_regs *regs, unsigned
>> > > int thread_flags, int syscall)
>> > >   */
>> > >   trace_hardirqs_off();
>> > >
>> > > - /* Check valid user FS if needed */
>> > > - addr_limit_user_check();
>> > > -
>> > >   do {
>> > >   if (likely(thread_flags & _TIF_NEED_RESCHED)) {
>> > >   schedule();
>> > > + } else if (thread_flags & _TIF_FSCHECK) {
>> > > + addr_limit_user_check();
>> > >   } else {
>> > >   if (unlikely(!user_mode(regs)))
>> > >   return 0;
>> > This does seem to work, it no longer hangs on boot in my setup. This is
>> > obviously only a very superficial test.
>> >
>> > The new location of this check seems weird, it's not clear why it
>> > should be on an else path. Perhaps it should be moved to right before
>> > where current_thread_info()->flags is fetched again?
>
>> I was hitting bug when I tried that.I think that's because you
>> basically let the signal handler do pending work before you check the
>> flag, that's not a good idea.
>
>> > If the purpose is hardening against buggy kernel code doing bad set_fs
>> > calls shouldn't this flag also be checked before looking at
>> > TIF_NEED_RESCHED and calling schedule()?
>> I am not sure to be honest. I expected schedule to only schedule the
>> processor to another task which would be fine given only the current
>> task have a bogus fs. I will put it first in case there is an edge
>> case scenario I missed.
>>
>> What do you think? Let me know and I will look at changes all
>> architectures and testing them.
>
> I don't know and I'd rather not guess on security issues. It's better
> if someone else reviews the code.
>
> Unless there is a very quick fix maybe this series should be removed or
> reverted from linux-next? A diagnosis of "system calls can sometimes
> hang on return" seems serious even for linux-next. Since it happens
> very rarely in most setups I can easily imagine somebody spending a lot
> of time digging at this.

I will send fixes for each architecture in the meantime.

>
> --
> Regards,
> Leonard



-- 
Thomas

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH v10 2/3] arm/syscalls: Check address limit on user-mode return
       [not found]                   ` <1500476300.22834.13.camel-3arQi8VN3Tc@public.gmane.org>
  2017-07-19 16:51                     ` Thomas Garnier
@ 2017-07-19 17:06                     ` Russell King - ARM Linux
  2017-07-19 17:20                       ` [kernel-hardening] " Thomas Garnier
  1 sibling, 1 reply; 23+ messages in thread
From: Russell King - ARM Linux @ 2017-07-19 17:06 UTC (permalink / raw)
  To: Leonard Crestez
  Cc: Thomas Garnier, Thomas Gleixner, Stephen Rothwell, Ingo Molnar,
	H . Peter Anvin, Andy Lutomirski, Paolo Bonzini, Rik van Riel,
	Oleg Nesterov, Josh Poimboeuf, Petr Mladek, Miroslav Benes,
	Kees Cook, Al Viro, Arnd Bergmann, Dave Hansen, David Howells,
	Andy Lutomirski, Will Drewry, Will Deacon, Catalin Marinas

On Wed, Jul 19, 2017 at 05:58:20PM +0300, Leonard Crestez wrote:
> On Tue, 2017-07-18 at 12:04 -0700, Thomas Garnier wrote:
> > On Tue, Jul 18, 2017 at 10:18 AM, Leonard Crestez <leonard.crestez-3arQi8VN3Tc@public.gmane.org> wrote:
> > > On Tue, 2017-07-18 at 09:04 -0700, Thomas Garnier wrote:
> > > > On Tue, Jul 18, 2017 at 7:36 AM, Leonard Crestez <leonard.crestez-3arQi8VN3Tc@public.gmane.org> wrote:
> > > > > On Wed, 2017-06-14 at 18:12 -0700, Thomas Garnier wrote:
> > > > > > 
> > > > > > Ensure the address limit is a user-mode segment before returning to
> > > > > > user-mode. Otherwise a process can corrupt kernel-mode memory and
> > > > > > elevate privileges [1].
> > > > > > 
> > > > > > The set_fs function sets the TIF_SETFS flag to force a slow path on
> > > > > > return. In the slow path, the address limit is checked to be USER_DS if
> > > > > > needed.
> > > > > > 
> > > > > > The TIF_SETFS flag is added to _TIF_WORK_MASK shifting _TIF_SYSCALL_WORK
> > > > > > for arm instruction immediate support. The global work mask is too big
> > > > > > to used on a single instruction so adapt ret_fast_syscall.
> > > > > > 
> > > > > > @@ -571,6 +572,10 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
> > > > > >        * Update the trace code with the current status.
> > > > > >        */
> > > > > >       trace_hardirqs_off();
> > > > > > +
> > > > > > +     /* Check valid user FS if needed */
> > > > > > +     addr_limit_user_check();
> > > > > > +
> > > > > >       do {
> > > > > >               if (likely(thread_flags & _TIF_NEED_RESCHED)) {
> > > > > >                       schedule();
> > > > > This patch made it's way into linux-next next-20170717 and it seems to
> > > > > cause hangs when booting some boards over NFS (found via bisection). I
> > > > > don't know exactly what determines the issue but I can reproduce hangs
> > > > > if even if I just boot with init=/bin/bash and do stuff like
> > > > > 
> > > > > # sleep 1 & sleep 1 & sleep 1 & wait; wait; wait; echo done!
> > > > > 
> > > > > When this happens sysrq-t shows a sleep task hung in the 'R' state
> > > > > spinning in do_work_pending, so maybe there is a potential infinite
> > > > > loop here?
> > > > > 
> > > > > The addr_limit_user_check at the start of do_work_pending will check
> > > > > for TIF_FSCHECK once and clear it but the function loops while
> > > > > (thread_flags & _TIF_WORK_MASK), so it if TIF_FSCHECK is set again then
> > > > > the loop will never terminate. Does this make sense?
> > > > 
> > > > Yes, it does. Thanks for looking into this.
> > > > 
> > > > Can you try this change?
> > > > 
> > > > diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
> > > > index 3a48b54c6405..bc6ad7789568 100644
> > > > --- a/arch/arm/kernel/signal.c
> > > > +++ b/arch/arm/kernel/signal.c
> > > > @@ -573,12 +573,11 @@ do_work_pending(struct pt_regs *regs, unsigned
> > > > int thread_flags, int syscall)
> > > >   */
> > > >   trace_hardirqs_off();
> > > > 
> > > > - /* Check valid user FS if needed */
> > > > - addr_limit_user_check();
> > > > -
> > > >   do {
> > > >   if (likely(thread_flags & _TIF_NEED_RESCHED)) {
> > > >   schedule();
> > > > + } else if (thread_flags & _TIF_FSCHECK) {
> > > > + addr_limit_user_check();
> > > >   } else {
> > > >   if (unlikely(!user_mode(regs)))
> > > >   return 0;
> > > This does seem to work, it no longer hangs on boot in my setup. This is
> > > obviously only a very superficial test.
> > > 
> > > The new location of this check seems weird, it's not clear why it
> > > should be on an else path. Perhaps it should be moved to right before
> > > where current_thread_info()->flags is fetched again?
> 
> > I was hitting bug when I tried that.I think that's because you
> > basically let the signal handler do pending work before you check the
> > flag, that's not a good idea.
> 
> > > If the purpose is hardening against buggy kernel code doing bad set_fs
> > > calls shouldn't this flag also be checked before looking at
> > > TIF_NEED_RESCHED and calling schedule()?
> > I am not sure to be honest. I expected schedule to only schedule the
> > processor to another task which would be fine given only the current
> > task have a bogus fs. I will put it first in case there is an edge
> > case scenario I missed.
> > 
> > What do you think? Let me know and I will look at changes all
> > architectures and testing them.
> 
> I don't know and I'd rather not guess on security issues. It's better
> if someone else reviews the code.
> 
> Unless there is a very quick fix maybe this series should be removed or
> reverted from linux-next? A diagnosis of "system calls can sometimes
> hang on return" seems serious even for linux-next. Since it happens
> very rarely in most setups I can easily imagine somebody spending a lot
> of time digging at this.

Probably best to revert.  I stopped looking at these patches during
the discussion, as the discussion seemed to be mainly around other
architectures, and I thought we had ARM settled.

Looking at this patch now, there's several things I'm not happy with.

The effect of adding the new TIF flag for FSCHECK amongst the other
flags is that we end up overflowing the 8-bit constant, and have to
split the tests, meaning more instructions in the return path.  Eg:

-       tst     r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
+       tst     r1, #_TIF_SYSCALL_WORK
+       bne     fast_work_pending
+       tst     r1, #_TIF_WORK_MASK
        bne     fast_work_pending

should be written:

	tst	r1, #_TIF_SYSCALL_WORK
	tsteq	r1, #_TIF_WORK_MASK
	bne	fast_work_pending

and:

-       tst     r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
+       tst     r1, #_TIF_SYSCALL_WORK
+       bne     fast_work_pending
+       tst     r1, #_TIF_WORK_MASK

should be:

	tst	r1, #_TIF_SYSCALL_WORK
	tsteq	r1, #_TIF_WORK_MASK

There's no need for extra branches.

Now, the next issue is that I don't think this TIF-flag approach is
good for ARM - alignment faults can happen any time due to misaligned
packets in the networking code, and we really don't want to be doing
this check in a place that we can loop.

My original suggestion for ARM was to do the address limit check after
all work had been processed, with interrupts disabled (so no
possibility of this kind of loop happening.)  However, that seems to
have been replaced with this TIF approach, which is going to cause
loops - I suspect if the probes code is enabled, this will suffer
the same problem.  Remember, the various probes stuff can walk
userspace stacks, which means they'll be using set_fs().

I don't see why we've ended up with this (imho) sub-standard TIF-flag
approach, and I think it's going to be very problematical.

Can we please go back to the approach I suggested back in March for
ARM that doesn't suffer from this problem?

-- 
RMK's Patch system: http://www.armlinux.org.uk/developer/patches/
FTTC broadband for 0.8mile line: currently at 9.6Mbps down 400kbps up
according to speedtest.net.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v10 2/3] arm/syscalls: Check address limit on user-mode return
  2017-07-19 17:06                     ` Russell King - ARM Linux
@ 2017-07-19 17:20                       ` Thomas Garnier
       [not found]                         ` <CAJcbSZHi6454skNpG8ecMnq90LdUfcxy2RYZD+7og1C1PeypvQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  0 siblings, 1 reply; 23+ messages in thread
From: Thomas Garnier @ 2017-07-19 17:20 UTC (permalink / raw)
  To: Russell King - ARM Linux
  Cc: Leonard Crestez, Thomas Gleixner, Stephen Rothwell, Ingo Molnar,
	H . Peter Anvin, Andy Lutomirski, Paolo Bonzini, Rik van Riel,
	Oleg Nesterov, Josh Poimboeuf, Petr Mladek, Miroslav Benes,
	Kees Cook, Al Viro, Arnd Bergmann, Dave Hansen, David Howells,
	Andy Lutomirski, Will Drewry, Will Deacon, Catalin Marinas

On Wed, Jul 19, 2017 at 10:06 AM, Russell King - ARM Linux
<linux@armlinux.org.uk> wrote:
> On Wed, Jul 19, 2017 at 05:58:20PM +0300, Leonard Crestez wrote:
>> On Tue, 2017-07-18 at 12:04 -0700, Thomas Garnier wrote:
>> > On Tue, Jul 18, 2017 at 10:18 AM, Leonard Crestez <leonard.crestez@nxp.com> wrote:
>> > > On Tue, 2017-07-18 at 09:04 -0700, Thomas Garnier wrote:
>> > > > On Tue, Jul 18, 2017 at 7:36 AM, Leonard Crestez <leonard.crestez@nxp.com> wrote:
>> > > > > On Wed, 2017-06-14 at 18:12 -0700, Thomas Garnier wrote:
>> > > > > >
>> > > > > > Ensure the address limit is a user-mode segment before returning to
>> > > > > > user-mode. Otherwise a process can corrupt kernel-mode memory and
>> > > > > > elevate privileges [1].
>> > > > > >
>> > > > > > The set_fs function sets the TIF_SETFS flag to force a slow path on
>> > > > > > return. In the slow path, the address limit is checked to be USER_DS if
>> > > > > > needed.
>> > > > > >
>> > > > > > The TIF_SETFS flag is added to _TIF_WORK_MASK shifting _TIF_SYSCALL_WORK
>> > > > > > for arm instruction immediate support. The global work mask is too big
>> > > > > > to used on a single instruction so adapt ret_fast_syscall.
>> > > > > >
>> > > > > > @@ -571,6 +572,10 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
>> > > > > >        * Update the trace code with the current status.
>> > > > > >        */
>> > > > > >       trace_hardirqs_off();
>> > > > > > +
>> > > > > > +     /* Check valid user FS if needed */
>> > > > > > +     addr_limit_user_check();
>> > > > > > +
>> > > > > >       do {
>> > > > > >               if (likely(thread_flags & _TIF_NEED_RESCHED)) {
>> > > > > >                       schedule();
>> > > > > This patch made it's way into linux-next next-20170717 and it seems to
>> > > > > cause hangs when booting some boards over NFS (found via bisection). I
>> > > > > don't know exactly what determines the issue but I can reproduce hangs
>> > > > > if even if I just boot with init=/bin/bash and do stuff like
>> > > > >
>> > > > > # sleep 1 & sleep 1 & sleep 1 & wait; wait; wait; echo done!
>> > > > >
>> > > > > When this happens sysrq-t shows a sleep task hung in the 'R' state
>> > > > > spinning in do_work_pending, so maybe there is a potential infinite
>> > > > > loop here?
>> > > > >
>> > > > > The addr_limit_user_check at the start of do_work_pending will check
>> > > > > for TIF_FSCHECK once and clear it but the function loops while
>> > > > > (thread_flags & _TIF_WORK_MASK), so it if TIF_FSCHECK is set again then
>> > > > > the loop will never terminate. Does this make sense?
>> > > >
>> > > > Yes, it does. Thanks for looking into this.
>> > > >
>> > > > Can you try this change?
>> > > >
>> > > > diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
>> > > > index 3a48b54c6405..bc6ad7789568 100644
>> > > > --- a/arch/arm/kernel/signal.c
>> > > > +++ b/arch/arm/kernel/signal.c
>> > > > @@ -573,12 +573,11 @@ do_work_pending(struct pt_regs *regs, unsigned
>> > > > int thread_flags, int syscall)
>> > > >   */
>> > > >   trace_hardirqs_off();
>> > > >
>> > > > - /* Check valid user FS if needed */
>> > > > - addr_limit_user_check();
>> > > > -
>> > > >   do {
>> > > >   if (likely(thread_flags & _TIF_NEED_RESCHED)) {
>> > > >   schedule();
>> > > > + } else if (thread_flags & _TIF_FSCHECK) {
>> > > > + addr_limit_user_check();
>> > > >   } else {
>> > > >   if (unlikely(!user_mode(regs)))
>> > > >   return 0;
>> > > This does seem to work, it no longer hangs on boot in my setup. This is
>> > > obviously only a very superficial test.
>> > >
>> > > The new location of this check seems weird, it's not clear why it
>> > > should be on an else path. Perhaps it should be moved to right before
>> > > where current_thread_info()->flags is fetched again?
>>
>> > I was hitting bug when I tried that.I think that's because you
>> > basically let the signal handler do pending work before you check the
>> > flag, that's not a good idea.
>>
>> > > If the purpose is hardening against buggy kernel code doing bad set_fs
>> > > calls shouldn't this flag also be checked before looking at
>> > > TIF_NEED_RESCHED and calling schedule()?
>> > I am not sure to be honest. I expected schedule to only schedule the
>> > processor to another task which would be fine given only the current
>> > task have a bogus fs. I will put it first in case there is an edge
>> > case scenario I missed.
>> >
>> > What do you think? Let me know and I will look at changes all
>> > architectures and testing them.
>>
>> I don't know and I'd rather not guess on security issues. It's better
>> if someone else reviews the code.
>>
>> Unless there is a very quick fix maybe this series should be removed or
>> reverted from linux-next? A diagnosis of "system calls can sometimes
>> hang on return" seems serious even for linux-next. Since it happens
>> very rarely in most setups I can easily imagine somebody spending a lot
>> of time digging at this.
>
> Probably best to revert.  I stopped looking at these patches during
> the discussion, as the discussion seemed to be mainly around other
> architectures, and I thought we had ARM settled.
>
> Looking at this patch now, there's several things I'm not happy with.
>
> The effect of adding a the new TIF flag for FSCHECK amongst the other
> flags is that we end up overflowing the 8-bit constant, and have to
> split the tests, meaning more instructions in the return path.  Eg:
>
> -       tst     r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
> +       tst     r1, #_TIF_SYSCALL_WORK
> +       bne     fast_work_pending
> +       tst     r1, #_TIF_WORK_MASK
>         bne     fast_work_pending
>
> should be written:
>
>         tst     r1, #_TIF_SYSCALL_WORK
>         tsteq   r1, #_TIF_WORK_MASK
>         bne     fast_work_pending
>
> and:
>
> -       tst     r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
> +       tst     r1, #_TIF_SYSCALL_WORK
> +       bne     fast_work_pending
> +       tst     r1, #_TIF_WORK_MASK
>
> should be:
>
>         tst     r1, #_TIF_SYSCALL_WORK
>         tsteq   r1, #_TIF_WORK_MASK
>
> There's no need for extra branches.
>
> Now, the next issue is that I don't think this TIF-flag approach is
> good for ARM - alignment faults can happen any time due to misaligned
> packets in the networking code, and we really don't want to be doing
> this check in a place that we can loop.
>
> My original suggestion for ARM was to do the address limit check after
> all work had been processed, with interrupts disabled (so no
> possibility of this kind of loop happening.)  However, that seems to
> have been replaced with this TIF approach, which is going to cause
> loops - I suspect if the probes code is enabled, this will suffer
> the same problem.  Remember, the various probes stuff can walk
> userspace stacks, which means they'll be using set_fs().
>
> I don't see why we've ended up with this (imho) sub-standard TIF-flag
> approach, and I think it's going to be very problematical.
>
> Can we please go back to the approach I suggested back in March for
> ARM that doesn't suffer from this problem?

During the extensive thread discussion, Linus asked to move away from
architecture specific changes to this work flag system. I am glad to
fix the assembly, as you asked, in a separate patch.

>
> --
> RMK's Patch system: http://www.armlinux.org.uk/developer/patches/
> FTTC broadband for 0.8mile line: currently at 9.6Mbps down 400kbps up
> according to speedtest.net.



-- 
Thomas

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v10 2/3] arm/syscalls: Check address limit on user-mode return
       [not found]                         ` <CAJcbSZHi6454skNpG8ecMnq90LdUfcxy2RYZD+7og1C1PeypvQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2017-07-19 18:35                           ` Russell King - ARM Linux
  2017-07-19 18:50                             ` Thomas Garnier
  0 siblings, 1 reply; 23+ messages in thread
From: Russell King - ARM Linux @ 2017-07-19 18:35 UTC (permalink / raw)
  To: Thomas Garnier
  Cc: Mark Rutland, Kernel Hardening, Catalin Marinas, Will Deacon,
	LKML, David Howells, Dave Hansen, Octavian Purdila,
	H . Peter Anvin, Miroslav Benes, Chris Metcalf, Pratyush Anand,
	Stephen Rothwell, Leonard Crestez, the arch/x86 maintainers,
	Ingo Molnar, Petr Mladek, Rik van Riel, Kees Cook, Arnd Bergmann

On Wed, Jul 19, 2017 at 10:20:35AM -0700, Thomas Garnier wrote:
> On Wed, Jul 19, 2017 at 10:06 AM, Russell King - ARM Linux
> <linux-I+IVW8TIWO2tmTQ+vhA3Yw@public.gmane.org> wrote:
> > On Wed, Jul 19, 2017 at 05:58:20PM +0300, Leonard Crestez wrote:
> > Probably best to revert.  I stopped looking at these patches during
> > the discussion, as the discussion seemed to be mainly around other
> > architectures, and I thought we had ARM settled.
> >
> > Looking at this patch now, there's several things I'm not happy with.
> >
> > The effect of adding a the new TIF flag for FSCHECK amongst the other
> > flags is that we end up overflowing the 8-bit constant, and have to
> > split the tests, meaning more instructions in the return path.  Eg:
> >
> > -       tst     r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
> > +       tst     r1, #_TIF_SYSCALL_WORK
> > +       bne     fast_work_pending
> > +       tst     r1, #_TIF_WORK_MASK
> >         bne     fast_work_pending
> >
> > should be written:
> >
> >         tst     r1, #_TIF_SYSCALL_WORK
> >         tsteq   r1, #_TIF_WORK_MASK
> >         bne     fast_work_pending
> >
> > and:
> >
> > -       tst     r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
> > +       tst     r1, #_TIF_SYSCALL_WORK
> > +       bne     fast_work_pending
> > +       tst     r1, #_TIF_WORK_MASK
> >
> > should be:
> >
> >         tst     r1, #_TIF_SYSCALL_WORK
> >         tsteq   r1, #_TIF_WORK_MASK
> >
> > There's no need for extra branches.
> >
> > Now, the next issue is that I don't think this TIF-flag approach is
> > good for ARM - alignment faults can happen any time due to misaligned
> > packets in the networking code, and we really don't want to be doing
> > this check in a place that we can loop.
> >
> > My original suggestion for ARM was to do the address limit check after
> > all work had been processed, with interrupts disabled (so no
> > possibility of this kind of loop happening.)  However, that seems to
> > have been replaced with this TIF approach, which is going to cause
> > loops - I suspect if the probes code is enabled, this will suffer
> > the same problem.  Remember, the various probes stuff can walk
> > userspace stacks, which means they'll be using set_fs().
> >
> > I don't see why we've ended up with this (imho) sub-standard TIF-flag
> > approach, and I think it's going to be very problematical.
> >
> > Can we please go back to the approach I suggested back in March for
> > ARM that doesn't suffer from this problem?
> 
> During the extensive thread discussion, Linus asked to move away from
> architecture specific changes to this work flag system. I am glad to
> fix the assembly as you asked on a separate patch.

Well, for the record, I don't think you've got to the bottom of the
"infinite loop" potential of Linus' approach.

Eg, perf will likely trigger this same issue.  Eg, perf record -a -g
will attempt to record the callchain both in kernel space and userspace
each time a perf interrupt happens.  If the perf interrupt frequency is
sufficiently high that we have multiple interrupts during the execution
of do_work_pending() and its called functions, then that will turn this
into an infinite loop yet again.

-- 
RMK's Patch system: http://www.armlinux.org.uk/developer/patches/
FTTC broadband for 0.8mile line: currently at 9.6Mbps down 400kbps up
according to speedtest.net.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v10 2/3] arm/syscalls: Check address limit on user-mode return
  2017-07-19 18:35                           ` Russell King - ARM Linux
@ 2017-07-19 18:50                             ` Thomas Garnier
  0 siblings, 0 replies; 23+ messages in thread
From: Thomas Garnier @ 2017-07-19 18:50 UTC (permalink / raw)
  To: Russell King - ARM Linux
  Cc: Mark Rutland, Kernel Hardening, Catalin Marinas, Will Deacon,
	LKML, David Howells, Dave Hansen, Octavian Purdila,
	H . Peter Anvin, Miroslav Benes, Chris Metcalf, Pratyush Anand,
	Stephen Rothwell, Leonard Crestez, the arch/x86 maintainers,
	Ingo Molnar, Petr Mladek, Rik van Riel, Kees Cook, Arnd Bergmann

On Wed, Jul 19, 2017 at 11:35 AM, Russell King - ARM Linux
<linux@armlinux.org.uk> wrote:
> On Wed, Jul 19, 2017 at 10:20:35AM -0700, Thomas Garnier wrote:
>> On Wed, Jul 19, 2017 at 10:06 AM, Russell King - ARM Linux
>> <linux@armlinux.org.uk> wrote:
>> > On Wed, Jul 19, 2017 at 05:58:20PM +0300, Leonard Crestez wrote:
>> > Probably best to revert.  I stopped looking at these patches during
>> > the discussion, as the discussion seemed to be mainly around other
>> > architectures, and I thought we had ARM settled.
>> >
>> > Looking at this patch now, there are several things I'm not happy with.
>> >
>> > The effect of adding the new TIF flag for FSCHECK amongst the other
>> > flags is that we end up overflowing the 8-bit constant, and have to
>> > split the tests, meaning more instructions in the return path.  Eg:
>> >
>> > -       tst     r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
>> > +       tst     r1, #_TIF_SYSCALL_WORK
>> > +       bne     fast_work_pending
>> > +       tst     r1, #_TIF_WORK_MASK
>> >         bne     fast_work_pending
>> >
>> > should be written:
>> >
>> >         tst     r1, #_TIF_SYSCALL_WORK
>> >         tsteq   r1, #_TIF_WORK_MASK
>> >         bne     fast_work_pending
>> >
>> > and:
>> >
>> > -       tst     r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
>> > +       tst     r1, #_TIF_SYSCALL_WORK
>> > +       bne     fast_work_pending
>> > +       tst     r1, #_TIF_WORK_MASK
>> >
>> > should be:
>> >
>> >         tst     r1, #_TIF_SYSCALL_WORK
>> >         tsteq   r1, #_TIF_WORK_MASK
>> >
>> > There's no need for extra branches.
>> >
>> > Now, the next issue is that I don't think this TIF-flag approach is
>> > good for ARM - alignment faults can happen any time due to misaligned
>> > packets in the networking code, and we really don't want to be doing
>> > this check in a place that we can loop.
>> >
>> > My original suggestion for ARM was to do the address limit check after
>> > all work had been processed, with interrupts disabled (so no
>> > possibility of this kind of loop happening.)  However, that seems to
>> > have been replaced with this TIF approach, which is going to cause
>> > loops - I suspect if the probes code is enabled, this will suffer
>> > the same problem.  Remember, the various probes stuff can walk
>> > userspace stacks, which means they'll be using set_fs().
>> >
>> > I don't see why we've ended up with this (imho) sub-standard TIF-flag
>> > approach, and I think it's going to be very problematical.
>> >
>> > Can we please go back to the approach I suggested back in March for
>> > ARM that doesn't suffer from this problem?
>>
>> During the extensive thread discussion, Linus asked to move away from
>> architecture specific changes to this work flag system. I am glad to
>> fix the assembly as you asked on a separate patch.
>
> Well, for the record, I don't think you've got to the bottom of the
> "infinite loop" potential of Linus' approach.
>
> E.g., perf will likely trigger this same issue: perf record -a -g
> will attempt to record the callchain both in kernel space and userspace
> each time a perf interrupt happens.  If the perf interrupt frequency is
> sufficiently high that we have multiple interrupts during the execution
> of do_work_pending() and its called functions, then that will turn this
> into an infinite loop yet again.

Do you think it applies to the patch I just sent? The other approach
is to check at the entrance, ignore _TIF_FSCHECK in the loop, and clear
it on exit.

>
> --
> RMK's Patch system: http://www.armlinux.org.uk/developer/patches/
> FTTC broadband for 0.8mile line: currently at 9.6Mbps down 400kbps up
> according to speedtest.net.



-- 
Thomas

^ permalink raw reply	[flat|nested] 23+ messages in thread

end of thread, other threads:[~2017-07-19 18:50 UTC | newest]

Thread overview: 23+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-06-15  1:12 [PATCH v10 1/3] x86/syscalls: Check address limit on user-mode return Thomas Garnier
2017-06-15  1:12 ` [PATCH v10 2/3] arm/syscalls: " Thomas Garnier
     [not found]   ` <20170615011203.144108-2-thgarnie-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
2017-06-20 20:18     ` Kees Cook
     [not found]       ` <CAGXu5jLR7io8u-M8tqbYW22C+sb2a2wSYLRBqJ_dguT4x+1tsQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2017-06-20 20:31         ` Thomas Garnier
2017-06-21  9:08           ` Will Deacon
2017-07-18 14:36     ` Leonard Crestez
2017-07-18 16:04       ` Thomas Garnier
     [not found]         ` <CAJcbSZEr8HPBwH1oVaHqPzAY4MS_=yqMoqPhcauuKu3cikB3uQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2017-07-18 17:18           ` Leonard Crestez
2017-07-18 19:04             ` Thomas Garnier
     [not found]               ` <CAJcbSZFr9KJTfGfiZo2fThoDkAE-D1OFf2YtELq4P6jX8syesQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2017-07-19 14:58                 ` Leonard Crestez
     [not found]                   ` <1500476300.22834.13.camel-3arQi8VN3Tc@public.gmane.org>
2017-07-19 16:51                     ` Thomas Garnier
2017-07-19 17:06                     ` Russell King - ARM Linux
2017-07-19 17:20                       ` [kernel-hardening] " Thomas Garnier
     [not found]                         ` <CAJcbSZHi6454skNpG8ecMnq90LdUfcxy2RYZD+7og1C1PeypvQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2017-07-19 18:35                           ` Russell King - ARM Linux
2017-07-19 18:50                             ` Thomas Garnier
2017-06-15  1:12 ` [PATCH v10 3/3] arm64/syscalls: " Thomas Garnier
     [not found]   ` <20170615011203.144108-3-thgarnie-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
2017-06-21  8:16     ` Catalin Marinas
2017-06-21 13:57       ` Thomas Garnier
     [not found] ` <20170615011203.144108-1-thgarnie-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
2017-06-20 20:24   ` [PATCH v10 1/3] x86/syscalls: " Kees Cook
2017-06-28 17:52     ` Kees Cook
     [not found]       ` <CAGXu5jKrJv0y70e5JiafKGcGzWoJPZM_HruZ=Y0rM1m0J4tZAA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2017-07-06 20:38         ` Thomas Garnier
     [not found]           ` <CAJcbSZE6Og4gwhFwhy_-Jaq6GovwN3y1B6O89JmkpXHtVfDLBA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2017-07-06 20:48             ` Thomas Gleixner
2017-07-06 20:52               ` Thomas Garnier

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).