linux-api.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
@ 2017-04-28 15:32 Thomas Garnier
       [not found] ` <20170428153213.137279-1-thgarnie-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
                   ` (2 more replies)
  0 siblings, 3 replies; 89+ messages in thread
From: Thomas Garnier @ 2017-04-28 15:32 UTC (permalink / raw)
  To: Martin Schwidefsky, Heiko Carstens, Dave Hansen, Arnd Bergmann,
	Thomas Gleixner, David Howells, Thomas Garnier,
	René Nyffenegger, Andrew Morton, Paul E . McKenney,
	Ingo Molnar, Eric W . Biederman, Oleg Nesterov, Pavel Tikhomirov,
	Ingo Molnar, H . Peter Anvin, Andy Lutomirski, Paolo Bonzini,
	Rik van Riel
  Cc: linux-s390-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA, x86-DgEjT+Ai2ygdnm+yROfE0A,
	linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r,
	kernel-hardening-ZwoEplunGu1jrUoiu81ncdBPR1lH4CV8

Ensure that a syscall does not return to user-mode with a kernel address
limit. If that happens, a process can corrupt kernel-mode memory and
elevate privileges [1].

The CONFIG_ADDR_LIMIT_CHECK option disables the generic check so each
architecture can create optimized versions. This option is enabled by
default on s390 because a similar feature already exists.

[1] https://bugs.chromium.org/p/project-zero/issues/detail?id=990

Signed-off-by: Thomas Garnier <thgarnie-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
Tested-by: Kees Cook <keescook-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org>
---
Based on next-20170426
---
 arch/s390/Kconfig        |  1 +
 include/linux/syscalls.h | 27 ++++++++++++++++++++++++++-
 init/Kconfig             |  6 ++++++
 kernel/sys.c             | 13 +++++++++++++
 4 files changed, 46 insertions(+), 1 deletion(-)

diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index d25435d94b6e..3d2ec084d5fc 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -64,6 +64,7 @@ config ARCH_SUPPORTS_UPROBES
 
 config S390
 	def_bool y
+	select ADDR_LIMIT_CHECK
 	select ARCH_HAS_DEVMEM_IS_ALLOWED
 	select ARCH_HAS_ELF_RANDOMIZE
 	select ARCH_HAS_GCOV_PROFILE_ALL
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 980c3c9b06f8..e534b93ce43a 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -191,6 +191,28 @@ extern struct trace_event_functions exit_syscall_print_funcs;
 	SYSCALL_METADATA(sname, x, __VA_ARGS__)			\
 	__SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
 
+
+/*
+ * Called before coming back to user-mode. Returning to user-mode with an
+ * address limit different than USER_DS can allow to overwrite kernel memory.
+ */
+static inline void addr_limit_check_syscall(void)
+{
+	BUG_ON(!segment_eq(get_fs(), USER_DS));
+}
+
+#ifndef CONFIG_ADDR_LIMIT_CHECK
+#define ADDR_LIMIT_CHECK_PRE() \
+	bool user_caller = segment_eq(get_fs(), USER_DS)
+#define ADDR_LIMIT_CHECK_POST() \
+	if (user_caller) addr_limit_check_syscall()
+#else
+#define ADDR_LIMIT_CHECK_PRE()
+#define ADDR_LIMIT_CHECK_POST()
+asmlinkage void addr_limit_check_failed(void) __noreturn;
+#endif
+
+
 #define __PROTECT(...) asmlinkage_protect(__VA_ARGS__)
 #define __SYSCALL_DEFINEx(x, name, ...)					\
 	asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))	\
@@ -199,7 +221,10 @@ extern struct trace_event_functions exit_syscall_print_funcs;
 	asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__));	\
 	asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__))	\
 	{								\
-		long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__));	\
+		long ret;						\
+		ADDR_LIMIT_CHECK_PRE();					\
+		ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__));	\
+		ADDR_LIMIT_CHECK_POST();				\
 		__MAP(x,__SC_TEST,__VA_ARGS__);				\
 		__PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__));	\
 		return ret;						\
diff --git a/init/Kconfig b/init/Kconfig
index 42a346b0df43..599d9fe30703 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1961,6 +1961,12 @@ config PROFILING
 config TRACEPOINTS
 	bool
 
+config ADDR_LIMIT_CHECK
+	bool
+	help
+	  Disable the generic address limit check. Allow each architecture to
+	  optimize how and when the verification is done.
+
 source "arch/Kconfig"
 
 endmenu		# General setup
diff --git a/kernel/sys.c b/kernel/sys.c
index 8a94b4eabcaa..a1cbcd715d62 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2458,3 +2458,16 @@ COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info)
 	return 0;
 }
 #endif /* CONFIG_COMPAT */
+
+#ifdef CONFIG_ADDR_LIMIT_CHECK
+/*
+ * Used when an architecture specific implementation detects an invalid address
+ * limit. This function does not return.
+ */
+asmlinkage void addr_limit_check_failed(void)
+{
+	/* Try to fail on the generic address limit check */
+	addr_limit_check_syscall();
+	panic("Invalid address limit before returning to user-mode");
+}
+#endif
-- 
2.13.0.rc0.306.g87b477812d-goog

^ permalink raw reply	[flat|nested] 89+ messages in thread

* [PATCH v9 2/4] x86/syscalls: Optimize address limit check
       [not found] ` <20170428153213.137279-1-thgarnie-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
@ 2017-04-28 15:32   ` Thomas Garnier
  2017-05-05 22:18   ` [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode Thomas Garnier
  1 sibling, 0 replies; 89+ messages in thread
From: Thomas Garnier @ 2017-04-28 15:32 UTC (permalink / raw)
  To: Martin Schwidefsky, Heiko Carstens, Dave Hansen, Arnd Bergmann,
	Thomas Gleixner, David Howells, Thomas Garnier,
	René Nyffenegger, Andrew Morton, Paul E . McKenney,
	Ingo Molnar, Eric W . Biederman, Oleg Nesterov, Pavel Tikhomirov,
	Ingo Molnar, H . Peter Anvin, Andy Lutomirski, Paolo Bonzini,
	Rik van Riel
  Cc: linux-s390-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA, x86-DgEjT+Ai2ygdnm+yROfE0A,
	linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r,
	kernel-hardening-ZwoEplunGu1jrUoiu81ncdBPR1lH4CV8

Disable the generic address limit check in favor of an architecture
specific optimized implementation.

The user-mode state check is added to the prepare_exit_to_usermode
function. This function is called before any user-mode return on 32-bit
and on the 64-bit syscall slowpath. For the 64-bit syscall fast path, an
assembly address limit check redirects to the slow path if the address
limit is different.

The TASK_SIZE_MAX definition is moved to the pgtable_64_types header so
it can be used in assembly code.

Signed-off-by: Thomas Garnier <thgarnie-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
---
Based on next-20170426
---
 arch/x86/Kconfig                        |  1 +
 arch/x86/entry/common.c                 |  3 +++
 arch/x86/entry/entry_64.S               |  8 ++++++++
 arch/x86/include/asm/pgtable_64_types.h | 11 +++++++++++
 arch/x86/include/asm/processor.h        | 11 -----------
 5 files changed, 23 insertions(+), 11 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index cd18994a9555..34c04696068b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -42,6 +42,7 @@ config X86
 	#
 	select ACPI_LEGACY_TABLES_LOOKUP	if ACPI
 	select ACPI_SYSTEM_POWER_STATES_SUPPORT	if ACPI
+	select ADDR_LIMIT_CHECK
 	select ANON_INODES
 	select ARCH_CLOCKSOURCE_DATA
 	select ARCH_DISCARD_MEMBLOCK
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index cdefcfdd9e63..057d133d7b78 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -23,6 +23,7 @@
 #include <linux/user-return-notifier.h>
 #include <linux/uprobes.h>
 #include <linux/livepatch.h>
+#include <linux/syscalls.h>
 
 #include <asm/desc.h>
 #include <asm/traps.h>
@@ -183,6 +184,8 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
 	struct thread_info *ti = current_thread_info();
 	u32 cached_flags;
 
+	addr_limit_check_syscall();
+
 	if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled()))
 		local_irq_disable();
 
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 607d72c4a485..62aca6dcdaf4 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -218,6 +218,14 @@ entry_SYSCALL_64_fastpath:
 	testl	$_TIF_ALLWORK_MASK, TASK_TI_flags(%r11)
 	jnz	1f
 
+	/*
+	 * If address limit is not based on user-mode, jump to slow path for
+	 * additional security checks.
+	 */
+	movq	$TASK_SIZE_MAX, %rcx
+	cmp	%rcx, TASK_addr_limit(%r11)
+	jne	1f
+
 	LOCKDEP_SYS_EXIT
 	TRACE_IRQS_ON		/* user mode is traced as IRQs on */
 	movq	RIP(%rsp), %rcx
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 06470da156ba..78af4d43a7ce 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -104,4 +104,15 @@ typedef struct { pteval_t pte; } pte_t;
 
 #define EARLY_DYNAMIC_PAGE_TABLES	64
 
+/*
+ * User space process size. 47bits minus one guard page.  The guard
+ * page is necessary on Intel CPUs: if a SYSCALL instruction is at
+ * the highest possible canonical userspace address, then that
+ * syscall will enter the kernel with a non-canonical return
+ * address, and SYSRET will explode dangerously.  We avoid this
+ * particular problem by preventing anything from being mapped
+ * at the maximum canonical address.
+ */
+#define TASK_SIZE_MAX	((_AC(1, UL) << 47) - PAGE_SIZE)
+
 #endif /* _ASM_X86_PGTABLE_64_DEFS_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 3cada998a402..e80822582d3e 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -825,17 +825,6 @@ static inline void spin_lock_prefetch(const void *x)
 #define KSTK_ESP(task)		(task_pt_regs(task)->sp)
 
 #else
-/*
- * User space process size. 47bits minus one guard page.  The guard
- * page is necessary on Intel CPUs: if a SYSCALL instruction is at
- * the highest possible canonical userspace address, then that
- * syscall will enter the kernel with a non-canonical return
- * address, and SYSRET will explode dangerously.  We avoid this
- * particular problem by preventing anything from being mapped
- * at the maximum canonical address.
- */
-#define TASK_SIZE_MAX	((1UL << 47) - PAGE_SIZE)
-
 /* This decides where the kernel will search for a free chunk of vm
  * space during mmap's.
  */
-- 
2.13.0.rc0.306.g87b477812d-goog

^ permalink raw reply	[flat|nested] 89+ messages in thread

* [PATCH v9 3/4] arm/syscalls: Optimize address limit check
  2017-04-28 15:32 [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode Thomas Garnier
       [not found] ` <20170428153213.137279-1-thgarnie-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
@ 2017-04-28 15:32 ` Thomas Garnier
  2017-04-28 15:32 ` [PATCH v9 4/4] arm64/syscalls: " Thomas Garnier
  2 siblings, 0 replies; 89+ messages in thread
From: Thomas Garnier @ 2017-04-28 15:32 UTC (permalink / raw)
  To: Martin Schwidefsky, Heiko Carstens, Dave Hansen, Arnd Bergmann,
	Thomas Gleixner, David Howells, Thomas Garnier,
	René Nyffenegger, Andrew Morton, Paul E . McKenney,
	Ingo Molnar, Eric W . Biederman, Oleg Nesterov, Pavel Tikhomirov,
	Ingo Molnar, H . Peter Anvin, Andy Lutomirski, Paolo Bonzini,
	Rik van Riel, Kees Cook, Josh Poimboeuf, Borislav Petkov
  Cc: linux-s390, linux-kernel, linux-api, x86, linux-arm-kernel,
	kernel-hardening

Disable the generic address limit check in favor of an architecture
specific optimized implementation.

The address limit is checked on each syscall return path to user-mode
path as well as the irq user-mode return function. If the address limit
was changed, a generic handler is called to stop the kernel on an
explicit check.

Signed-off-by: Thomas Garnier <thgarnie@google.com>
---
Based on next-20170426
---
 arch/arm/Kconfig               |  1 +
 arch/arm/kernel/entry-common.S | 10 +++++++++-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 4c1a35f15838..c7322f2bb818 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1,6 +1,7 @@
 config ARM
 	bool
 	default y
+	select ADDR_LIMIT_CHECK
 	select ARCH_CLOCKSOURCE_DATA
 	select ARCH_HAS_DEBUG_VIRTUAL
 	select ARCH_HAS_DEVMEM_IS_ALLOWED
diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
index eb5cd77bf1d8..c83927498f40 100644
--- a/arch/arm/kernel/entry-common.S
+++ b/arch/arm/kernel/entry-common.S
@@ -12,6 +12,7 @@
 #include <asm/unistd.h>
 #include <asm/ftrace.h>
 #include <asm/unwind.h>
+#include <asm/memory.h>
 #ifdef CONFIG_AEABI
 #include <asm/unistd-oabi.h>
 #endif
@@ -27,7 +28,6 @@
 
 #include "entry-header.S"
 
-
 	.align	5
 #if !(IS_ENABLED(CONFIG_TRACE_IRQFLAGS) || IS_ENABLED(CONFIG_CONTEXT_TRACKING))
 /*
@@ -40,9 +40,12 @@ ret_fast_syscall:
  UNWIND(.fnstart	)
  UNWIND(.cantunwind	)
 	disable_irq_notrace			@ disable interrupts
+	ldr	r2, [tsk, #TI_ADDR_LIMIT]
 	ldr	r1, [tsk, #TI_FLAGS]		@ re-check for syscall tracing
 	tst	r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
 	bne	fast_work_pending
+	cmp	r2, #TASK_SIZE
+	blne	addr_limit_check_failed
 
 	/* perform architecture specific actions before user return */
 	arch_ret_to_user r1, lr
@@ -66,6 +69,7 @@ ret_fast_syscall:
  UNWIND(.cantunwind	)
 	str	r0, [sp, #S_R0 + S_OFF]!	@ save returned r0
 	disable_irq_notrace			@ disable interrupts
+	ldr	r2, [tsk, #TI_ADDR_LIMIT]
 	ldr	r1, [tsk, #TI_FLAGS]		@ re-check for syscall tracing
 	tst	r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
 	beq	no_work_pending
@@ -82,6 +86,7 @@ slow_work_pending:
 	mov	r2, why				@ 'syscall'
 	bl	do_work_pending
 	cmp	r0, #0
+	ldreq   r2, [tsk, #TI_ADDR_LIMIT]
 	beq	no_work_pending
 	movlt	scno, #(__NR_restart_syscall - __NR_SYSCALL_BASE)
 	ldmia	sp, {r0 - r6}			@ have to reload r0 - r6
@@ -99,9 +104,12 @@ ret_slow_syscall:
 	disable_irq_notrace			@ disable interrupts
 ENTRY(ret_to_user_from_irq)
 	ldr	r1, [tsk, #TI_FLAGS]
+	ldr	r2, [tsk, #TI_ADDR_LIMIT]
 	tst	r1, #_TIF_WORK_MASK
 	bne	slow_work_pending
 no_work_pending:
+	cmp	r2, #TASK_SIZE
+	blne	addr_limit_check_failed
 	asm_trace_hardirqs_on save = 0
 
 	/* perform architecture specific actions before user return */
-- 
2.13.0.rc0.306.g87b477812d-goog

^ permalink raw reply	[flat|nested] 89+ messages in thread

* [PATCH v9 4/4] arm64/syscalls: Optimize address limit check
  2017-04-28 15:32 [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode Thomas Garnier
       [not found] ` <20170428153213.137279-1-thgarnie-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
  2017-04-28 15:32 ` [PATCH v9 3/4] arm/syscalls: Optimize address limit check Thomas Garnier
@ 2017-04-28 15:32 ` Thomas Garnier
  2 siblings, 0 replies; 89+ messages in thread
From: Thomas Garnier @ 2017-04-28 15:32 UTC (permalink / raw)
  To: Martin Schwidefsky, Heiko Carstens, Dave Hansen, Arnd Bergmann,
	Thomas Gleixner, David Howells, Thomas Garnier,
	René Nyffenegger, Andrew Morton, Paul E . McKenney,
	Ingo Molnar, Eric W . Biederman, Oleg Nesterov, Pavel Tikhomirov,
	Ingo Molnar, H . Peter Anvin, Andy Lutomirski, Paolo Bonzini,
	Rik van Riel, Kees Cook, Josh Poimboeuf, Borislav Petkov
  Cc: linux-s390, linux-kernel, linux-api, x86, linux-arm-kernel,
	kernel-hardening

Disable the generic address limit check in favor of an architecture
specific optimized implementation.

The address limit is checked on each syscall return path to user-mode.
If it was changed, a generic handler is called to stop the kernel on an
explicit check.

Signed-off-by: Thomas Garnier <thgarnie@google.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
---
Based on next-20170426
---
 arch/arm64/Kconfig        |  1 +
 arch/arm64/kernel/entry.S | 12 ++++++++++++
 2 files changed, 13 insertions(+)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 3dcd7ec69bca..ae82b1f067da 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -6,6 +6,7 @@ config ARM64
 	select ACPI_REDUCED_HARDWARE_ONLY if ACPI
 	select ACPI_MCFG if ACPI
 	select ACPI_SPCR_TABLE if ACPI
+	select ADDR_LIMIT_CHECK
 	select ARCH_CLOCKSOURCE_DATA
 	select ARCH_HAS_DEBUG_VIRTUAL
 	select ARCH_HAS_DEVMEM_IS_ALLOWED
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 43512d4d7df2..c895c4402d32 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -744,6 +744,8 @@ ENDPROC(cpu_switch_to)
 ret_fast_syscall:
 	disable_irq				// disable interrupts
 	str	x0, [sp, #S_X0]			// returned x0
+	ldr	x2, [tsk, #TSK_TI_ADDR_LIMIT]	// check addr limit change
+	tbnz	x2, #63, addr_limit_fail
 	ldr	x1, [tsk, #TSK_TI_FLAGS]	// re-check for syscall tracing
 	and	x2, x1, #_TIF_SYSCALL_WORK
 	cbnz	x2, ret_fast_syscall_trace
@@ -771,6 +773,8 @@ work_pending:
  */
 ret_to_user:
 	disable_irq				// disable interrupts
+	ldr	x2, [tsk, #TSK_TI_ADDR_LIMIT]	// check addr limit change
+	tbnz	x2, #63, addr_limit_fail
 	ldr	x1, [tsk, #TSK_TI_FLAGS]
 	and	x2, x1, #_TIF_WORK_MASK
 	cbnz	x2, work_pending
@@ -780,6 +784,14 @@ finish_ret_to_user:
 ENDPROC(ret_to_user)
 
 /*
+ * Address limit was incorrect before returning in user-mode which could be
+ * used to elevate privileges. Call the generic handler to stop the kernel on
+ * the appropriate check. This function does not return.
+ */
+addr_limit_fail:
+	b	addr_limit_check_failed
+
+/*
  * This is how we return from a fork.
  */
 ENTRY(ret_from_fork)
-- 
2.13.0.rc0.306.g87b477812d-goog

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
       [not found] ` <20170428153213.137279-1-thgarnie-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
  2017-04-28 15:32   ` [PATCH v9 2/4] x86/syscalls: Optimize address limit check Thomas Garnier
@ 2017-05-05 22:18   ` Thomas Garnier
       [not found]     ` <CAJcbSZGQsRVg3QZ9QfLn2HBC+RP-7fUTab0bYDJ455d8y8GyNw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  1 sibling, 1 reply; 89+ messages in thread
From: Thomas Garnier @ 2017-05-05 22:18 UTC (permalink / raw)
  To: Martin Schwidefsky, Heiko Carstens, Dave Hansen, Arnd Bergmann,
	Thomas Gleixner, David Howells, Thomas Garnier,
	René Nyffenegger, Andrew Morton, Paul E . McKenney,
	Ingo Molnar, Eric W . Biederman, Oleg Nesterov, Pavel Tikhomirov,
	Ingo Molnar, H . Peter Anvin, Andy Lutomirski, Paolo Bonzini,
	Rik van Riel
  Cc: linux-s390, LKML, Linux API, the arch/x86 maintainers,
	linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r,
	Kernel Hardening

On Fri, Apr 28, 2017 at 8:32 AM, Thomas Garnier <thgarnie-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> wrote:
> Ensure that a syscall does not return to user-mode with a kernel address
> limit. If that happens, a process can corrupt kernel-mode memory and
> elevate privileges [1].
>
> The CONFIG_ADDR_LIMIT_CHECK option disables the generic check so each
> architecture can create optimized versions. This option is enabled by
> default on s390 because a similar feature already exists.
>
> [1] https://bugs.chromium.org/p/project-zero/issues/detail?id=990
>
> Signed-off-by: Thomas Garnier <thgarnie-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
> Tested-by: Kees Cook <keescook-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org>

Ingo: Do you want to take the set?

> ---
> Based on next-20170426
> ---
>  arch/s390/Kconfig        |  1 +
>  include/linux/syscalls.h | 27 ++++++++++++++++++++++++++-
>  init/Kconfig             |  6 ++++++
>  kernel/sys.c             | 13 +++++++++++++
>  4 files changed, 46 insertions(+), 1 deletion(-)
>
> diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
> index d25435d94b6e..3d2ec084d5fc 100644
> --- a/arch/s390/Kconfig
> +++ b/arch/s390/Kconfig
> @@ -64,6 +64,7 @@ config ARCH_SUPPORTS_UPROBES
>
>  config S390
>         def_bool y
> +       select ADDR_LIMIT_CHECK
>         select ARCH_HAS_DEVMEM_IS_ALLOWED
>         select ARCH_HAS_ELF_RANDOMIZE
>         select ARCH_HAS_GCOV_PROFILE_ALL
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index 980c3c9b06f8..e534b93ce43a 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -191,6 +191,28 @@ extern struct trace_event_functions exit_syscall_print_funcs;
>         SYSCALL_METADATA(sname, x, __VA_ARGS__)                 \
>         __SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
>
> +
> +/*
> + * Called before coming back to user-mode. Returning to user-mode with an
> + * address limit different than USER_DS can allow to overwrite kernel memory.
> + */
> +static inline void addr_limit_check_syscall(void)
> +{
> +       BUG_ON(!segment_eq(get_fs(), USER_DS));
> +}
> +
> +#ifndef CONFIG_ADDR_LIMIT_CHECK
> +#define ADDR_LIMIT_CHECK_PRE() \
> +       bool user_caller = segment_eq(get_fs(), USER_DS)
> +#define ADDR_LIMIT_CHECK_POST() \
> +       if (user_caller) addr_limit_check_syscall()
> +#else
> +#define ADDR_LIMIT_CHECK_PRE()
> +#define ADDR_LIMIT_CHECK_POST()
> +asmlinkage void addr_limit_check_failed(void) __noreturn;
> +#endif
> +
> +
>  #define __PROTECT(...) asmlinkage_protect(__VA_ARGS__)
>  #define __SYSCALL_DEFINEx(x, name, ...)                                        \
>         asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))       \
> @@ -199,7 +221,10 @@ extern struct trace_event_functions exit_syscall_print_funcs;
>         asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__));      \
>         asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__))       \
>         {                                                               \
> -               long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__));  \
> +               long ret;                                               \
> +               ADDR_LIMIT_CHECK_PRE();                                 \
> +               ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__));       \
> +               ADDR_LIMIT_CHECK_POST();                                \
>                 __MAP(x,__SC_TEST,__VA_ARGS__);                         \
>                 __PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__));       \
>                 return ret;                                             \
> diff --git a/init/Kconfig b/init/Kconfig
> index 42a346b0df43..599d9fe30703 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -1961,6 +1961,12 @@ config PROFILING
>  config TRACEPOINTS
>         bool
>
> +config ADDR_LIMIT_CHECK
> +       bool
> +       help
> +         Disable the generic address limit check. Allow each architecture to
> +         optimize how and when the verification is done.
> +
>  source "arch/Kconfig"
>
>  endmenu                # General setup
> diff --git a/kernel/sys.c b/kernel/sys.c
> index 8a94b4eabcaa..a1cbcd715d62 100644
> --- a/kernel/sys.c
> +++ b/kernel/sys.c
> @@ -2458,3 +2458,16 @@ COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info)
>         return 0;
>  }
>  #endif /* CONFIG_COMPAT */
> +
> +#ifdef CONFIG_ADDR_LIMIT_CHECK
> +/*
> + * Used when an architecture specific implementation detects an invalid address
> + * limit. This function does not return.
> + */
> +asmlinkage void addr_limit_check_failed(void)
> +{
> +       /* Try to fail on the generic address limit check */
> +       addr_limit_check_syscall();
> +       panic("Invalid address limit before returning to user-mode");
> +}
> +#endif
> --
> 2.13.0.rc0.306.g87b477812d-goog
>



-- 
Thomas

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
       [not found]     ` <CAJcbSZGQsRVg3QZ9QfLn2HBC+RP-7fUTab0bYDJ455d8y8GyNw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2017-05-08  7:33       ` Ingo Molnar
       [not found]         ` <20170508073352.caqe3fqf7nuxypgi-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
  2017-05-08 13:09         ` Kees Cook
  0 siblings, 2 replies; 89+ messages in thread
From: Ingo Molnar @ 2017-05-08  7:33 UTC (permalink / raw)
  To: Thomas Garnier
  Cc: Martin Schwidefsky, Heiko Carstens, Dave Hansen, Arnd Bergmann,
	Thomas Gleixner, David Howells, René Nyffenegger,
	Andrew Morton, Paul E . McKenney, Eric W . Biederman,
	Oleg Nesterov, Pavel Tikhomirov, Ingo Molnar, H . Peter Anvin,
	Andy Lutomirski, Paolo Bonzini, Rik van Riel, Kees Cook,
	Josh Poimboeuf


(added more Cc:s)

* Thomas Garnier <thgarnie-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> wrote:

> On Fri, Apr 28, 2017 at 8:32 AM, Thomas Garnier <thgarnie-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> wrote:
> > Ensure that a syscall does not return to user-mode with a kernel address
> > limit. If that happens, a process can corrupt kernel-mode memory and
> > elevate privileges [1].
> >
> > The CONFIG_ADDR_LIMIT_CHECK option disables the generic check so each
> > architecture can create optimized versions. This option is enabled by
> > default on s390 because a similar feature already exists.
> >
> > [1] https://bugs.chromium.org/p/project-zero/issues/detail?id=990
> >
> > Signed-off-by: Thomas Garnier <thgarnie-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
> > Tested-by: Kees Cook <keescook-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org>
> 
> Ingo: Do you want to take the set?

Yeah, so now I'm questioning the whole premise of the feature, sorry :-/

A big disavantage is that the "security check" will add 2-5 instructions to the 
system call fast path. Every one of them, and essentially forever. Just to handle 
a CVE that was caused by a buggy touch-screen driver helper function leaking 
KERNEL_DS and which was fixed long ago ...

And yes, I realize that there were other such bugs and that such bugs might occur 
in the future - but why not push the overhead of the security check to the kernel 
build phase? I.e. I'm wondering how well we could do static analysis during kernel 
build - would a limited mode of Sparse be good enough for that? Or we could add a 
new static checker to tools/, built from first principles and used primarily for 
extended syntactical checking.

For example I'd consider it a good practice to mandate that if a kernel function 
sets KERNEL_DS then it must restore it as well. Any function that does not do 
that, or is too complex for the static analysis to prove correctness for sure 
should be considered buggy!

Are there any common kernel APIs outside set_fs() that set KERNEL_DS 
intentionally? The overwhelming pattern ought to be:

        orig_fs = get_fs();
        set_fs(KERNEL_DS);
	...
        set_fs(orig_fs);

... and even a relatively simple static analysis tool ought to be able to see 
through that.

I'd even suggest we do it not like Sparse builds are done today, but in a more 
integrated fashion: do static analysis as part of a typical kernel defconfig build 
and not tolerate warnings but go for a 'zero warnings' policy like Linus uses for 
modconfig builds.

_That_ solution I'd feel very, very good about - it would be so much better than 
any runtime checks...

Not to mention that such an integrated static analysis facility would allow many 
other things to be checked during build time, which we couldn't possibly check 
runtime.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
       [not found]         ` <20170508073352.caqe3fqf7nuxypgi-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
@ 2017-05-08  7:52           ` Ingo Molnar
       [not found]             ` <20170508075209.7aluvpwildw325rf-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
  2017-05-08 12:46           ` Greg KH
  1 sibling, 1 reply; 89+ messages in thread
From: Ingo Molnar @ 2017-05-08  7:52 UTC (permalink / raw)
  To: Thomas Garnier
  Cc: Martin Schwidefsky, Heiko Carstens, Dave Hansen, Arnd Bergmann,
	Thomas Gleixner, David Howells, René Nyffenegger,
	Andrew Morton, Paul E . McKenney, Eric W . Biederman,
	Oleg Nesterov, Pavel Tikhomirov, Ingo Molnar, H . Peter Anvin,
	Andy Lutomirski, Paolo Bonzini, Rik van Riel, Kees Cook,
	Josh Poimboeuf


* Ingo Molnar <mingo-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org> wrote:

> ... and even a relatively simple static analysis tool ought to be able to see 
> through that.
> 
> I'd even suggest we do it not like Sparse builds are done today, but in a more 
> integrated fashion: do static analysis as part of a typical kernel defconfig 
> build and not tolerate warnings but go for a 'zero warnings' policy like Linus 
> uses for modconfig builds.
> 
> _That_ solution I'd feel very, very good about - it would be so much better than 
> any runtime checks...

So the problem I have with Sparse is that it is very spammy. For example:

	make C=1 kernel/sched/

... produces:

    kernel/sched/core.c:792:6: warning: symbol 'sched_set_stop_task' was not declared. Should it be static?
    kernel/sched/core.c:1298:5: warning: symbol 'migrate_swap' was not declared. Should it be static?
    kernel/sched/core.c:3648:35: warning: symbol 'preempt_schedule_irq' was not declared. Should it be static?
    ./include/linux/uaccess.h:166:18: warning: incorrect type in argument 1 (different modifiers)
    ./include/linux/uaccess.h:166:18:    expected void *<noident>
    ./include/linux/uaccess.h:166:18:    got void const *from
    ./include/linux/uaccess.h:166:18: warning: incorrect type in argument 1 (different modifiers)
    ./include/linux/uaccess.h:166:18:    expected void *<noident>
    ./include/linux/uaccess.h:166:18:    got void const *from
    ./include/linux/uaccess.h:166:18: warning: incorrect type in argument 1 (different modifiers)
    ./include/linux/uaccess.h:166:18:    expected void *<noident>
    ./include/linux/uaccess.h:166:18:    got void const *from
    ./include/linux/uaccess.h:166:18: warning: incorrect type in argument 1 (different modifiers)
    ./include/linux/uaccess.h:166:18:    expected void *<noident>
    ./include/linux/uaccess.h:166:18:    got void const *from
    kernel/sched/clock.c:80:19: warning: symbol 'sched_clock_running' was not declared. Should it be static?
    ./include/linux/sched/cputime.h:83:14: warning: expression using sizeof bool
    ./include/linux/sched/cputime.h:83:14: warning: expression using sizeof bool
    ./include/linux/sched/cputime.h:83:14: warning: expression using sizeof bool
    ./include/linux/sched/cputime.h:83:14: warning: expression using sizeof bool
    ./include/linux/sched/cputime.h:83:14: warning: expression using sizeof bool
    ./include/linux/sched/cputime.h:83:14: warning: expression using sizeof bool
    kernel/sched/cputime.c:335:33: warning: context imbalance in 'thread_group_cputime' - different lock contexts for basic block
    kernel/sched/fair.c:54:14: warning: symbol 'normalized_sysctl_sched_latency' was not declared. Should it be static?
    kernel/sched/fair.c:75:14: warning: symbol 'normalized_sysctl_sched_min_granularity' was not declared. Should it be static?
    kernel/sched/fair.c:98:14: warning: symbol 'normalized_sysctl_sched_wakeup_granularity' was not declared. Should it be static?
    kernel/sched/fair.c:132:14: warning: symbol 'capacity_margin' was not declared. Should it be static?
    kernel/sched/sched.h:1988:16: error: cannot dereference this type
    kernel/sched/sched.h:1988:16: error: cannot dereference this type
    kernel/sched/sched.h:1988:16: error: internal error: bad type in derived(11)
    kernel/sched/sched.h:1988:16: error: cannot dereference this type
    kernel/sched/sched.h:1988:16: error: incompatible types in comparison expression (different base types)
    kernel/sched/sched.h:1988:16: error: cannot dereference this type
    kernel/sched/fair.c:4688:35: error: marked inline, but without a definition
    kernel/sched/sched.h:1988:16: error: internal error: bad type in derived(11)
    kernel/sched/sched.h:1988:16: error: incompatible types in comparison expression (different base types)
    kernel/sched/sched.h:1988:16: error: internal error: bad type in derived(11)
    kernel/sched/sched.h:1988:16: error: incompatible types in comparison expression (different base types)
    kernel/sched/sched.h:1988:16: error: internal error: bad type in derived(11)
    kernel/sched/sched.h:1988:16: error: incompatible types in comparison expression (different base types)
    kernel/sched/sched.h:1988:16: error: internal error: bad type in derived(11)
    kernel/sched/sched.h:1988:16: error: incompatible types in comparison expression (different base types)
    kernel/sched/sched.h:1988:16: error: internal error: bad type in derived(11)
    kernel/sched/sched.h:1988:16: error: incompatible types in comparison expression (different base types)
    kernel/sched/sched.h:1988:16: error: internal error: bad type in derived(11)
    kernel/sched/sched.h:1988:16: error: incompatible types in comparison expression (different base types)
    kernel/sched/sched.h:1988:16: error: internal error: bad type in derived(11)
    kernel/sched/sched.h:1988:16: error: incompatible types in comparison expression (different base types)
    kernel/sched/sched.h:1988:16: error: internal error: bad type in derived(11)
    kernel/sched/sched.h:1988:16: error: incompatible types in comparison expression (different base types)
    kernel/sched/sched.h:1988:16: error: internal error: bad type in derived(11)
    kernel/sched/sched.h:1988:16: error: incompatible types in comparison expression (different base types)
    kernel/sched/sched.h:1988:16: error: internal error: bad type in derived(11)
    kernel/sched/sched.h:1988:16: error: incompatible types in comparison expression (different base types)
    kernel/sched/fair.c:5817:19: error: cannot dereference this type
    kernel/sched/fair.c:5817:19: error: cannot dereference this type
    kernel/sched/fair.c:5817:19: error: internal error: bad type in derived(11)
    kernel/sched/fair.c:5817:19: error: cannot dereference this type
    kernel/sched/fair.c:5817:19: error: incompatible types in comparison expression (different base types)
    kernel/sched/fair.c:5817:19: error: cannot dereference this type
    kernel/sched/sched.h:1988:16: error: internal error: bad type in derived(11)
    kernel/sched/sched.h:1988:16: error: incompatible types in comparison expression (different base types)
    kernel/sched/sched.h:1988:16: error: internal error: bad type in derived(11)
    kernel/sched/sched.h:1988:16: error: incompatible types in comparison expression (different base types)
    kernel/sched/sched.h:1988:16: error: internal error: bad type in derived(11)
    kernel/sched/sched.h:1988:16: error: incompatible types in comparison expression (different base types)
    kernel/sched/sched.h:1988:16: error: internal error: bad type in derived(11)
    kernel/sched/sched.h:1988:16: error: incompatible types in comparison expression (different base types)
    kernel/sched/sched.h:1988:16: error: internal error: bad type in derived(11)
    kernel/sched/sched.h:1988:16: error: incompatible types in comparison expression (different base types)
    kernel/sched/rt.c:635:6: warning: symbol 'sched_rt_bandwidth_account' was not declared. Should it be static?
    kernel/sched/sched.h:1988:16: error: cannot dereference this type
    kernel/sched/sched.h:1988:16: error: cannot dereference this type
    kernel/sched/sched.h:1988:16: error: internal error: bad type in derived(11)
    kernel/sched/sched.h:1988:16: error: cannot dereference this type
    kernel/sched/sched.h:1988:16: error: incompatible types in comparison expression (different base types)
    kernel/sched/sched.h:1988:16: error: cannot dereference this type
    kernel/sched/sched.h:1988:16: error: cannot dereference this type
    kernel/sched/sched.h:1988:16: error: cannot dereference this type
    kernel/sched/sched.h:1988:16: error: internal error: bad type in derived(11)
    kernel/sched/sched.h:1988:16: error: cannot dereference this type
    kernel/sched/sched.h:1988:16: error: incompatible types in comparison expression (different base types)
    kernel/sched/sched.h:1988:16: error: cannot dereference this type
    ./include/linux/sched/cputime.h:83:14: warning: expression using sizeof bool
    ./include/linux/sched/cputime.h:83:14: warning: expression using sizeof bool
    kernel/sched/topology.c:499:28: warning: incorrect type in initializer (different address spaces)
    kernel/sched/topology.c:499:28:    expected void const [noderef] <asn:3>*__vpp_verify
    kernel/sched/topology.c:499:28:    got struct sched_domain **<noident>
    kernel/sched/topology.c:534:28: warning: incorrect type in initializer (different address spaces)
    kernel/sched/topology.c:534:28:    expected void const [noderef] <asn:3>*__vpp_verify
    kernel/sched/topology.c:534:28:    got struct sched_domain **<noident>
    kernel/sched/topology.c:554:28: warning: incorrect type in initializer (different address spaces)
    kernel/sched/topology.c:554:28:    expected void const [noderef] <asn:3>*__vpp_verify
    kernel/sched/topology.c:554:28:    got struct sched_group_capacity **<noident>
    kernel/sched/topology.c:594:36: warning: incorrect type in initializer (different address spaces)
    kernel/sched/topology.c:594:36:    expected void const [noderef] <asn:3>*__vpp_verify
    kernel/sched/topology.c:594:36:    got struct sched_domain **<noident>
    kernel/sched/topology.c:601:24: warning: incorrect type in initializer (different address spaces)
    kernel/sched/topology.c:601:24:    expected void const [noderef] <asn:3>*__vpp_verify
    kernel/sched/topology.c:601:24:    got struct sched_group **<noident>
    kernel/sched/topology.c:602:31: warning: incorrect type in initializer (different address spaces)
    kernel/sched/topology.c:602:31:    expected void const [noderef] <asn:3>*__vpp_verify
    kernel/sched/topology.c:602:31:    got struct sched_group_capacity **<noident>
    kernel/sched/topology.c:1330:39: warning: incorrect type in initializer (different address spaces)
    kernel/sched/topology.c:1330:39:    expected void const [noderef] <asn:3>*__vpp_verify
    kernel/sched/topology.c:1330:39:    got struct sched_domain **<noident>
    kernel/sched/topology.c:1333:40: warning: incorrect type in initializer (different address spaces)
    kernel/sched/topology.c:1333:40:    expected void const [noderef] <asn:3>*__vpp_verify
    kernel/sched/topology.c:1333:40:    got struct sched_domain **<noident>
    kernel/sched/topology.c:1337:40: warning: incorrect type in initializer (different address spaces)
    kernel/sched/topology.c:1337:40:    expected void const [noderef] <asn:3>*__vpp_verify
    kernel/sched/topology.c:1337:40:    got struct sched_domain_shared **<noident>
    kernel/sched/topology.c:1339:40: warning: incorrect type in initializer (different address spaces)
    kernel/sched/topology.c:1339:40:    expected void const [noderef] <asn:3>*__vpp_verify
    kernel/sched/topology.c:1339:40:    got struct sched_group **<noident>
    kernel/sched/topology.c:1341:40: warning: incorrect type in initializer (different address spaces)
    kernel/sched/topology.c:1341:40:    expected void const [noderef] <asn:3>*__vpp_verify
    kernel/sched/topology.c:1341:40:    got struct sched_group_capacity **<noident>
    kernel/sched/topology.c:1343:32: warning: incorrect type in argument 1 (different address spaces)
    kernel/sched/topology.c:1343:32:    expected void [noderef] <asn:3>*__pdata
    kernel/sched/topology.c:1343:32:    got struct sched_domain **[noderef] sd
    kernel/sched/topology.c:1345:32: warning: incorrect type in argument 1 (different address spaces)
    kernel/sched/topology.c:1345:32:    expected void [noderef] <asn:3>*__pdata
    kernel/sched/topology.c:1345:32:    got struct sched_domain_shared **[noderef] sds
    kernel/sched/topology.c:1347:32: warning: incorrect type in argument 1 (different address spaces)
    kernel/sched/topology.c:1347:32:    expected void [noderef] <asn:3>*__pdata
    kernel/sched/topology.c:1347:32:    got struct sched_group **[noderef] sg
    kernel/sched/topology.c:1349:32: warning: incorrect type in argument 1 (different address spaces)
    kernel/sched/topology.c:1349:32:    expected void [noderef] <asn:3>*__pdata
    kernel/sched/topology.c:1349:32:    got struct sched_group_capacity **[noderef] sgc
    kernel/sched/topology.c:1261:25: warning: incorrect type in assignment (different address spaces)
    kernel/sched/topology.c:1261:25:    expected struct sched_domain **[noderef] sd
    kernel/sched/topology.c:1261:25:    got struct sched_domain *[noderef] <asn:3>*<noident>
    kernel/sched/topology.c:1265:26: warning: incorrect type in assignment (different address spaces)
    kernel/sched/topology.c:1265:26:    expected struct sched_domain_shared **[noderef] sds
    kernel/sched/topology.c:1265:26:    got struct sched_domain_shared *[noderef] <asn:3>*<noident>
    kernel/sched/topology.c:1269:25: warning: incorrect type in assignment (different address spaces)
    kernel/sched/topology.c:1269:25:    expected struct sched_group **[noderef] sg
    kernel/sched/topology.c:1269:25:    got struct sched_group *[noderef] <asn:3>*<noident>
    kernel/sched/topology.c:1273:26: warning: incorrect type in assignment (different address spaces)
    kernel/sched/topology.c:1273:26:    expected struct sched_group_capacity **[noderef] sgc
    kernel/sched/topology.c:1273:26:    got struct sched_group_capacity *[noderef] <asn:3>*<noident>
    kernel/sched/topology.c:1288:26: warning: incorrect type in initializer (different address spaces)
    kernel/sched/topology.c:1288:26:    expected void const [noderef] <asn:3>*__vpp_verify
    kernel/sched/topology.c:1288:26:    got struct sched_domain **<noident>
    kernel/sched/topology.c:1295:26: warning: incorrect type in initializer (different address spaces)
    kernel/sched/topology.c:1295:26:    expected void const [noderef] <asn:3>*__vpp_verify
    kernel/sched/topology.c:1295:26:    got struct sched_domain_shared **<noident>
    kernel/sched/topology.c:1304:26: warning: incorrect type in initializer (different address spaces)
    kernel/sched/topology.c:1304:26:    expected void const [noderef] <asn:3>*__vpp_verify
    kernel/sched/topology.c:1304:26:    got struct sched_group **<noident>
    kernel/sched/topology.c:1311:26: warning: incorrect type in initializer (different address spaces)
    kernel/sched/topology.c:1311:26:    expected void const [noderef] <asn:3>*__vpp_verify
    kernel/sched/topology.c:1311:26:    got struct sched_group_capacity **<noident>
    kernel/sched/topology.c:759:30: warning: incorrect type in argument 1 (different address spaces)
    kernel/sched/topology.c:759:30:    expected void [noderef] <asn:3>*__pdata
    kernel/sched/topology.c:759:30:    got struct sched_domain **[noderef] sd
    kernel/sched/topology.c:776:15: warning: incorrect type in assignment (different address spaces)
    kernel/sched/topology.c:776:15:    expected struct sched_domain **[noderef] sd
    kernel/sched/topology.c:776:15:    got struct sched_domain *[noderef] <asn:3>*<noident>
    kernel/sched/topology.c:794:9: warning: incorrect type in initializer (different address spaces)
    kernel/sched/topology.c:794:9:    expected void const [noderef] <asn:3>*__vpp_verify
    kernel/sched/topology.c:794:9:    got struct sched_domain **<noident>
    kernel/sched/topology.c:795:10: warning: incorrect type in initializer (different address spaces)
    kernel/sched/topology.c:795:10:    expected void const [noderef] <asn:3>*__vpp_verify
    kernel/sched/topology.c:795:10:    got struct sched_domain **<noident>
    kernel/sched/topology.c:797:28: warning: incorrect type in initializer (different address spaces)
    kernel/sched/topology.c:797:28:    expected void const [noderef] <asn:3>*__vpp_verify
    kernel/sched/topology.c:797:28:    got struct sched_domain_shared **<noident>
    kernel/sched/topology.c:798:18: warning: incorrect type in initializer (different address spaces)
    kernel/sched/topology.c:798:18:    expected void const [noderef] <asn:3>*__vpp_verify
    kernel/sched/topology.c:798:18:    got struct sched_domain_shared **<noident>
    kernel/sched/topology.c:800:28: warning: incorrect type in initializer (different address spaces)
    kernel/sched/topology.c:800:28:    expected void const [noderef] <asn:3>*__vpp_verify
    kernel/sched/topology.c:800:28:    got struct sched_group **<noident>
    kernel/sched/topology.c:801:18: warning: incorrect type in initializer (different address spaces)
    kernel/sched/topology.c:801:18:    expected void const [noderef] <asn:3>*__vpp_verify
    kernel/sched/topology.c:801:18:    got struct sched_group **<noident>
    kernel/sched/topology.c:803:28: warning: incorrect type in initializer (different address spaces)
    kernel/sched/topology.c:803:28:    expected void const [noderef] <asn:3>*__vpp_verify
    kernel/sched/topology.c:803:28:    got struct sched_group_capacity **<noident>
    kernel/sched/topology.c:804:18: warning: incorrect type in initializer (different address spaces)
    kernel/sched/topology.c:804:18:    expected void const [noderef] <asn:3>*__vpp_verify
    kernel/sched/topology.c:804:18:    got struct sched_group_capacity **<noident>
    kernel/sched/topology.c:848:36: warning: incorrect type in initializer (different address spaces)
    kernel/sched/topology.c:848:36:    expected void const [noderef] <asn:3>*__vpp_verify
    kernel/sched/topology.c:848:36:    got struct sched_domain **<noident>
    kernel/sched/topology.c:954:31: warning: incorrect type in initializer (different address spaces)
    kernel/sched/topology.c:954:31:    expected void const [noderef] <asn:3>*__vpp_verify
    kernel/sched/topology.c:954:31:    got struct sched_domain_shared **<noident>
    kernel/sched/topology.c:1354:21: warning: symbol 'build_sched_domain' was not declared. Should it be static?
    kernel/sched/topology.c:1409:34: warning: incorrect type in initializer (different address spaces)
    kernel/sched/topology.c:1409:34:    expected void const [noderef] <asn:3>*__vpp_verify
    kernel/sched/topology.c:1409:34:    got struct sched_domain **<noident>
    kernel/sched/topology.c:1419:28: warning: incorrect type in initializer (different address spaces)
    kernel/sched/topology.c:1419:28:    expected void const [noderef] <asn:3>*__vpp_verify
    kernel/sched/topology.c:1419:28:    got struct sched_domain **<noident>
    kernel/sched/topology.c:1436:28: warning: incorrect type in initializer (different address spaces)
    kernel/sched/topology.c:1436:28:    expected void const [noderef] <asn:3>*__vpp_verify
    kernel/sched/topology.c:1436:28:    got struct sched_domain **<noident>
    kernel/sched/topology.c:1446:23: warning: incorrect type in initializer (different address spaces)
    kernel/sched/topology.c:1446:23:    expected void const [noderef] <asn:3>*__vpp_verify
    kernel/sched/topology.c:1446:23:    got struct sched_domain **<noident>
    kernel/sched/topology.c:759:29: warning: dereference of noderef expression
    kernel/sched/topology.c:777:14: warning: dereference of noderef expression
    kernel/sched/topology.c:1262:22: warning: dereference of noderef expression
    kernel/sched/topology.c:1266:22: warning: dereference of noderef expression
    kernel/sched/topology.c:1270:22: warning: dereference of noderef expression
    kernel/sched/topology.c:1274:22: warning: dereference of noderef expression
    kernel/sched/topology.c:1329:29: warning: dereference of noderef expression
    kernel/sched/topology.c:1336:29: warning: dereference of noderef expression
    kernel/sched/topology.c:1338:29: warning: dereference of noderef expression
    kernel/sched/topology.c:1340:29: warning: dereference of noderef expression
    kernel/sched/topology.c:1343:29: warning: dereference of noderef expression
    kernel/sched/topology.c:1344:17: warning: dereference of noderef expression
    kernel/sched/topology.c:1345:29: warning: dereference of noderef expression
    kernel/sched/topology.c:1346:17: warning: dereference of noderef expression
    kernel/sched/topology.c:1347:29: warning: dereference of noderef expression
    kernel/sched/topology.c:1348:17: warning: dereference of noderef expression
    kernel/sched/topology.c:1349:29: warning: dereference of noderef expression
    kernel/sched/topology.c:1350:17: warning: dereference of noderef expression

... it's just not usable in that form for a regular maintenance flow.

So what would be more useful is to add a specific Sparse check that only checks 
KERNEL_DS, to add it as a regular (.config driven) build option and make sure the 
kernel build has zero warnings.

>From that point on we can declare that this kind of bug won't occur anymore, if 
the Sparse implementation of the check is correct.

But there's a (big) problem with that development model: Sparse is not part of the 
kernel tree and adding a feature to it while making the kernel depend on that 
brand new feature is a logistical nightmare. The overhead is quite similar to 
adding new features to a compiler - it happens at a glacial pace and is only done 
for major features really, at considerable expense. I don't think this is an 
adequate model for 'extended syntax checking' of the kernel, especially when it 
comes to correctness that has such obvious security impact.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
       [not found]         ` <20170508073352.caqe3fqf7nuxypgi-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
  2017-05-08  7:52           ` Ingo Molnar
@ 2017-05-08 12:46           ` Greg KH
       [not found]             ` <20170508124621.GA20705-U8xfFu+wG4EAvxtiuMwx3w@public.gmane.org>
  1 sibling, 1 reply; 89+ messages in thread
From: Greg KH @ 2017-05-08 12:46 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Thomas Garnier, Martin Schwidefsky, Heiko Carstens, Dave Hansen,
	Arnd Bergmann, Thomas Gleixner, David Howells,
	René Nyffenegger, Andrew Morton, Paul E . McKenney,
	Eric W . Biederman, Oleg Nesterov, Pavel Tikhomirov, Ingo Molnar,
	H . Peter Anvin, Andy Lutomirski, Paolo Bonzini, Rik van Riel,
	Kees Cook

On Mon, May 08, 2017 at 09:33:52AM +0200, Ingo Molnar wrote:
> 
> (added more Cc:s)
> 
> * Thomas Garnier <thgarnie-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> wrote:
> 
> > On Fri, Apr 28, 2017 at 8:32 AM, Thomas Garnier <thgarnie-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> wrote:
> > > Ensure that a syscall does not return to user-mode with a kernel address
> > > limit. If that happens, a process can corrupt kernel-mode memory and
> > > elevate privileges [1].
> > >
> > > The CONFIG_ADDR_LIMIT_CHECK option disables the generic check so each
> > > architecture can create optimized versions. This option is enabled by
> > > default on s390 because a similar feature already exists.
> > >
> > > [1] https://bugs.chromium.org/p/project-zero/issues/detail?id=990
> > >
> > > Signed-off-by: Thomas Garnier <thgarnie-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
> > > Tested-by: Kees Cook <keescook-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org>
> > 
> > Ingo: Do you want to take the set?
> 
> Yeah, so now I'm questioning the whole premise of the feature, sorry :-/
> 
> A big disavantage is that the "security check" will add 2-5 instructions to the 
> system call fast path. Every one of them, and essentially forever. Just to handle 
> a CVE that was caused by a buggy touch-screen driver helper function leaking 
> KERNEL_DS and which was fixed long ago ...
> 
> And yes, I realize that there were other such bugs and that such bugs might occur 
> in the future - but why not push the overhead of the security check to the kernel 
> build phase? I.e. I'm wondering how well we could do static analysis during kernel 
> build - would a limited mode of Sparse be good enough for that? Or we could add a 
> new static checker to tools/, built from first principles and used primarily for 
> extended syntactical checking.
> 
> For example I'd consider it a good practice to mandate that if a kernel function 
> sets KERNEL_DS then it must restore it as well. Any function that does not do 
> that, or is too complex for the static analysis to prove correctness for sure 
> should be considered buggy!
> 
> Are there any common kernel APIs outside set_fs() that set KERNEL_DS 
> intentionally? The overwhelming pattern ought to be:
> 
>         orig_fs = get_fs();
>         set_fs(KERNEL_DS);
> 	...
>         set_fs(orig_fs);
> 
> ... and even a relatively simple static analysis tool ought to be able to see 
> through that.
> 
> I'd even suggest we do it not like Sparse builds are done today, but in a more 
> integrated fashion: do static analysis as part of a typical kernel defconfig build 
> and not tolerate warnings but go for a 'zero warnings' policy like Linus uses for 
> modconfig builds.
> 
> _That_ solution I'd feel very, very good about - it would be so much better than 
> any runtime checks...
> 
> Not to mention that such an integrated static analysis facility would allow many 
> other things to be checked during build time, which we couldn't possibly check 
> runtime.

What about a simple coccinelle script to test for this type of thing?
We write it once, add it to the in-kernel body of tests, and then 0-day
runs it on all trees all the time.  That should catch this type of
issue, like all of the other "bad programming bus" that the tool
currently catches.

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
  2017-05-08  7:33       ` Ingo Molnar
       [not found]         ` <20170508073352.caqe3fqf7nuxypgi-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
@ 2017-05-08 13:09         ` Kees Cook
  2017-05-08 14:02           ` Ingo Molnar
  1 sibling, 1 reply; 89+ messages in thread
From: Kees Cook @ 2017-05-08 13:09 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Thomas Garnier, Martin Schwidefsky, Heiko Carstens, Dave Hansen,
	Arnd Bergmann, Thomas Gleixner, David Howells,
	René Nyffenegger, Andrew Morton, Paul E . McKenney,
	Eric W . Biederman, Oleg Nesterov, Pavel Tikhomirov, Ingo Molnar,
	H . Peter Anvin, Andy Lutomirski, Paolo Bonzini, Rik van Riel,
	Josh Poimboeuf

On Mon, May 8, 2017 at 12:33 AM, Ingo Molnar <mingo@kernel.org> wrote:
>
> (added more Cc:s)
>
> * Thomas Garnier <thgarnie@google.com> wrote:
>
>> On Fri, Apr 28, 2017 at 8:32 AM, Thomas Garnier <thgarnie@google.com> wrote:
>> > Ensure that a syscall does not return to user-mode with a kernel address
>> > limit. If that happens, a process can corrupt kernel-mode memory and
>> > elevate privileges [1].
>> >
>> > The CONFIG_ADDR_LIMIT_CHECK option disables the generic check so each
>> > architecture can create optimized versions. This option is enabled by
>> > default on s390 because a similar feature already exists.
>> >
>> > [1] https://bugs.chromium.org/p/project-zero/issues/detail?id=990
>> >
>> > Signed-off-by: Thomas Garnier <thgarnie@google.com>
>> > Tested-by: Kees Cook <keescook@chromium.org>
>>
>> Ingo: Do you want to take the set?
>
> Yeah, so now I'm questioning the whole premise of the feature, sorry :-/
>
> A big disavantage is that the "security check" will add 2-5 instructions to the
> system call fast path. Every one of them, and essentially forever. Just to handle
> a CVE that was caused by a buggy touch-screen driver helper function leaking
> KERNEL_DS and which was fixed long ago ...
>
> And yes, I realize that there were other such bugs and that such bugs might occur
> in the future - but why not push the overhead of the security check to the kernel
> build phase? I.e. I'm wondering how well we could do static analysis during kernel
> build - would a limited mode of Sparse be good enough for that? Or we could add a
> new static checker to tools/, built from first principles and used primarily for
> extended syntactical checking.

Static analysis is just not going to cover all cases. We've had
vulnerabilities where interrupt handlers left KERNEL_DS set, for
example. If there are performance concerns, let's put this behind a
CONFIG. 2-5 instructions is not an issue for most people that want
this coverage.

> For example I'd consider it a good practice to mandate that if a kernel function
> sets KERNEL_DS then it must restore it as well. Any function that does not do
> that, or is too complex for the static analysis to prove correctness for sure
> should be considered buggy!
>
> Are there any common kernel APIs outside set_fs() that set KERNEL_DS
> intentionally? The overwhelming pattern ought to be:
>
>         orig_fs = get_fs();
>         set_fs(KERNEL_DS);
>         ...
>         set_fs(orig_fs);
>
> ... and even a relatively simple static analysis tool ought to be able to see
> through that.

This pattern was, in fact, what the interrupt handler bug escaped
from. We have to build proactive defenses, and this check has a clear
defensive advantage. It's a noble goal to improve the static analyzers
and simplify the source, but we have too much history to prove that
this just isn't enough. This instruction cost of this is extremely
small, too. Until we can eliminate set_fs(), we need to add this
check.


> I'd even suggest we do it not like Sparse builds are done today, but in a more
> integrated fashion: do static analysis as part of a typical kernel defconfig build
> and not tolerate warnings but go for a 'zero warnings' policy like Linus uses for
> modconfig builds.
>
> _That_ solution I'd feel very, very good about - it would be so much better than
> any runtime checks...

I'm not opposed to this, but there will be push-back on "making the
build slower", and it still won't catch everything. Bug-finding is
different from making a bug class just unexploitable at all. As we've
done before, it's the difference between trying to find format string
attacks vs just removing %n from the format parser.

> Not to mention that such an integrated static analysis facility would allow many
> other things to be checked during build time, which we couldn't possibly check
> runtime.

Absolutely! But it's orthogonal to proactive runtime exploit blocking.
We've got one that works and defends against an entire class of
vulnerability for very low cost. It it's truly too costly for default,
let's put it behind a CONFIG and see who wants it. (Most distros, I
suspect, will enable it, just like hardened usercopy which is much
more expensive than this.)

-Kees

-- 
Kees Cook
Pixel Security

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
  2017-05-08 13:09         ` Kees Cook
@ 2017-05-08 14:02           ` Ingo Molnar
       [not found]             ` <20170508140230.23kxf2kfeazeo4zr-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
  0 siblings, 1 reply; 89+ messages in thread
From: Ingo Molnar @ 2017-05-08 14:02 UTC (permalink / raw)
  To: Kees Cook
  Cc: Thomas Garnier, Martin Schwidefsky, Heiko Carstens, Dave Hansen,
	Arnd Bergmann, Thomas Gleixner, David Howells,
	René Nyffenegger, Andrew Morton, Paul E . McKenney,
	Eric W . Biederman, Oleg Nesterov, Pavel Tikhomirov, Ingo Molnar,
	H . Peter Anvin, Andy Lutomirski, Paolo Bonzini, Rik van Riel,
	Josh Poimboeuf


* Kees Cook <keescook@chromium.org> wrote:

> > And yes, I realize that there were other such bugs and that such bugs might 
> > occur in the future - but why not push the overhead of the security check to 
> > the kernel build phase? I.e. I'm wondering how well we could do static 
> > analysis during kernel build - would a limited mode of Sparse be good enough 
> > for that? Or we could add a new static checker to tools/, built from first 
> > principles and used primarily for extended syntactical checking.
> 
> Static analysis is just not going to cover all cases. We've had vulnerabilities 
> where interrupt handlers left KERNEL_DS set, for example. [...]

Got any commit ID of that bug - was it because a function executed by the 
interrupt handler leaked KERNEL_DS?

> [...] If there are performance concerns, let's put this behind a CONFIG. 2-5 
> instructions is not an issue for most people that want this coverage.

That doesn't really _solve_ the performance concerns, it just forces most people 
to enable it by creating a 'security or performance' false dichotomy ...

> [...] and it still won't catch everything. Bug-finding is different from making 
> a bug class just unexploitable at all. As we've done before, it's the difference 
> between trying to find format string attacks vs just removing %n from the format 
> parser.

No, it does not make it unexploitable, it could still be exploitable if the 
runtime check is buggy or if there's kernel execution outside of the regular 
system call paths - there's plenty of such hardware functionality on x86 for 
example.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
       [not found]             ` <20170508140230.23kxf2kfeazeo4zr-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
@ 2017-05-08 14:06               ` Jann Horn
       [not found]                 ` <CAG48ez0Hz=CimkPwuq903tgJkGj8gXUtiQJJb-P2zUes6bd6Hw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  2017-05-08 15:24               ` Kees Cook
  1 sibling, 1 reply; 89+ messages in thread
From: Jann Horn @ 2017-05-08 14:06 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Kees Cook, Thomas Garnier, Martin Schwidefsky, Heiko Carstens,
	Dave Hansen, Arnd Bergmann, Thomas Gleixner, David Howells,
	René Nyffenegger, Andrew Morton, Paul E . McKenney,
	Eric W . Biederman, Oleg Nesterov, Pavel Tikhomirov, Ingo Molnar,
	H . Peter Anvin, Andy Lutomirski, Paolo Bonzini, Rik van Riel

On Mon, May 8, 2017 at 4:02 PM, Ingo Molnar <mingo-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org> wrote:
>
> * Kees Cook <keescook-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org> wrote:
>
>> > And yes, I realize that there were other such bugs and that such bugs might
>> > occur in the future - but why not push the overhead of the security check to
>> > the kernel build phase? I.e. I'm wondering how well we could do static
>> > analysis during kernel build - would a limited mode of Sparse be good enough
>> > for that? Or we could add a new static checker to tools/, built from first
>> > principles and used primarily for extended syntactical checking.
>>
>> Static analysis is just not going to cover all cases. We've had vulnerabilities
>> where interrupt handlers left KERNEL_DS set, for example. [...]
>
> Got any commit ID of that bug - was it because a function executed by the
> interrupt handler leaked KERNEL_DS?

I think Kees might be talking about
https://bugs.chromium.org/p/project-zero/issues/detail?id=822, fixed in
commit e6978e4bf181fb3b5f8cb6f71b4fe30fbf1b655c. The issue was that
perf code that can run in pretty much any context called access_ok().

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
       [not found]             ` <20170508075209.7aluvpwildw325rf-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
@ 2017-05-08 15:22               ` Daniel Micay
  2017-05-08 15:26                 ` Kees Cook
  0 siblings, 1 reply; 89+ messages in thread
From: Daniel Micay @ 2017-05-08 15:22 UTC (permalink / raw)
  To: Ingo Molnar, Thomas Garnier
  Cc: Martin Schwidefsky, Heiko Carstens, Dave Hansen, Arnd Bergmann,
	Thomas Gleixner, David Howells, René Nyffenegger,
	Andrew Morton, Paul E . McKenney, Eric W . Biederman,
	Oleg Nesterov, Pavel Tikhomirov, Ingo Molnar, H . Peter Anvin,
	Andy Lutomirski, Paolo Bonzini, Rik van Riel, Kees Cook,
	Josh Poimboeuf

On Mon, 2017-05-08 at 09:52 +0200, Ingo Molnar wrote:
> 
> ... it's just not usable in that form for a regular maintenance flow.
> 
> So what would be more useful is to add a specific Sparse check that
> only checks 
> KERNEL_DS, to add it as a regular (.config driven) build option and
> make sure the 
> kernel build has zero warnings.
> 
> From that point on we can declare that this kind of bug won't occur
> anymore, if 
> the Sparse implementation of the check is correct.
> 
> But there's a (big) problem with that development model: Sparse is not
> part of the 
> kernel tree and adding a feature to it while making the kernel depend
> on that 
> brand new feature is a logistical nightmare. The overhead is quite
> similar to 
> adding new features to a compiler - it happens at a glacial pace and
> is only done 
> for major features really, at considerable expense. I don't think this
> is an 
> adequate model for 'extended syntax checking' of the kernel,
> especially when it 
> comes to correctness that has such obvious security impact.
> 
> Thanks,
> 
> 	Ingo

There's the option of using GCC plugins now that the infrastructure was
upstreamed from grsecurity. It can be used as part of the regular build
process and as long as the analysis is pretty simple it shouldn't hurt
compile time much.

The problem with doing that is I don't think there are people with much
experience with GCC contributing upstream and it's going to be more work
to develop/maintain than some kind of specialized DSL for analysis. I
think a few static analysis plugins are used as part of maintaining
grsecurity for solving issues like finding false positives for the
REFCOUNT overflow checking feature, so it's something that's already
being done in practice elsewhere.

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
       [not found]             ` <20170508140230.23kxf2kfeazeo4zr-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
  2017-05-08 14:06               ` Jann Horn
@ 2017-05-08 15:24               ` Kees Cook
       [not found]                 ` <CAGXu5jJ4iY7QZ9wRu5dmm7RHtLh_V6TQh4huWwLCYPKOr63aiA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  1 sibling, 1 reply; 89+ messages in thread
From: Kees Cook @ 2017-05-08 15:24 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Thomas Garnier, Martin Schwidefsky, Heiko Carstens, Dave Hansen,
	Arnd Bergmann, Thomas Gleixner, David Howells,
	René Nyffenegger, Andrew Morton, Paul E . McKenney,
	Eric W . Biederman, Oleg Nesterov, Pavel Tikhomirov, Ingo Molnar,
	H . Peter Anvin, Andy Lutomirski, Paolo Bonzini, Rik van Riel,
	Josh Poimboeuf

On Mon, May 8, 2017 at 7:02 AM, Ingo Molnar <mingo-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org> wrote:
>
> * Kees Cook <keescook-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org> wrote:
>
>> > And yes, I realize that there were other such bugs and that such bugs might
>> > occur in the future - but why not push the overhead of the security check to
>> > the kernel build phase? I.e. I'm wondering how well we could do static
>> > analysis during kernel build - would a limited mode of Sparse be good enough
>> > for that? Or we could add a new static checker to tools/, built from first
>> > principles and used primarily for extended syntactical checking.
>>
>> Static analysis is just not going to cover all cases. We've had vulnerabilities
>> where interrupt handlers left KERNEL_DS set, for example. [...]
>
> Got any commit ID of that bug - was it because a function executed by the
> interrupt handler leaked KERNEL_DS?

Ah, it was an exception handler, but the one I was thinking of was this:
https://lwn.net/Articles/419141/

>> [...] If there are performance concerns, let's put this behind a CONFIG. 2-5
>> instructions is not an issue for most people that want this coverage.
>
> That doesn't really _solve_ the performance concerns, it just forces most people
> to enable it by creating a 'security or performance' false dichotomy ...

That's fair, but what I'm trying to say is that many people will want
this, so rejecting it because it's 2 more instructions seems
unreasonable. We have had much more invasive changes added to the
kernel.

>> [...] and it still won't catch everything. Bug-finding is different from making
>> a bug class just unexploitable at all. As we've done before, it's the difference
>> between trying to find format string attacks vs just removing %n from the format
>> parser.
>
> No, it does not make it unexploitable, it could still be exploitable if the
> runtime check is buggy or if there's kernel execution outside of the regular
> system call paths - there's plenty of such hardware functionality on x86 for
> example.

Fine, but this is splitting hairs. This does protect a specific
situation, and it does so very cheaply. The real fix would be to
remove set_fs() entirely. :P

-Kees

-- 
Kees Cook
Pixel Security

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
  2017-05-08 15:22               ` [kernel-hardening] " Daniel Micay
@ 2017-05-08 15:26                 ` Kees Cook
  2017-05-08 19:51                   ` Thomas Garnier
       [not found]                   ` <CAGXu5jL61K0bRSEg9a_LswNyrt3K1J57REbWVcvAXT54zWwtMA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  0 siblings, 2 replies; 89+ messages in thread
From: Kees Cook @ 2017-05-08 15:26 UTC (permalink / raw)
  To: Daniel Micay
  Cc: Ingo Molnar, Thomas Garnier, Martin Schwidefsky, Heiko Carstens,
	Dave Hansen, Arnd Bergmann, Thomas Gleixner, David Howells,
	René Nyffenegger, Andrew Morton, Paul E . McKenney,
	Eric W . Biederman, Oleg Nesterov, Pavel Tikhomirov, Ingo Molnar,
	H . Peter Anvin, Andy Lutomirski, Paolo Bonzini, Rik van Riel

On Mon, May 8, 2017 at 8:22 AM, Daniel Micay <danielmicay@gmail.com> wrote:
> On Mon, 2017-05-08 at 09:52 +0200, Ingo Molnar wrote:
>>
>> ... it's just not usable in that form for a regular maintenance flow.
>>
>> So what would be more useful is to add a specific Sparse check that
>> only checks
>> KERNEL_DS, to add it as a regular (.config driven) build option and
>> make sure the
>> kernel build has zero warnings.
>>
>> From that point on we can declare that this kind of bug won't occur
>> anymore, if
>> the Sparse implementation of the check is correct.
>>
>> But there's a (big) problem with that development model: Sparse is not
>> part of the
>> kernel tree and adding a feature to it while making the kernel depend
>> on that
>> brand new feature is a logistical nightmare. The overhead is quite
>> similar to
>> adding new features to a compiler - it happens at a glacial pace and
>> is only done
>> for major features really, at considerable expense. I don't think this
>> is an
>> adequate model for 'extended syntax checking' of the kernel,
>> especially when it
>> comes to correctness that has such obvious security impact.
>>
>> Thanks,
>>
>>       Ingo
>
> There's the option of using GCC plugins now that the infrastructure was
> upstreamed from grsecurity. It can be used as part of the regular build
> process and as long as the analysis is pretty simple it shouldn't hurt
> compile time much.

Well, and that the situation may arise due to memory corruption, not
from poorly-matched set_fs() calls, which static analysis won't help
solve. We need to catch this bad kernel state because it is a very bad
state to run in.

-Kees

-- 
Kees Cook
Pixel Security

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
  2017-05-08 15:26                 ` Kees Cook
@ 2017-05-08 19:51                   ` Thomas Garnier
       [not found]                   ` <CAGXu5jL61K0bRSEg9a_LswNyrt3K1J57REbWVcvAXT54zWwtMA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  1 sibling, 0 replies; 89+ messages in thread
From: Thomas Garnier @ 2017-05-08 19:51 UTC (permalink / raw)
  To: Kees Cook
  Cc: Daniel Micay, Ingo Molnar, Martin Schwidefsky, Heiko Carstens,
	Dave Hansen, Arnd Bergmann, Thomas Gleixner, David Howells,
	René Nyffenegger, Andrew Morton, Paul E . McKenney,
	Eric W . Biederman, Oleg Nesterov, Pavel Tikhomirov, Ingo Molnar,
	H . Peter Anvin, Andy Lutomirski, Paolo Bonzini, Rik van Riel

On Mon, May 8, 2017 at 8:26 AM, Kees Cook <keescook@chromium.org> wrote:
> On Mon, May 8, 2017 at 8:22 AM, Daniel Micay <danielmicay@gmail.com> wrote:
>> On Mon, 2017-05-08 at 09:52 +0200, Ingo Molnar wrote:
>>>
>>> ... it's just not usable in that form for a regular maintenance flow.
>>>
>>> So what would be more useful is to add a specific Sparse check that
>>> only checks
>>> KERNEL_DS, to add it as a regular (.config driven) build option and
>>> make sure the
>>> kernel build has zero warnings.
>>>
>>> From that point on we can declare that this kind of bug won't occur
>>> anymore, if
>>> the Sparse implementation of the check is correct.
>>>
>>> But there's a (big) problem with that development model: Sparse is not
>>> part of the
>>> kernel tree and adding a feature to it while making the kernel depend
>>> on that
>>> brand new feature is a logistical nightmare. The overhead is quite
>>> similar to
>>> adding new features to a compiler - it happens at a glacial pace and
>>> is only done
>>> for major features really, at considerable expense. I don't think this
>>> is an
>>> adequate model for 'extended syntax checking' of the kernel,
>>> especially when it
>>> comes to correctness that has such obvious security impact.
>>>
>>> Thanks,
>>>
>>>       Ingo
>>
>> There's the option of using GCC plugins now that the infrastructure was
>> upstreamed from grsecurity. It can be used as part of the regular build
>> process and as long as the analysis is pretty simple it shouldn't hurt
>> compile time much.
>
> Well, and that the situation may arise due to memory corruption, not
> from poorly-matched set_fs() calls, which static analysis won't help
> solve. We need to catch this bad kernel state because it is a very bad
> state to run in.

Of course, I agree with Kees points on this and previous emails.

A static analysis solution is hard to scale across functions and build
time can suffer. I don't think the coverage will be good enough to
consider this change and static analysis as similar.

>
> -Kees
>
> --
> Kees Cook
> Pixel Security



-- 
Thomas

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
       [not found]                 ` <CAG48ez0Hz=CimkPwuq903tgJkGj8gXUtiQJJb-P2zUes6bd6Hw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2017-05-08 20:48                   ` Al Viro
       [not found]                     ` <20170508204858.GT29622-3bDd1+5oDREiFSDQTTA3OLVCufUGDwFn@public.gmane.org>
  0 siblings, 1 reply; 89+ messages in thread
From: Al Viro @ 2017-05-08 20:48 UTC (permalink / raw)
  To: Jann Horn
  Cc: Ingo Molnar, Kees Cook, Thomas Garnier, Martin Schwidefsky,
	Heiko Carstens, Dave Hansen, Arnd Bergmann, Thomas Gleixner,
	David Howells, René Nyffenegger, Andrew Morton,
	Paul E . McKenney, Eric W . Biederman, Oleg Nesterov,
	Pavel Tikhomirov, Ingo Molnar, H . Peter Anvin, Andy Lutomirski,
	Paolo Bonzini

On Mon, May 08, 2017 at 04:06:35PM +0200, Jann Horn wrote:

> I think Kees might be talking about
> https://bugs.chromium.org/p/project-zero/issues/detail?id=822, fixed in
> commit e6978e4bf181fb3b5f8cb6f71b4fe30fbf1b655c. The issue was that
> perf code that can run in pretty much any context called access_ok().

And that commit has *NOT* solved the problem.  perf_callchain_user()
can be called synchronously, without passing through that code.
Tracepoint shite...

That set_fs() should be done in get_perf_callchain(), just around the call of
perf_callchain_user().  Along with pagefault_disable(), actually.

BTW, that's a nice example demonstrating why doing that on the kernel
boundary is wrong.  Wider (in theory) area being "protected" => easier
to miss the ways not crossing its border.

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
       [not found]                 ` <CAGXu5jJ4iY7QZ9wRu5dmm7RHtLh_V6TQh4huWwLCYPKOr63aiA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2017-05-09  6:34                   ` Ingo Molnar
  0 siblings, 0 replies; 89+ messages in thread
From: Ingo Molnar @ 2017-05-09  6:34 UTC (permalink / raw)
  To: Kees Cook
  Cc: Thomas Garnier, Martin Schwidefsky, Heiko Carstens, Dave Hansen,
	Arnd Bergmann, Thomas Gleixner, David Howells,
	René Nyffenegger, Andrew Morton, Paul E . McKenney,
	Eric W . Biederman, Oleg Nesterov, Pavel Tikhomirov, Ingo Molnar,
	H . Peter Anvin, Andy Lutomirski, Paolo Bonzini, Rik van Riel,
	Josh Poimboeuf


* Kees Cook <keescook-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org> wrote:

> On Mon, May 8, 2017 at 7:02 AM, Ingo Molnar <mingo-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org> wrote:
> >
> > * Kees Cook <keescook-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org> wrote:
> >
> >> > And yes, I realize that there were other such bugs and that such bugs might
> >> > occur in the future - but why not push the overhead of the security check to
> >> > the kernel build phase? I.e. I'm wondering how well we could do static
> >> > analysis during kernel build - would a limited mode of Sparse be good enough
> >> > for that? Or we could add a new static checker to tools/, built from first
> >> > principles and used primarily for extended syntactical checking.
> >>
> >> Static analysis is just not going to cover all cases. We've had vulnerabilities
> >> where interrupt handlers left KERNEL_DS set, for example. [...]
> >
> > Got any commit ID of that bug - was it because a function executed by the
> > interrupt handler leaked KERNEL_DS?
> 
> Ah, it was an exception handler, but the one I was thinking of was this:
> https://lwn.net/Articles/419141/

Ok, so that's CVE-2010-4258, where an oops with KERNEL_DS set was used to escalate 
privileges, due to the kernel's oops handler not cleaning up the KERNEL_DS. The 
exploit used another bug, a crash in a network protocol handler, to execute the 
oops handler with KERNEL_DS set.

The explanation of the exploit itself points out that it's a very interesting bug 
and I agree, it's not a general kernel bug but a bug in a very narrow code path 
(oops handling) that caused this, and I don't see how that example can be turned 
into a general example: it was a bug in oops handling to let the process continue 
execution (and perform the CLEARTID operation) *and* leak the address limit at 
KERNEL_DS.

By similar argument a bug in the runtime checking of the address limit may allow 
exploits. Consider the oops path cleanup a similarly sensitive code path as the 
address limit check.

To handle this category of exploits it would be enough to add a runtime check to 
the _oops handling code itself_ (to make sure we've set addr_limit back to USER_DS 
even if we crash in a KERNEL_DS code area), not to every system call!

That check would avoid that particular historic pattern, if combined with static 
analysis that ensured that KERNEL_DS is always set/restored correctly. (Which btw. 
I believe some of the regular static scans of the kernel are already doing today.)

Furthermore, to go back to your original argument:

> Static analysis is just not going to cover all cases.

it's not even true that a runtime check will 'cover all cases': for example a 
similar bug to CVE-2010-4258 could still be exploited:

 - Note that the actual put_user() was not prevented via the runtime check - the
   runtime check would run *after* the buggy put_user() was done. The runtime 
   check warns or panics after the fact, which might (or might not) be enough to 
   prevent the exploit.

 - Also note that a slightly different form of the bug would still be exploitable, 
   even with the runtime check: for example if the task-shutdown code can be made 
   to unconditionally set KERNEL_DS, but after the put_user(), then the runtime
   check would not 'cover all cases'.

So the argument for doing this runtime check after every system call is very 
dubious.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
       [not found]             ` <20170508124621.GA20705-U8xfFu+wG4EAvxtiuMwx3w@public.gmane.org>
@ 2017-05-09  6:45               ` Ingo Molnar
       [not found]                 ` <20170509064522.anusoikaalvlux3w-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
  0 siblings, 1 reply; 89+ messages in thread
From: Ingo Molnar @ 2017-05-09  6:45 UTC (permalink / raw)
  To: Greg KH
  Cc: Thomas Garnier, Martin Schwidefsky, Heiko Carstens, Dave Hansen,
	Arnd Bergmann, Thomas Gleixner, David Howells,
	René Nyffenegger, Andrew Morton, Paul E . McKenney,
	Eric W . Biederman, Oleg Nesterov, Pavel Tikhomirov, Ingo Molnar,
	H . Peter Anvin, Andy Lutomirski, Paolo Bonzini, Rik van Riel,
	Kees Cook


* Greg KH <greg-U8xfFu+wG4EAvxtiuMwx3w@public.gmane.org> wrote:

> What about a simple coccinelle script to test for this type of thing?
> We write it once, add it to the in-kernel body of tests, and then 0-day
> runs it on all trees all the time.  That should catch this type of
> issue, like all of the other "bad programming bus" that the tool
> currently catches.

Yeah, that would work - but today most of our coccinelle scripts are still pretty 
verbose, and I think it's important to make this a different category of 
coccinelle script, which is .config driven where a loud warning yells at us.

I.e. force the 'zero warnings tolerated' model.

I also noticed that Coccinelle builds are pretty slow, so it would still make 
sense to have a performance oriented static checking facility that does not have 
the performance baggage of high level functional languages.

I.e. either integrate it into Sparse - or start a kernel integrated static 
analysis tooling project that would only follow control flow initially - which is 
what we need here I believe.

We only have ~115 code blocks in the kernel that set/restore KERNEL_DS, it would 
be a pity to add a runtime check to every system call ...

We could also add a runtime check to oops handling to make sure we don't leak 
KERNEL_DS through kernel crashes, to ease worries about CVE-2010-4258.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
       [not found]                   ` <CAGXu5jL61K0bRSEg9a_LswNyrt3K1J57REbWVcvAXT54zWwtMA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2017-05-09  6:56                     ` Ingo Molnar
       [not found]                       ` <20170509065619.wmqa6z6w3n6xpvrw-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
  2017-05-09 16:30                       ` [kernel-hardening] " Kees Cook
  0 siblings, 2 replies; 89+ messages in thread
From: Ingo Molnar @ 2017-05-09  6:56 UTC (permalink / raw)
  To: Kees Cook
  Cc: Daniel Micay, Thomas Garnier, Martin Schwidefsky, Heiko Carstens,
	Dave Hansen, Arnd Bergmann, Thomas Gleixner, David Howells,
	René Nyffenegger, Andrew Morton, Paul E . McKenney,
	Eric W . Biederman, Oleg Nesterov, Pavel Tikhomirov, Ingo Molnar,
	H . Peter Anvin, Andy Lutomirski, Paolo Bonzini, Rik van Riel


* Kees Cook <keescook-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org> wrote:

> > There's the option of using GCC plugins now that the infrastructure was 
> > upstreamed from grsecurity. It can be used as part of the regular build 
> > process and as long as the analysis is pretty simple it shouldn't hurt compile 
> > time much.
> 
> Well, and that the situation may arise due to memory corruption, not from 
> poorly-matched set_fs() calls, which static analysis won't help solve. We need 
> to catch this bad kernel state because it is a very bad state to run in.

If memory corruption corrupted the task state into having addr_limit set to 
KERNEL_DS then there's already a fair chance that it's game over: it could also 
have set *uid to 0, or changed a sensitive PF_ flag, or a number of other 
things...

Furthermore, think about it: there's literally an infinite amount of corrupted 
task states that could be a security problem and that could be checked after every 
system call. Do we want to check every one of them?

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
       [not found]                 ` <20170509064522.anusoikaalvlux3w-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
@ 2017-05-09  8:56                   ` Christoph Hellwig
  2017-05-09 13:00                     ` Andy Lutomirski
  0 siblings, 1 reply; 89+ messages in thread
From: Christoph Hellwig @ 2017-05-09  8:56 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Greg KH, Thomas Garnier, Martin Schwidefsky, Heiko Carstens,
	Dave Hansen, Arnd Bergmann, Thomas Gleixner, David Howells,
	René Nyffenegger, Andrew Morton, Paul E . McKenney,
	Eric W . Biederman, Oleg Nesterov, Pavel Tikhomirov, Ingo Molnar,
	H . Peter Anvin, Andy Lutomirski, Paolo Bonzini, Rik van Riel

On Tue, May 09, 2017 at 08:45:22AM +0200, Ingo Molnar wrote:
> We only have ~115 code blocks in the kernel that set/restore KERNEL_DS, it would 
> be a pity to add a runtime check to every system call ...

I think we should simply strive to remove all of them that aren't
in core scheduler / arch code.  Basically evetyytime we do the

	oldfs = get_fs();
	set_fs(KERNEL_DS); 
	..
	set_fs(oldfs);

trick we're doing something wrong, and there should always be better
ways to archive it.  E.g. using iov_iter with a ITER_KVEC type
consistently would already remove most of them.

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
       [not found]                       ` <20170509065619.wmqa6z6w3n6xpvrw-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
@ 2017-05-09 11:10                         ` Greg KH
       [not found]                           ` <20170509111007.GA14702-U8xfFu+wG4EAvxtiuMwx3w@public.gmane.org>
  0 siblings, 1 reply; 89+ messages in thread
From: Greg KH @ 2017-05-09 11:10 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Kees Cook, Daniel Micay, Thomas Garnier, Martin Schwidefsky,
	Heiko Carstens, Dave Hansen, Arnd Bergmann, Thomas Gleixner,
	David Howells, René Nyffenegger, Andrew Morton,
	Paul E . McKenney, Eric W . Biederman, Oleg Nesterov,
	Pavel Tikhomirov, Ingo Molnar, H . Peter Anvin, Andy Lutomirski,
	Paolo Bonzini

On Tue, May 09, 2017 at 08:56:19AM +0200, Ingo Molnar wrote:
> 
> * Kees Cook <keescook-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org> wrote:
> 
> > > There's the option of using GCC plugins now that the infrastructure was 
> > > upstreamed from grsecurity. It can be used as part of the regular build 
> > > process and as long as the analysis is pretty simple it shouldn't hurt compile 
> > > time much.
> > 
> > Well, and that the situation may arise due to memory corruption, not from 
> > poorly-matched set_fs() calls, which static analysis won't help solve. We need 
> > to catch this bad kernel state because it is a very bad state to run in.
> 
> If memory corruption corrupted the task state into having addr_limit set to 
> KERNEL_DS then there's already a fair chance that it's game over: it could also 
> have set *uid to 0, or changed a sensitive PF_ flag, or a number of other 
> things...
> 
> Furthermore, think about it: there's literally an infinite amount of corrupted 
> task states that could be a security problem and that could be checked after every 
> system call. Do we want to check every one of them?

Ok, I'm all for not checking lots of stuff all the time, just to protect
from crappy drivers that.  Especially as we _can_ audit and run checks
on the source code for them in the kernel tree.

But, and here's the problem, outside of the desktop/enterprise world,
there are a ton of out-of-tree code that is crap.  The number of
security/bug fixes and kernel crashes for out-of-tree code in systems
like Android phones is just so high it's laughable.

When you have a device that is running 3.2 million lines of kernel code,
yet the diffstat of the tree compared to mainline adds 3 million lines
of code, there is bound to be a ton of issues/problems there.

So this is an entirely different thing we need to try to protect
ourselves from.  A long time ago I laughed when I saw that Microsoft had
to do lots of "hardening" of their kernel to protect themselves from
crappy drivers, as I knew we didn't have to do that because we had the
source for them and could fix the root issues.  But that has changed and
now we don't all have that option.  That code is out-of-tree because the
vendor doesn't care, and doesn't want to take any time at all to do
anything resembling a real code review[1].

So, how about options like the ones being proposed here, go behind a new
config option:
	CONFIG_PROTECT_FROM_CRAPPY_DRIVERS
that device owners can enable if they do not trust their vendor-provided
code (hint, I sure don't.)  That way the "normal" path that all of us
are used to running will be fine, but if you want to take the speed hit
to try to protect yourself, then you can do that as well.

Anyway, just an idea...

thanks,

greg k-h

[1] I am working really hard with lots of vendors to try to fix their
    broken development model, but that is going to take years to resolve
    as their device pipelines are years long, and changing their
    mindsets takes a long time...

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
  2017-05-09  8:56                   ` Christoph Hellwig
@ 2017-05-09 13:00                     ` Andy Lutomirski
  2017-05-09 13:02                       ` [kernel-hardening] " Christoph Hellwig
                                         ` (2 more replies)
  0 siblings, 3 replies; 89+ messages in thread
From: Andy Lutomirski @ 2017-05-09 13:00 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Ingo Molnar, Greg KH, Thomas Garnier, Martin Schwidefsky,
	Heiko Carstens, Dave Hansen, Arnd Bergmann, Thomas Gleixner,
	David Howells, René Nyffenegger, Andrew Morton,
	Paul E . McKenney, Eric W . Biederman, Oleg Nesterov,
	Pavel Tikhomirov, Ingo Molnar, H . Peter Anvin, Andy Lutomirski,
	Paolo Bonzini, Rik van Riel, Kees Cook, Josh Poimboeuf,
	Borislav Petkov

On Tue, May 9, 2017 at 1:56 AM, Christoph Hellwig <hch@infradead.org> wrote:
> On Tue, May 09, 2017 at 08:45:22AM +0200, Ingo Molnar wrote:
>> We only have ~115 code blocks in the kernel that set/restore KERNEL_DS, it would
>> be a pity to add a runtime check to every system call ...
>
> I think we should simply strive to remove all of them that aren't
> in core scheduler / arch code.  Basically evetyytime we do the
>
>         oldfs = get_fs();
>         set_fs(KERNEL_DS);
>         ..
>         set_fs(oldfs);
>
> trick we're doing something wrong, and there should always be better
> ways to archive it.  E.g. using iov_iter with a ITER_KVEC type
> consistently would already remove most of them.

How about trying to remove all of them?  If we could actually get rid
of all of them, we could drop the arch support, and we'd get faster,
simpler, shorter uaccess code throughout the kernel.

The ones in kernel/compat.c are generally garbage.  They should be
using compat_alloc_user_space().  Ditto for kernel/power/user.c.

flush_module_icache() is a potentially silly arch thing.  Does the
code in kernel/module.c that uses set_fs() actually work?

kernel/signal.c's set_fs() is laziness.

__probe_kernel_read() and __probe_kernel_write() use set_fs(), but
that usage only matters on sane arches* like s390x.  We should
arguably have a set_uaccess_address_space() or similar for this
purpose that's a nop on normal arches like x86.

fs/splice.c has some, ahem, interesting uses that have been the source
of nasty exploits in the past.  Converting them to use iov_iter
properly would be really, really nice.  Christoph, I don't suppose
you'd like to do that?

The others seem to mostly be fixable, but I haven't looked that closely.

Overall, I suspect that a big part of why mitigations like the one
being discussed in this thread were developed is because addr_limit
used to be on the stack, making it (along with restart_block) a really
nice target.  This is fixed now on x86, arm64, and s390x, I believe,
and other arches can easily opt in to the fix.

* I'm strongly in favor of arches that have totally separate user and
kernel address spaces.  Sadly, the most common arches don't do this.

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
  2017-05-09 13:00                     ` Andy Lutomirski
@ 2017-05-09 13:02                       ` Christoph Hellwig
  2017-05-09 16:03                         ` Christoph Hellwig
  2017-05-09 16:05                       ` Brian Gerst
       [not found]                       ` <CALCETrUh8NO2scaqEM48K70Fo2+V3=Cpyk4JurCDiCYp4nm_+g-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  2 siblings, 1 reply; 89+ messages in thread
From: Christoph Hellwig @ 2017-05-09 13:02 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Mark Rutland, Kernel Hardening, Greg KH, Heiko Carstens, LKML,
	David Howells, Dave Hansen, H . Peter Anvin, Ingo Molnar,
	Pavel Tikhomirov, Peter Zijlstra, linux-s390,
	the arch/x86 maintainers, Russell King, Will Deacon,
	Christoph Hellwig, Christian Borntraeger, René Nyffenegger,
	Catalin Marinas, Paul E . McKenney, Rik van Riel, Kees Cook

On Tue, May 09, 2017 at 06:00:01AM -0700, Andy Lutomirski wrote:
> fs/splice.c has some, ahem, interesting uses that have been the source
> of nasty exploits in the past.  Converting them to use iov_iter
> properly would be really, really nice.  Christoph, I don't suppose
> you'd like to do that?

I can take care of all the fs code including this one.

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
       [not found]                           ` <20170509111007.GA14702-U8xfFu+wG4EAvxtiuMwx3w@public.gmane.org>
@ 2017-05-09 14:29                             ` Thomas Garnier
       [not found]                               ` <CAJcbSZFswDWZoK-1UK+xkRMJ4ttSYbtH2Y5WD5_aPR-8ru6t8A-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  0 siblings, 1 reply; 89+ messages in thread
From: Thomas Garnier @ 2017-05-09 14:29 UTC (permalink / raw)
  To: Greg KH
  Cc: Ingo Molnar, Kees Cook, Daniel Micay, Martin Schwidefsky,
	Heiko Carstens, Dave Hansen, Arnd Bergmann, Thomas Gleixner,
	David Howells, René Nyffenegger, Andrew Morton,
	Paul E . McKenney, Eric W . Biederman, Oleg Nesterov,
	Pavel Tikhomirov, Ingo Molnar, H . Peter Anvin, Andy Lutomirski,
	Paolo Bonzini

On Tue, May 9, 2017 at 4:10 AM, Greg KH <greg-U8xfFu+wG4EAvxtiuMwx3w@public.gmane.org> wrote:
> On Tue, May 09, 2017 at 08:56:19AM +0200, Ingo Molnar wrote:
>>
>> * Kees Cook <keescook-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org> wrote:
>>
>> > > There's the option of using GCC plugins now that the infrastructure was
>> > > upstreamed from grsecurity. It can be used as part of the regular build
>> > > process and as long as the analysis is pretty simple it shouldn't hurt compile
>> > > time much.
>> >
>> > Well, and that the situation may arise due to memory corruption, not from
>> > poorly-matched set_fs() calls, which static analysis won't help solve. We need
>> > to catch this bad kernel state because it is a very bad state to run in.
>>
>> If memory corruption corrupted the task state into having addr_limit set to
>> KERNEL_DS then there's already a fair chance that it's game over: it could also
>> have set *uid to 0, or changed a sensitive PF_ flag, or a number of other
>> things...
>>
>> Furthermore, think about it: there's literally an infinite amount of corrupted
>> task states that could be a security problem and that could be checked after every
>> system call. Do we want to check every one of them?
>
> Ok, I'm all for not checking lots of stuff all the time, just to protect
> from crappy drivers that.  Especially as we _can_ audit and run checks
> on the source code for them in the kernel tree.
>
> But, and here's the problem, outside of the desktop/enterprise world,
> there are a ton of out-of-tree code that is crap.  The number of
> security/bug fixes and kernel crashes for out-of-tree code in systems
> like Android phones is just so high it's laughable.
>
> When you have a device that is running 3.2 million lines of kernel code,
> yet the diffstat of the tree compared to mainline adds 3 million lines
> of code, there is bound to be a ton of issues/problems there.
>
> So this is an entirely different thing we need to try to protect
> ourselves from.  A long time ago I laughed when I saw that Microsoft had
> to do lots of "hardening" of their kernel to protect themselves from
> crappy drivers, as I knew we didn't have to do that because we had the
> source for them and could fix the root issues.  But that has changed and
> now we don't all have that option.  That code is out-of-tree because the
> vendor doesn't care, and doesn't want to take any time at all to do
> anything resembling a real code review[1].

That's a big part of why I thought would be useful. I am less worried
about edge cases upstream right now than forks with custom codes not
using set_fs correctly.

>
> So, how about options like the ones being proposed here, go behind a new
> config option:
>         CONFIG_PROTECT_FROM_CRAPPY_DRIVERS
> that device owners can enable if they do not trust their vendor-provided
> code (hint, I sure don't.)  That way the "normal" path that all of us
> are used to running will be fine, but if you want to take the speed hit
> to try to protect yourself, then you can do that as well.

Maybe another name but why not.

>
> Anyway, just an idea...
>
> thanks,
>
> greg k-h
>
> [1] I am working really hard with lots of vendors to try to fix their
>     broken development model, but that is going to take years to resolve
>     as their device pipelines are years long, and changing their
>     mindsets takes a long time...



-- 
Thomas

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
  2017-05-09 13:02                       ` [kernel-hardening] " Christoph Hellwig
@ 2017-05-09 16:03                         ` Christoph Hellwig
  2017-05-09 16:50                           ` Kees Cook
       [not found]                           ` <20170509160322.GA15902-wEGCiKHe2LqWVfeAwA7xHQ@public.gmane.org>
  0 siblings, 2 replies; 89+ messages in thread
From: Christoph Hellwig @ 2017-05-09 16:03 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Mark Rutland, Kernel Hardening, Greg KH, Heiko Carstens, LKML,
	David Howells, Dave Hansen, H . Peter Anvin, Ingo Molnar,
	Pavel Tikhomirov, Peter Zijlstra, linux-s390,
	the arch/x86 maintainers, Russell King, Will Deacon,
	Christoph Hellwig, Christian Borntraeger, René Nyffenegger,
	Catalin Marinas, Paul E . McKenney, Rik van Riel, Kees Cook

On Tue, May 09, 2017 at 06:02:50AM -0700, Christoph Hellwig wrote:
> On Tue, May 09, 2017 at 06:00:01AM -0700, Andy Lutomirski wrote:
> > fs/splice.c has some, ahem, interesting uses that have been the source
> > of nasty exploits in the past.  Converting them to use iov_iter
> > properly would be really, really nice.  Christoph, I don't suppose
> > you'd like to do that?
> 
> I can take care of all the fs code including this one.

I spent the afternoon hacking up where I'd like this to head.  It's
completely untested as of now:

	http://git.infradead.org/users/hch/vfs.git/shortlog/refs/heads/setfs-elimination

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
  2017-05-09 13:00                     ` Andy Lutomirski
  2017-05-09 13:02                       ` [kernel-hardening] " Christoph Hellwig
@ 2017-05-09 16:05                       ` Brian Gerst
       [not found]                       ` <CALCETrUh8NO2scaqEM48K70Fo2+V3=Cpyk4JurCDiCYp4nm_+g-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  2 siblings, 0 replies; 89+ messages in thread
From: Brian Gerst @ 2017-05-09 16:05 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Christoph Hellwig, Ingo Molnar, Greg KH, Thomas Garnier,
	Martin Schwidefsky, Heiko Carstens, Dave Hansen, Arnd Bergmann,
	Thomas Gleixner, David Howells, René Nyffenegger,
	Andrew Morton, Paul E . McKenney, Eric W . Biederman,
	Oleg Nesterov, Pavel Tikhomirov, Ingo Molnar, H . Peter Anvin,
	Paolo Bonzini, Rik van Riel, Kees Cook, Josh Poimboeuf,
	Borislav Petkov

On Tue, May 9, 2017 at 9:00 AM, Andy Lutomirski <luto@kernel.org> wrote:
> On Tue, May 9, 2017 at 1:56 AM, Christoph Hellwig <hch@infradead.org> wrote:
>> On Tue, May 09, 2017 at 08:45:22AM +0200, Ingo Molnar wrote:
>>> We only have ~115 code blocks in the kernel that set/restore KERNEL_DS, it would
>>> be a pity to add a runtime check to every system call ...
>>
>> I think we should simply strive to remove all of them that aren't
>> in core scheduler / arch code.  Basically evetyytime we do the
>>
>>         oldfs = get_fs();
>>         set_fs(KERNEL_DS);
>>         ..
>>         set_fs(oldfs);
>>
>> trick we're doing something wrong, and there should always be better
>> ways to archive it.  E.g. using iov_iter with a ITER_KVEC type
>> consistently would already remove most of them.
>
> How about trying to remove all of them?  If we could actually get rid
> of all of them, we could drop the arch support, and we'd get faster,
> simpler, shorter uaccess code throughout the kernel.
>
> The ones in kernel/compat.c are generally garbage.  They should be
> using compat_alloc_user_space().  Ditto for kernel/power/user.c.

compat_alloc_user_space() is a hack that should go away too.  It ends
up copying the data three times.

The more efficient solution to this is to have a core syscall function
that only accesses kernel memory, and then have two front-end
functions (native and compat) that do the actual reads and writes to
userspace, with conversion in the compat case.

--
Brian Gerst

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
  2017-05-09  6:56                     ` Ingo Molnar
       [not found]                       ` <20170509065619.wmqa6z6w3n6xpvrw-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
@ 2017-05-09 16:30                       ` Kees Cook
  1 sibling, 0 replies; 89+ messages in thread
From: Kees Cook @ 2017-05-09 16:30 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Daniel Micay, Thomas Garnier, Martin Schwidefsky, Heiko Carstens,
	Dave Hansen, Arnd Bergmann, Thomas Gleixner, David Howells,
	René Nyffenegger, Andrew Morton, Paul E . McKenney,
	Eric W . Biederman, Oleg Nesterov, Pavel Tikhomirov, Ingo Molnar,
	H . Peter Anvin, Andy Lutomirski, Paolo Bonzini, Rik van Riel

On Mon, May 8, 2017 at 11:56 PM, Ingo Molnar <mingo@kernel.org> wrote:
>
> * Kees Cook <keescook@chromium.org> wrote:
>
>> > There's the option of using GCC plugins now that the infrastructure was
>> > upstreamed from grsecurity. It can be used as part of the regular build
>> > process and as long as the analysis is pretty simple it shouldn't hurt compile
>> > time much.
>>
>> Well, and that the situation may arise due to memory corruption, not from
>> poorly-matched set_fs() calls, which static analysis won't help solve. We need
>> to catch this bad kernel state because it is a very bad state to run in.

[attempting some thread-merging]

> Ok, so that's CVE-2010-4258, where an oops with KERNEL_DS set was used to escalate
> privileges, due to the kernel's oops handler not cleaning up the KERNEL_DS. The
> exploit used another bug, a crash in a network protocol handler, to execute the
> oops handler with KERNEL_DS set.

Right, I didn't mean to suggest that vulnerability would be fixed by
this solution. I was trying to show how there can be some pretty
complex interaction with exceptions/interrupts/etc that would make
pure static analysis still miss things.

> If memory corruption corrupted the task state into having addr_limit set to
> KERNEL_DS then there's already a fair chance that it's game over: it could also
> have set *uid to 0, or changed a sensitive PF_ flag, or a number of other
> things...
>
> Furthermore, think about it: there's literally an infinite amount of corrupted
> task states that could be a security problem and that could be checked after every
> system call. Do we want to check every one of them?

Right, but this "slippery slope" argument isn't the best way to reject
security changes. Let me take a step back and describe the threat, and
where we should likely spend time:

The primary threat with addr_limit getting changed is that a
narrowly-scoped attack (traditionally stack exhaustion or
adjacent-stack large-index writes) could be leveraged into opening the
entire kernel to writes (by allowing all syscalls with a
copy_to_user() call to suddenly be able to write to kernel memory).
So, really, the flaw is having addr_limit at all. Removing set_fs()
should, I think, allow this to become a const (or at least should get
us a lot closer).

The main path to corrupting addr_limit has been via stack corruption.
On architectures with CONFIG_THREAD_INFO_IN_TASK, this risk is greatly
reduced already, but it's not universally available yet. (And as long
as we're talking about stack attacks, CONFIG_VMAP_STACK makes
cross-stack overflows go away, and cross-stack indexing harder, but
that's not really about addr_limit since currently nothing with
VMAP_STACK doesn't already have THREAD_INFO_IN_TASK.)

So, left with a still exploitable target in memory that allows such an
expansion of attack method, I still think it's worth keeping this
patch series, but if we can drop set_fs() I could probably be
convinced the benefit of the series doesn't exceed the cost on
THREAD_INFO_IN_TASK-architectures (x86, arm64, s390). But that means
at least currently keeping it on arm, for example. If we can make
addr_limit const, well, we don't need the series at all.

-Kees

-- 
Kees Cook
Pixel Security

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
  2017-05-09 16:03                         ` Christoph Hellwig
@ 2017-05-09 16:50                           ` Kees Cook
       [not found]                             ` <CAGXu5jKHVMRMKDfn+=kkbm+JkWPhoEtDwKx=QXAYxg1p9bn7PQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  2017-05-10  6:46                             ` Christoph Hellwig
       [not found]                           ` <20170509160322.GA15902-wEGCiKHe2LqWVfeAwA7xHQ@public.gmane.org>
  1 sibling, 2 replies; 89+ messages in thread
From: Kees Cook @ 2017-05-09 16:50 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Andy Lutomirski, Mark Rutland, Kernel Hardening, Greg KH,
	Heiko Carstens, LKML, David Howells, Dave Hansen,
	H . Peter Anvin, Ingo Molnar, Pavel Tikhomirov, Peter Zijlstra,
	linux-s390, the arch/x86 maintainers, Russell King, Will Deacon,
	Christian Borntraeger, René Nyffenegger

On Tue, May 9, 2017 at 9:03 AM, Christoph Hellwig <hch@infradead.org> wrote:
> On Tue, May 09, 2017 at 06:02:50AM -0700, Christoph Hellwig wrote:
>> On Tue, May 09, 2017 at 06:00:01AM -0700, Andy Lutomirski wrote:
>> > fs/splice.c has some, ahem, interesting uses that have been the source
>> > of nasty exploits in the past.  Converting them to use iov_iter
>> > properly would be really, really nice.  Christoph, I don't suppose
>> > you'd like to do that?
>>
>> I can take care of all the fs code including this one.
>
> I spent the afternoon hacking up where I'd like this to head.  It's
> completely untested as of now:
>
>         http://git.infradead.org/users/hch/vfs.git/shortlog/refs/heads/setfs-elimination

Ooooh yes! Nice work.

I love this:
http://git.infradead.org/users/hch/vfs.git/commitdiff/51e83f50f824ca23f5584c172138e6b7c2ff786d
but I wonder what it'll cause out-of-tree code to do. I mean, I'd
rather nothing out-of-tree be calling these, but I'd hate 3rd party
hacks even more.

http://git.infradead.org/users/hch/vfs.git/commitdiff/018e0e9030777121fe87e89d43066691e7366587
This accidentally(?) removes the kernel-doc comments.

http://git.infradead.org/users/hch/vfs.git/commitdiff/78b62c730254fc39fa888cdbdca08fde6e09a798
Could this be made defensive? (Return 0 if ret wraps, for example?) I
see what the comment says, but not everyone will read that. :(

http://git.infradead.org/users/hch/vfs.git/commitdiff/a106276ca0294be054bc89ce97219933fe543df1
Perhaps unconditionally set USER_DS on exit instead of retaining
whatever was there?

-Kees

-- 
Kees Cook
Pixel Security

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
       [not found]                             ` <CAGXu5jKHVMRMKDfn+=kkbm+JkWPhoEtDwKx=QXAYxg1p9bn7PQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2017-05-09 22:52                               ` Andy Lutomirski
       [not found]                                 ` <CALCETrV73=cDvaSLOMvb299yaGNJYME8LC-=P+N6p7R1NN97Yg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  0 siblings, 1 reply; 89+ messages in thread
From: Andy Lutomirski @ 2017-05-09 22:52 UTC (permalink / raw)
  To: Kees Cook
  Cc: Christoph Hellwig, Andy Lutomirski, Mark Rutland,
	Kernel Hardening, Greg KH, Heiko Carstens, LKML, David Howells,
	Dave Hansen, H . Peter Anvin, Ingo Molnar, Pavel Tikhomirov,
	Peter Zijlstra, linux-s390, the arch/x86 maintainers,
	Russell King, Will Deacon, Christian Borntraeger,
	René Nyffenegger

On Tue, May 9, 2017 at 9:50 AM, Kees Cook <keescook-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org> wrote:
> On Tue, May 9, 2017 at 9:03 AM, Christoph Hellwig <hch-wEGCiKHe2LqWVfeAwA7xHQ@public.gmane.org> wrote:
>> On Tue, May 09, 2017 at 06:02:50AM -0700, Christoph Hellwig wrote:
>>> On Tue, May 09, 2017 at 06:00:01AM -0700, Andy Lutomirski wrote:
>>> > fs/splice.c has some, ahem, interesting uses that have been the source
>>> > of nasty exploits in the past.  Converting them to use iov_iter
>>> > properly would be really, really nice.  Christoph, I don't suppose
>>> > you'd like to do that?
>>>
>>> I can take care of all the fs code including this one.
>>
>> I spent the afternoon hacking up where I'd like this to head.  It's
>> completely untested as of now:
>>
>>         http://git.infradead.org/users/hch/vfs.git/shortlog/refs/heads/setfs-elimination
>
> Ooooh yes! Nice work.
>
> I love this:
> http://git.infradead.org/users/hch/vfs.git/commitdiff/51e83f50f824ca23f5584c172138e6b7c2ff786d
> but I wonder what it'll cause out-of-tree code to do. I mean, I'd
> rather nothing out-of-tree be calling these, but I'd hate 3rd party
> hacks even more.
>
> http://git.infradead.org/users/hch/vfs.git/commitdiff/018e0e9030777121fe87e89d43066691e7366587
> This accidentally(?) removes the kernel-doc comments.
>
> http://git.infradead.org/users/hch/vfs.git/commitdiff/78b62c730254fc39fa888cdbdca08fde6e09a798
> Could this be made defensive? (Return 0 if ret wraps, for example?) I
> see what the comment says, but not everyone will read that. :(
>
> http://git.infradead.org/users/hch/vfs.git/commitdiff/a106276ca0294be054bc89ce97219933fe543df1
> Perhaps unconditionally set USER_DS on exit instead of retaining
> whatever was there?

I don't like silent fixups.  If we want to do this, we should BUG or
at least WARN, not just change the addr limit.  But I'm also not
convinced it's indicative of an actual bug here.

--Andy

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
       [not found]                                 ` <CALCETrV73=cDvaSLOMvb299yaGNJYME8LC-=P+N6p7R1NN97Yg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2017-05-09 23:31                                   ` Kees Cook
  2017-05-10  1:59                                     ` Andy Lutomirski
                                                       ` (2 more replies)
  0 siblings, 3 replies; 89+ messages in thread
From: Kees Cook @ 2017-05-09 23:31 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Christoph Hellwig, Andy Lutomirski, Mark Rutland,
	Kernel Hardening, Greg KH, Heiko Carstens, LKML, David Howells,
	Dave Hansen, H . Peter Anvin, Ingo Molnar, Pavel Tikhomirov,
	Peter Zijlstra, linux-s390, the arch/x86 maintainers,
	Russell King, Will Deacon, Christian Borntraeger,
	René Nyffenegger

On Tue, May 9, 2017 at 3:52 PM, Andy Lutomirski <luto-kltTT9wpgjJwATOyAt5JVQ@public.gmane.org> wrote:
> On Tue, May 9, 2017 at 9:50 AM, Kees Cook <keescook-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org> wrote:
>> On Tue, May 9, 2017 at 9:03 AM, Christoph Hellwig <hch-wEGCiKHe2LqWVfeAwA7xHQ@public.gmane.org> wrote:
>>> On Tue, May 09, 2017 at 06:02:50AM -0700, Christoph Hellwig wrote:
>>>> On Tue, May 09, 2017 at 06:00:01AM -0700, Andy Lutomirski wrote:
>>>> > fs/splice.c has some, ahem, interesting uses that have been the source
>>>> > of nasty exploits in the past.  Converting them to use iov_iter
>>>> > properly would be really, really nice.  Christoph, I don't suppose
>>>> > you'd like to do that?
>>>>
>>>> I can take care of all the fs code including this one.
>>>
>>> I spent the afternoon hacking up where I'd like this to head.  It's
>>> completely untested as of now:
>>>
>>>         http://git.infradead.org/users/hch/vfs.git/shortlog/refs/heads/setfs-elimination
>>
>> Ooooh yes! Nice work.
>>
>> I love this:
>> http://git.infradead.org/users/hch/vfs.git/commitdiff/51e83f50f824ca23f5584c172138e6b7c2ff786d
>> but I wonder what it'll cause out-of-tree code to do. I mean, I'd
>> rather nothing out-of-tree be calling these, but I'd hate 3rd party
>> hacks even more.
>>
>> http://git.infradead.org/users/hch/vfs.git/commitdiff/018e0e9030777121fe87e89d43066691e7366587
>> This accidentally(?) removes the kernel-doc comments.
>>
>> http://git.infradead.org/users/hch/vfs.git/commitdiff/78b62c730254fc39fa888cdbdca08fde6e09a798
>> Could this be made defensive? (Return 0 if ret wraps, for example?) I
>> see what the comment says, but not everyone will read that. :(
>>
>> http://git.infradead.org/users/hch/vfs.git/commitdiff/a106276ca0294be054bc89ce97219933fe543df1
>> Perhaps unconditionally set USER_DS on exit instead of retaining
>> whatever was there?
>
> I don't like silent fixups.  If we want to do this, we should BUG or
> at least WARN, not just change the addr limit.  But I'm also not
> convinced it's indicative of an actual bug here.

Nothing should enter that function with KERNEL_DS set, right?

BUG_ON(get_fs() != USER_DS);
set_fs(KERNEL_DS);
...
set_fs(USER_DS);

-Kees

-- 
Kees Cook
Pixel Security

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
  2017-05-09 23:31                                   ` Kees Cook
@ 2017-05-10  1:59                                     ` Andy Lutomirski
  2017-05-10  7:15                                     ` Christoph Hellwig
       [not found]                                     ` <CAGXu5jL6PeQmmdxh5h--fgrMK8DW_XZYpNfDOvvv_o9E3-Kxdw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  2 siblings, 0 replies; 89+ messages in thread
From: Andy Lutomirski @ 2017-05-10  1:59 UTC (permalink / raw)
  To: Kees Cook
  Cc: Christoph Hellwig, Andy Lutomirski, Mark Rutland,
	Kernel Hardening, Greg KH, Heiko Carstens, LKML, David Howells,
	Dave Hansen, H . Peter Anvin, Ingo Molnar, Pavel Tikhomirov,
	Peter Zijlstra, linux-s390, the arch/x86 maintainers,
	Russell King, Will Deacon, Christian Borntraeger,
	René Nyffenegger

On Tue, May 9, 2017 at 4:31 PM, Kees Cook <keescook@chromium.org> wrote:
> On Tue, May 9, 2017 at 3:52 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>> On Tue, May 9, 2017 at 9:50 AM, Kees Cook <keescook@chromium.org> wrote:
>>> On Tue, May 9, 2017 at 9:03 AM, Christoph Hellwig <hch@infradead.org> wrote:
>>>> On Tue, May 09, 2017 at 06:02:50AM -0700, Christoph Hellwig wrote:
>>>>> On Tue, May 09, 2017 at 06:00:01AM -0700, Andy Lutomirski wrote:
>>>>> > fs/splice.c has some, ahem, interesting uses that have been the source
>>>>> > of nasty exploits in the past.  Converting them to use iov_iter
>>>>> > properly would be really, really nice.  Christoph, I don't suppose
>>>>> > you'd like to do that?
>>>>>
>>>>> I can take care of all the fs code including this one.
>>>>
>>>> I spent the afternoon hacking up where I'd like this to head.  It's
>>>> completely untested as of now:
>>>>
>>>>         http://git.infradead.org/users/hch/vfs.git/shortlog/refs/heads/setfs-elimination
>>>
>>> Ooooh yes! Nice work.
>>>
>>> I love this:
>>> http://git.infradead.org/users/hch/vfs.git/commitdiff/51e83f50f824ca23f5584c172138e6b7c2ff786d
>>> but I wonder what it'll cause out-of-tree code to do. I mean, I'd
>>> rather nothing out-of-tree be calling these, but I'd hate 3rd party
>>> hacks even more.
>>>
>>> http://git.infradead.org/users/hch/vfs.git/commitdiff/018e0e9030777121fe87e89d43066691e7366587
>>> This accidentally(?) removes the kernel-doc comments.
>>>
>>> http://git.infradead.org/users/hch/vfs.git/commitdiff/78b62c730254fc39fa888cdbdca08fde6e09a798
>>> Could this be made defensive? (Return 0 if ret wraps, for example?) I
>>> see what the comment says, but not everyone will read that. :(
>>>
>>> http://git.infradead.org/users/hch/vfs.git/commitdiff/a106276ca0294be054bc89ce97219933fe543df1
>>> Perhaps unconditionally set USER_DS on exit instead of retaining
>>> whatever was there?
>>
>> I don't like silent fixups.  If we want to do this, we should BUG or
>> at least WARN, not just change the addr limit.  But I'm also not
>> convinced it's indicative of an actual bug here.
>
> Nothing should enter that function with KERNEL_DS set, right?
>
> BUG_ON(get_fs() != USER_DS);
> set_fs(KERNEL_DS);
> ...
> set_fs(USER_DS);
>

It's not immediately obvious to me that this shouldn't happen.  Why
not do it the way Christoph did and then, once the rest of set_fs() is
tamed, consider this change (or trying to mass-convert file_operations
implementations to get rid of it entirely)?

--Andy

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
       [not found]                           ` <20170509160322.GA15902-wEGCiKHe2LqWVfeAwA7xHQ@public.gmane.org>
@ 2017-05-10  2:11                             ` Al Viro
  2017-05-10  2:45                               ` Al Viro
  2017-05-10  7:28                             ` Arnd Bergmann
  1 sibling, 1 reply; 89+ messages in thread
From: Al Viro @ 2017-05-10  2:11 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Andy Lutomirski, Ingo Molnar, Greg KH, Thomas Garnier,
	Martin Schwidefsky, Heiko Carstens, Dave Hansen, Arnd Bergmann,
	Thomas Gleixner, David Howells, René Nyffenegger,
	Andrew Morton, Paul E . McKenney, Eric W . Biederman,
	Oleg Nesterov, Pavel Tikhomirov, Ingo Molnar, H . Peter Anvin,
	Paolo Bonzini

On Tue, May 09, 2017 at 09:03:22AM -0700, Christoph Hellwig wrote:
> On Tue, May 09, 2017 at 06:02:50AM -0700, Christoph Hellwig wrote:
> > On Tue, May 09, 2017 at 06:00:01AM -0700, Andy Lutomirski wrote:
> > > fs/splice.c has some, ahem, interesting uses that have been the source
> > > of nasty exploits in the past.  Converting them to use iov_iter
> > > properly would be really, really nice.  Christoph, I don't suppose
> > > you'd like to do that?
> > 
> > I can take care of all the fs code including this one.

Oh?

> I spent the afternoon hacking up where I'd like this to head.  It's
> completely untested as of now:
> 
> 	http://git.infradead.org/users/hch/vfs.git/shortlog/refs/heads/setfs-elimination

And just what happens to driver that has no ->read_iter()?  Unless I'm
seriously misreading that, NAK with extreme prejudice.

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
  2017-05-10  2:11                             ` Al Viro
@ 2017-05-10  2:45                               ` Al Viro
  2017-05-10  3:12                                 ` Al Viro
  2017-05-10  6:49                                 ` Christoph Hellwig
  0 siblings, 2 replies; 89+ messages in thread
From: Al Viro @ 2017-05-10  2:45 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Andy Lutomirski, Ingo Molnar, Greg KH, Thomas Garnier,
	Martin Schwidefsky, Heiko Carstens, Dave Hansen, Arnd Bergmann,
	Thomas Gleixner, David Howells, René Nyffenegger,
	Andrew Morton, Paul E . McKenney, Eric W . Biederman,
	Oleg Nesterov, Pavel Tikhomirov, Ingo Molnar, H . Peter Anvin,
	Paolo Bonzini

On Wed, May 10, 2017 at 03:11:18AM +0100, Al Viro wrote:
> On Tue, May 09, 2017 at 09:03:22AM -0700, Christoph Hellwig wrote:
> > On Tue, May 09, 2017 at 06:02:50AM -0700, Christoph Hellwig wrote:
> > > On Tue, May 09, 2017 at 06:00:01AM -0700, Andy Lutomirski wrote:
> > > > fs/splice.c has some, ahem, interesting uses that have been the source
> > > > of nasty exploits in the past.  Converting them to use iov_iter
> > > > properly would be really, really nice.  Christoph, I don't suppose
> > > > you'd like to do that?
> > > 
> > > I can take care of all the fs code including this one.
> 
> Oh?
> 
> > I spent the afternoon hacking up where I'd like this to head.  It's
> > completely untested as of now:
> > 
> > 	http://git.infradead.org/users/hch/vfs.git/shortlog/refs/heads/setfs-elimination
> 
> And just what happens to driver that has no ->read_iter()?  Unless I'm
> seriously misreading that, NAK with extreme prejudice.

FWIW, some parts of that queue are obviously sane; it's the conversions of
kernel_write() and friends to ->read_iter/->write_iter() that are non-starters.
That stuff is used in too many situations; we can't guarantee that all of
them will be for files that have those.

As for default_file_splice_read(), I seriously suspect that with your change
we could as well just make it return -EINVAL and be done with that; places
that have ->read_iter() tend to have explicit ->splice_read() and it looks
like the ones that do not should simply use generic_file_read_iter().
I hadn't checked that, but there's not a lot of those:

arch/s390/hypfs/inode.c:437:    .read_iter      = hypfs_read_iter,
drivers/char/mem.c:798: .read_iter      = read_iter_null,
drivers/char/mem.c:813: .read_iter      = read_iter_zero,
drivers/char/mem.c:824: .read_iter      = read_iter_zero,
drivers/char/raw.c:286: .read_iter      = blkdev_read_iter,
drivers/net/tap.c:1134: .read_iter      = tap_read_iter,
drivers/net/tun.c:2423: .read_iter  = tun_chr_read_iter,
drivers/usb/gadget/function/f_fs.c:1255:        .read_iter =    ffs_epfile_read_iter,
drivers/usb/gadget/legacy/inode.c:703:  .read_iter =    ep_read_iter,
drivers/vhost/net.c:1252:       .read_iter      = vhost_net_chr_read_iter,
fs/9p/vfs_file.c:641:   .read_iter = generic_file_read_iter,
fs/9p/vfs_file.c:652:   .read_iter = generic_file_read_iter,
fs/9p/vfs_file.c:664:   .read_iter = v9fs_file_read_iter,
fs/9p/vfs_file.c:675:   .read_iter = v9fs_file_read_iter,
fs/9p/vfs_file.c:687:   .read_iter = v9fs_mmap_file_read_iter,
fs/9p/vfs_file.c:698:   .read_iter = v9fs_mmap_file_read_iter,
fs/fuse/cuse.c:180:     .read_iter              = cuse_read_iter,
fs/fuse/file.c:3015:    .read_iter      = fuse_direct_read_iter,
fs/hugetlbfs/inode.c:980:       .read_iter              = hugetlbfs_read_iter,
fs/ncpfs/file.c:248:    .read_iter      = ncp_file_read_iter,
fs/orangefs/file.c:742: .read_iter      = orangefs_file_read_iter,
fs/pipe.c:1011: .read_iter      = pipe_read,
sound/core/pcm_native.c:3696:           .read_iter =            snd_pcm_readv,

is the full list and I'm fairly certain that most of them will work with
generic_file_splice_read() just fine.  drivers/char definitely will, so
will ncpfs/orangefs/hugetlbfs/most of 9p ones (two of the latter might
need some care in p9_client_read(), but that should be doable easily enough).
pipe is irrelevant (->splice_read() won't be called for those).  fuse ones
should be doable, but that might take a bit more infrastructure work in
lib/iov_iter.c.  vhost, gadgetfs, tun/tap - no idea at the moment.

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
  2017-05-10  2:45                               ` Al Viro
@ 2017-05-10  3:12                                 ` Al Viro
       [not found]                                   ` <20170510031254.GC390-3bDd1+5oDREiFSDQTTA3OLVCufUGDwFn@public.gmane.org>
  2017-05-10  6:53                                   ` Christoph Hellwig
  2017-05-10  6:49                                 ` Christoph Hellwig
  1 sibling, 2 replies; 89+ messages in thread
From: Al Viro @ 2017-05-10  3:12 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Andy Lutomirski, Ingo Molnar, Greg KH, Thomas Garnier,
	Martin Schwidefsky, Heiko Carstens, Dave Hansen, Arnd Bergmann,
	Thomas Gleixner, David Howells, René Nyffenegger,
	Andrew Morton, Paul E . McKenney, Eric W . Biederman,
	Oleg Nesterov, Pavel Tikhomirov, Ingo Molnar, H . Peter Anvin,
	Paolo Bonzini

On Wed, May 10, 2017 at 03:45:24AM +0100, Al Viro wrote:

> FWIW, some parts of that queue are obviously sane; it's the conversions of
> kernel_write() and friends to ->read_iter/->write_iter() that are non-starters.

Egads...  OK, I *have* misread what you are doing there.  Your vfs_iter_read()
works for files sans ->read_iter().  For strange values of "works" - you
hardwire "it's either iovec or kvec iterator" into its calling conventions,
which is a trouble waiting to happen.

What's the point?  What's wrong with having kernel_read()/kernel_readv()/etc.?
You still have set_fs() in there; doing that one level up in call chain would
be just fine...  IDGI.

Broken commit: "net: don't play with address limits in kernel_recvmsg".
It would be OK if it was only about data.  Unfortunately, that's not
true in one case: svc_udp_recvfrom() wants ->msg_control.

Another delicate place: you can't assume that write() always advances
file position by its (positive) return value.  btrfs stuff is sensitive
to that.

ashmem probably _is_ OK with demanding ->read_iter(), but I'm not sure
about blind asma->file->f_pos += ret.  That's begging for races.  Actually,
scratch that - it *is* racy.

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
       [not found]                                   ` <20170510031254.GC390-3bDd1+5oDREiFSDQTTA3OLVCufUGDwFn@public.gmane.org>
@ 2017-05-10  3:21                                     ` Al Viro
       [not found]                                       ` <20170510032137.GD390-3bDd1+5oDREiFSDQTTA3OLVCufUGDwFn@public.gmane.org>
  0 siblings, 1 reply; 89+ messages in thread
From: Al Viro @ 2017-05-10  3:21 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Andy Lutomirski, Ingo Molnar, Greg KH, Thomas Garnier,
	Martin Schwidefsky, Heiko Carstens, Dave Hansen, Arnd Bergmann,
	Thomas Gleixner, David Howells, René Nyffenegger,
	Andrew Morton, Paul E . McKenney, Eric W . Biederman,
	Oleg Nesterov, Pavel Tikhomirov, Ingo Molnar, H . Peter Anvin,
	Paolo Bonzini

On Wed, May 10, 2017 at 04:12:54AM +0100, Al Viro wrote:

> Broken commit: "net: don't play with address limits in kernel_recvmsg".
> It would be OK if it was only about data.  Unfortunately, that's not
> true in one case: svc_udp_recvfrom() wants ->msg_control.
> 
> Another delicate place: you can't assume that write() always advances
> file position by its (positive) return value.  btrfs stuff is sensitive
> to that.
> 
> ashmem probably _is_ OK with demanding ->read_iter(), but I'm not sure
> about blind asma->file->f_pos += ret.  That's begging for races.  Actually,
> scratch that - it *is* racy.

kvec_length(): please, don't.  I would rather have the last remaining
iov_length() gone...   What do you need it for, anyway?  You have only
two users and both have the count passed to them (as *count and *cnt resp.)

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
       [not found]                                       ` <20170510032137.GD390-3bDd1+5oDREiFSDQTTA3OLVCufUGDwFn@public.gmane.org>
@ 2017-05-10  3:39                                         ` Al Viro
  2017-05-10  6:54                                           ` Christoph Hellwig
  0 siblings, 1 reply; 89+ messages in thread
From: Al Viro @ 2017-05-10  3:39 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Andy Lutomirski, Ingo Molnar, Greg KH, Thomas Garnier,
	Martin Schwidefsky, Heiko Carstens, Dave Hansen, Arnd Bergmann,
	Thomas Gleixner, David Howells, René Nyffenegger,
	Andrew Morton, Paul E . McKenney, Eric W . Biederman,
	Oleg Nesterov, Pavel Tikhomirov, Ingo Molnar, H . Peter Anvin,
	Paolo Bonzini

On Wed, May 10, 2017 at 04:21:37AM +0100, Al Viro wrote:
> On Wed, May 10, 2017 at 04:12:54AM +0100, Al Viro wrote:
> 
> > Broken commit: "net: don't play with address limits in kernel_recvmsg".
> > It would be OK if it was only about data.  Unfortunately, that's not
> > true in one case: svc_udp_recvfrom() wants ->msg_control.
> > 
> > Another delicate place: you can't assume that write() always advances
> > file position by its (positive) return value.  btrfs stuff is sensitive
> > to that.
> > 
> > ashmem probably _is_ OK with demanding ->read_iter(), but I'm not sure
> > about blind asma->file->f_pos += ret.  That's begging for races.  Actually,
> > scratch that - it *is* racy.
> 
> kvec_length(): please, don't.  I would rather have the last remaining
> iov_length() gone...   What do you need it for, anyway?  You have only
> two users and both have the count passed to them (as *count and *cnt resp.)

fcntl stuff: I've decided not to put something similar into work.compat
since I couldn't decide what to do with compat stuff - word-by-word copy
from userland converting to struct flock + conversion to posix_lock +
actual work + conversion to flock + word-by-word copy to userland...  Smells
like we might be better off with compat_flock_to_posix_lock() et.al.
I'm still not sure; played a bit one way and another and dediced to drop
it for now.  Hell knows...

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
  2017-05-09 16:50                           ` Kees Cook
       [not found]                             ` <CAGXu5jKHVMRMKDfn+=kkbm+JkWPhoEtDwKx=QXAYxg1p9bn7PQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2017-05-10  6:46                             ` Christoph Hellwig
  1 sibling, 0 replies; 89+ messages in thread
From: Christoph Hellwig @ 2017-05-10  6:46 UTC (permalink / raw)
  To: Kees Cook
  Cc: Mark Rutland, Brian Gerst, Kernel Hardening, Catalin Marinas,
	Heiko Carstens, Oleg Nesterov, David Howells, Dave Hansen,
	H . Peter Anvin, Ingo Molnar, Pavel Tikhomirov, linux-s390,
	the arch/x86 maintainers, Russell King, Christoph Hellwig,
	Christian Borntraeger, René Nyffenegger, Greg KH,
	Paul E . McKenney, Rik van Riel, Peter Zijlstra, Arnd Bergmann

On Tue, May 09, 2017 at 09:50:32AM -0700, Kees Cook wrote:
> http://git.infradead.org/users/hch/vfs.git/commitdiff/018e0e9030777121fe87e89d43066691e7366587
> This accidentally(?) removes the kernel-doc comments.

That was sort of intentional, the kerneldoc boilerplate adds very little
value for method instances.

> http://git.infradead.org/users/hch/vfs.git/commitdiff/78b62c730254fc39fa888cdbdca08fde6e09a798
> Could this be made defensive? (Return 0 if ret wraps, for example?) I
> see what the comment says, but not everyone will read that. :(

It's a straight copy of iov_length, and I'd rather keep it that way.

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
  2017-05-10  2:45                               ` Al Viro
  2017-05-10  3:12                                 ` Al Viro
@ 2017-05-10  6:49                                 ` Christoph Hellwig
  1 sibling, 0 replies; 89+ messages in thread
From: Christoph Hellwig @ 2017-05-10  6:49 UTC (permalink / raw)
  To: Al Viro
  Cc: Mark Rutland, Kernel Hardening, Greg KH, Heiko Carstens, LKML,
	David Howells, Dave Hansen, H . Peter Anvin, Ingo Molnar,
	Pavel Tikhomirov, Peter Zijlstra, linux-s390,
	the arch/x86 maintainers, Russell King, Will Deacon,
	Christoph Hellwig, Christian Borntraeger, René Nyffenegger,
	Catalin Marinas, Paul E . McKenney, Rik van Riel, Kees Cook

On Wed, May 10, 2017 at 03:45:24AM +0100, Al Viro wrote:
> FWIW, some parts of that queue are obviously sane; it's the conversions of
> kernel_write() and friends to ->read_iter/->write_iter() that are non-starters.

And that part is the main point!

> That stuff is used in too many situations; we can't guarantee that all of
> them will be for files that have those.

That's why this series handles ITER_KVEC for this case, which is all
that's really needed for kernel_read/write.  If you insiste the bvec
and pipe cases are handled as well that couod be added fairly easily.

> As for default_file_splice_read(), I seriously suspect that with your change
> we could as well just make it return -EINVAL and be done with that; places
> that have ->read_iter() tend to have explicit ->splice_read() and it looks
> like the ones that do not should simply use generic_file_read_iter().
> I hadn't checked that, but there's not a lot of those:

Making ->splice_read to default to the ->read_iter based implementation
and returning -EINVAL if neither that nor an explicit ->splice_read
is provided is useful, but wasn't the aim of this series.  Similar
on the write side.

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
  2017-05-10  3:12                                 ` Al Viro
       [not found]                                   ` <20170510031254.GC390-3bDd1+5oDREiFSDQTTA3OLVCufUGDwFn@public.gmane.org>
@ 2017-05-10  6:53                                   ` Christoph Hellwig
  2017-05-10  7:27                                     ` Al Viro
  1 sibling, 1 reply; 89+ messages in thread
From: Christoph Hellwig @ 2017-05-10  6:53 UTC (permalink / raw)
  To: Al Viro
  Cc: Mark Rutland, Kernel Hardening, Greg KH, Heiko Carstens, LKML,
	David Howells, Dave Hansen, H . Peter Anvin, Ingo Molnar,
	Pavel Tikhomirov, Peter Zijlstra, linux-s390,
	the arch/x86 maintainers, Russell King, Will Deacon,
	Christoph Hellwig, Christian Borntraeger, René Nyffenegger,
	Catalin Marinas, Paul E . McKenney, Rik van Riel, Kees Cook

On Wed, May 10, 2017 at 04:12:54AM +0100, Al Viro wrote:
> What's the point?  What's wrong with having kernel_read()/kernel_readv()/etc.?
> You still have set_fs() in there; doing that one level up in call chain would
> be just fine...  IDGI.

The problem is that they modify the address limit, which the whole
subthread here wants to get rid of.

> Broken commit: "net: don't play with address limits in kernel_recvmsg".
> It would be OK if it was only about data.  Unfortunately, that's not
> true in one case: svc_udp_recvfrom() wants ->msg_control.

Dropped, but we'll need to fix that eventually.

> Another delicate place: you can't assume that write() always advances
> file position by its (positive) return value.  btrfs stuff is sensitive
> to that.

If we don't want to assume that we need to pass pointer to pos to
kernel_read/write.  Which might be a good idea in general.

> ashmem probably _is_ OK with demanding ->read_iter(), but I'm not sure
> about blind asma->file->f_pos += ret.  That's begging for races.  Actually,
> scratch that - it *is* racy.

I think the proper fix is to not even bother to maintain f_pos of the
backing file, as we don't ever use it - all reads from it pass in
an explicit position anyway.

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
  2017-05-10  3:39                                         ` Al Viro
@ 2017-05-10  6:54                                           ` Christoph Hellwig
  0 siblings, 0 replies; 89+ messages in thread
From: Christoph Hellwig @ 2017-05-10  6:54 UTC (permalink / raw)
  To: Al Viro
  Cc: Mark Rutland, Kernel Hardening, Greg KH, Heiko Carstens, LKML,
	David Howells, Dave Hansen, H . Peter Anvin, Ingo Molnar,
	Pavel Tikhomirov, Peter Zijlstra, linux-s390,
	the arch/x86 maintainers, Russell King, Will Deacon,
	Christoph Hellwig, Christian Borntraeger, René Nyffenegger,
	Catalin Marinas, Paul E . McKenney, Rik van Riel, Kees Cook

On Wed, May 10, 2017 at 04:39:12AM +0100, Al Viro wrote:
> fcntl stuff: I've decided not to put something similar into work.compat
> since I couldn't decide what to do with compat stuff - word-by-word copy
> from userland converting to struct flock + conversion to posix_lock +
> actual work + conversion to flock + word-by-word copy to userland...  Smells
> like we might be better off with compat_flock_to_posix_lock() et.al.
> I'm still not sure; played a bit one way and another and dediced to drop
> it for now.  Hell knows...

My version already is an improvement in lines of code alone.  Between
that and stopping to mess with the address limit I think it's a clear
winner.  But it's pretty independent of the rest, and I'll just run
it through Jeff and Bruce and ask them what they think.

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
  2017-05-09 23:31                                   ` Kees Cook
  2017-05-10  1:59                                     ` Andy Lutomirski
@ 2017-05-10  7:15                                     ` Christoph Hellwig
       [not found]                                     ` <CAGXu5jL6PeQmmdxh5h--fgrMK8DW_XZYpNfDOvvv_o9E3-Kxdw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  2 siblings, 0 replies; 89+ messages in thread
From: Christoph Hellwig @ 2017-05-10  7:15 UTC (permalink / raw)
  To: Kees Cook
  Cc: Mark Rutland, Brian Gerst, Kernel Hardening, Catalin Marinas,
	Heiko Carstens, Oleg Nesterov, David Howells, Dave Hansen,
	H . Peter Anvin, Ingo Molnar, Pavel Tikhomirov, linux-s390,
	the arch/x86 maintainers, Russell King, Christoph Hellwig,
	Christian Borntraeger, René Nyffenegger, Greg KH,
	Paul E . McKenney, Rik van Riel, Peter Zijlstra, Arnd Bergmann

On Tue, May 09, 2017 at 04:31:00PM -0700, Kees Cook wrote:
> > I don't like silent fixups.  If we want to do this, we should BUG or
> > at least WARN, not just change the addr limit.  But I'm also not
> > convinced it's indicative of an actual bug here.
> 
> Nothing should enter that function with KERNEL_DS set, right?

It might very well do.  Various drivers or the networking code mess
with the address limits for fairly broad sections of code.

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
  2017-05-10  6:53                                   ` Christoph Hellwig
@ 2017-05-10  7:27                                     ` Al Viro
  2017-05-10  7:35                                       ` Christoph Hellwig
  0 siblings, 1 reply; 89+ messages in thread
From: Al Viro @ 2017-05-10  7:27 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Andy Lutomirski, Ingo Molnar, Greg KH, Thomas Garnier,
	Martin Schwidefsky, Heiko Carstens, Dave Hansen, Arnd Bergmann,
	Thomas Gleixner, David Howells, René Nyffenegger,
	Andrew Morton, Paul E . McKenney, Eric W . Biederman,
	Oleg Nesterov, Pavel Tikhomirov, Ingo Molnar, H . Peter Anvin,
	Paolo Bonzini

On Tue, May 09, 2017 at 11:53:01PM -0700, Christoph Hellwig wrote:
> On Wed, May 10, 2017 at 04:12:54AM +0100, Al Viro wrote:
> > What's the point?  What's wrong with having kernel_read()/kernel_readv()/etc.?
> > You still have set_fs() in there; doing that one level up in call chain would
> > be just fine...  IDGI.
> 
> The problem is that they modify the address limit, which the whole
> subthread here wants to get rid of.

And you *still* do the same.  Christoph, this is ridiculous - the worst
part of the area is not a couple of functions in fs/read_write.c, it's
a fucking lot of ->read() and ->write() instances in shitty driver code,
pardon the redundance.  And _that_ is still done under set_fs(KERNEL_DS).

Claiming that set_fs() done one function deeper in callchain (both in
fs/read_write.c) is somehow better because it reduces the amount of code
under that thing...  Get real, please - helpers that encapsulate those
set_fs() pairs (a-la kernel_read(), etc.) absolutely make sense and
converting their open-coded instances to calls of those helpers is clearly
a good thing.  However, we are not
	* getting rid of low-quality code run under KERNEL_DS
	* gettind rid of set_fs() itself
	* getting a generic kernel_read() variant that would really take
an iov_iter.

That's what I'm objecting to.  Centralized kernel_readv() et.al. - sure,
and fs/read_write.c is the right place for those.  No arguments here.
Conversion to those - absolutely; drivers have no fucking business touching
set_fs() at all.  But your primitives are trouble waiting to happen.
Let them take kvec arrays.  And let them, in case when there's no
->read_iter()/->write_iter(), do set_fs().  Statically, without this
if (iter->type & ITER_KVEC) ... stuff.

> > Another delicate place: you can't assume that write() always advances
> > file position by its (positive) return value.  btrfs stuff is sensitive
> > to that.
> 
> If we don't want to assume that we need to pass pointer to pos to
> kernel_read/write.  Which might be a good idea in general.

Yes.

> > ashmem probably _is_ OK with demanding ->read_iter(), but I'm not sure
> > about blind asma->file->f_pos += ret.  That's begging for races.  Actually,
> > scratch that - it *is* racy.
> 
> I think the proper fix is to not even bother to maintain f_pos of the
> backing file, as we don't ever use it - all reads from it pass in
> an explicit position anyway.

vfs_llseek() used by ashmem_llseek()...

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
       [not found]                           ` <20170509160322.GA15902-wEGCiKHe2LqWVfeAwA7xHQ@public.gmane.org>
  2017-05-10  2:11                             ` Al Viro
@ 2017-05-10  7:28                             ` Arnd Bergmann
  2017-05-10  7:35                               ` Christoph Hellwig
  1 sibling, 1 reply; 89+ messages in thread
From: Arnd Bergmann @ 2017-05-10  7:28 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Andy Lutomirski, Ingo Molnar, Greg KH, Thomas Garnier,
	Martin Schwidefsky, Heiko Carstens, Dave Hansen, Thomas Gleixner,
	David Howells, René Nyffenegger, Andrew Morton,
	Paul E . McKenney, Eric W . Biederman, Oleg Nesterov,
	Pavel Tikhomirov, Ingo Molnar, H . Peter Anvin, Paolo Bonzini,
	Rik van Riel

On Tue, May 9, 2017 at 6:03 PM, Christoph Hellwig <hch-wEGCiKHe2LqWVfeAwA7xHQ@public.gmane.org> wrote:
> On Tue, May 09, 2017 at 06:02:50AM -0700, Christoph Hellwig wrote:
>> On Tue, May 09, 2017 at 06:00:01AM -0700, Andy Lutomirski wrote:
>> > fs/splice.c has some, ahem, interesting uses that have been the source
>> > of nasty exploits in the past.  Converting them to use iov_iter
>> > properly would be really, really nice.  Christoph, I don't suppose
>> > you'd like to do that?
>>
>> I can take care of all the fs code including this one.
>
> I spent the afternoon hacking up where I'd like this to head.  It's
> completely untested as of now:
>
>         http://git.infradead.org/users/hch/vfs.git/shortlog/refs/heads/setfs-elimination

My older time64_t syscall series has the side-effect of doing something
like this to the time-related compat handlers in kernel/compat.c. If nobody
else has started looking at removing set_fs from those, I can extract
the relevant parts from my series.

      Arnd

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
  2017-05-10  7:27                                     ` Al Viro
@ 2017-05-10  7:35                                       ` Christoph Hellwig
  0 siblings, 0 replies; 89+ messages in thread
From: Christoph Hellwig @ 2017-05-10  7:35 UTC (permalink / raw)
  To: Al Viro
  Cc: Mark Rutland, Kernel Hardening, Greg KH, Heiko Carstens, LKML,
	David Howells, Dave Hansen, H . Peter Anvin, Ingo Molnar,
	Pavel Tikhomirov, Peter Zijlstra, linux-s390,
	the arch/x86 maintainers, Russell King, Will Deacon,
	Christoph Hellwig, Christian Borntraeger, René Nyffenegger,
	Catalin Marinas, Paul E . McKenney, Rik van Riel, Kees Cook

On Wed, May 10, 2017 at 08:27:47AM +0100, Al Viro wrote:
> And you *still* do the same.  Christoph, this is ridiculous - the worst
> part of the area is not a couple of functions in fs/read_write.c, it's
> a fucking lot of ->read() and ->write() instances in shitty driver code,
> pardon the redundance.  And _that_ is still done under set_fs(KERNEL_DS).

For now yes.  But it provides a sane API on top, and then we can
start moving the drivers that matter to the iter variants and drop
support for the rest soon.  Most in-kernel I/O is done to files, and
the rest to a very limited set of devices. (not accounting for sockets
through their own APIs, thats another story)

> That's what I'm objecting to.  Centralized kernel_readv() et.al. - sure,
> and fs/read_write.c is the right place for those.  No arguments here.
> Conversion to those - absolutely; drivers have no fucking business touching
> set_fs() at all.  But your primitives are trouble waiting to happen.
> Let them take kvec arrays.  And let them, in case when there's no
> ->read_iter()/->write_iter(), do set_fs().  Statically, without this
> if (iter->type & ITER_KVEC) ... stuff.

Oh weĺl.  I can do that first, but I think eventually we'll have to
get back to what I've done now.

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
  2017-05-10  7:28                             ` Arnd Bergmann
@ 2017-05-10  7:35                               ` Christoph Hellwig
  0 siblings, 0 replies; 89+ messages in thread
From: Christoph Hellwig @ 2017-05-10  7:35 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Mark Rutland, Kernel Hardening, Greg KH, Heiko Carstens, LKML,
	David Howells, Dave Hansen, H . Peter Anvin, Ingo Molnar,
	Pavel Tikhomirov, Peter Zijlstra, linux-s390,
	the arch/x86 maintainers, Russell King, Will Deacon,
	Christoph Hellwig, Christian Borntraeger, René Nyffenegger,
	Catalin Marinas, Paul E . McKenney, Rik van Riel, Kees Cook

On Wed, May 10, 2017 at 09:28:48AM +0200, Arnd Bergmann wrote:
> My older time64_t syscall series has the side-effect of doing something
> like this to the time-related compat handlers in kernel/compat.c. If nobody
> else has started looking at removing set_fs from those, I can extract
> the relevant parts from my series.

That would be great.

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
       [not found]                       ` <CALCETrUh8NO2scaqEM48K70Fo2+V3=Cpyk4JurCDiCYp4nm_+g-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2017-05-10  7:37                         ` Arnd Bergmann
  2017-05-10  8:08                           ` Al Viro
  2017-05-12  7:00                         ` Ingo Molnar
  1 sibling, 1 reply; 89+ messages in thread
From: Arnd Bergmann @ 2017-05-10  7:37 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Christoph Hellwig, Ingo Molnar, Greg KH, Thomas Garnier,
	Martin Schwidefsky, Heiko Carstens, Dave Hansen, Thomas Gleixner,
	David Howells, René Nyffenegger, Andrew Morton,
	Paul E . McKenney, Eric W . Biederman, Oleg Nesterov,
	Pavel Tikhomirov, Ingo Molnar, H . Peter Anvin, Paolo Bonzini,
	Rik van Riel

On Tue, May 9, 2017 at 3:00 PM, Andy Lutomirski <luto-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org> wrote:
> On Tue, May 9, 2017 at 1:56 AM, Christoph Hellwig <hch-wEGCiKHe2LqWVfeAwA7xHQ@public.gmane.org> wrote:
>> On Tue, May 09, 2017 at 08:45:22AM +0200, Ingo Molnar wrote:
>>> We only have ~115 code blocks in the kernel that set/restore KERNEL_DS, it would
>>> be a pity to add a runtime check to every system call ...
>>
>> I think we should simply strive to remove all of them that aren't
>> in core scheduler / arch code.  Basically evetyytime we do the
>>
>>         oldfs = get_fs();
>>         set_fs(KERNEL_DS);
>>         ..
>>         set_fs(oldfs);
>>
>> trick we're doing something wrong, and there should always be better
>> ways to archive it.  E.g. using iov_iter with a ITER_KVEC type
>> consistently would already remove most of them.
>
> How about trying to remove all of them?  If we could actually get rid
> of all of them, we could drop the arch support, and we'd get faster,
> simpler, shorter uaccess code throughout the kernel.
>
> The ones in kernel/compat.c are generally garbage.  They should be
> using compat_alloc_user_space().  Ditto for kernel/power/user.c.

compat_alloc_user_space() has some problems too, it adds
complexity to a rarely-tested code path and can add some noticeable
overhead in cases where user space access is slow because of
extra checks.

It's clearly better than set_fs(), but the way I prefer to convert the
code is to avoid both and instead move compat handlers next to
the native code, and splitting out the common code between native
and compat mode into a helper that takes a regular kernel pointer.

I think that's what both Al has done in the past on compat_ioctl()
and select() and what Christoph does in his latest series, but
it seems worth pointing out for others that decide to help out here.

     Arnd

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
  2017-05-10  7:37                         ` [kernel-hardening] " Arnd Bergmann
@ 2017-05-10  8:08                           ` Al Viro
  2017-05-10  8:14                             ` Christoph Hellwig
  0 siblings, 1 reply; 89+ messages in thread
From: Al Viro @ 2017-05-10  8:08 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Andy Lutomirski, Christoph Hellwig, Ingo Molnar, Greg KH,
	Thomas Garnier, Martin Schwidefsky, Heiko Carstens, Dave Hansen,
	Thomas Gleixner, David Howells, René Nyffenegger,
	Andrew Morton, Paul E . McKenney, Eric W . Biederman,
	Oleg Nesterov, Pavel Tikhomirov, Ingo Molnar, H . Peter Anvin,
	Paolo Bonzini

On Wed, May 10, 2017 at 09:37:04AM +0200, Arnd Bergmann wrote:

> > How about trying to remove all of them?  If we could actually get rid
> > of all of them, we could drop the arch support, and we'd get faster,
> > simpler, shorter uaccess code throughout the kernel.

BTW, not all get_user() under KERNEL_DS are plain loads.  There is an
exception - probe_kernel_read().

> > The ones in kernel/compat.c are generally garbage.  They should be
> > using compat_alloc_user_space().  Ditto for kernel/power/user.c.
> 
> compat_alloc_user_space() has some problems too, it adds
> complexity to a rarely-tested code path and can add some noticeable
> overhead in cases where user space access is slow because of
> extra checks.
> 
> It's clearly better than set_fs(), but the way I prefer to convert the
> code is to avoid both and instead move compat handlers next to
> the native code, and splitting out the common code between native
> and compat mode into a helper that takes a regular kernel pointer.
> 
> I think that's what both Al has done in the past on compat_ioctl()
> and select() and what Christoph does in his latest series, but
> it seems worth pointing out for others that decide to help out here.

Folks, reducing the amount of places where we play with set_fs() is certainly
a good thing.  Getting rid of them completely is something entirely different;
I have tried to plot out patch series in this direction many times during the
last 5 years or so, but it's not going to be easy.  Tomorrow I can start
posting my notes in that direction (and there are tons of those, unfortunately
mixed with git grep results, highly unprintable personal comments, etc.);
just let me grab some sleep first...

BTW, slow userland access is not just due to extra checks; access_ok(),
in particular, is pretty much noise.  The real PITA comes from the things
like STAC/CLAC on recent x86.  Or hardware overhead of cross-address-space
block copy insn (e.g. on s390, where it's optimized for multi-cacheline
blocks).  Or things like uml, where it's a matter of walking the page
tables for each sodding __get_user().  It's not always just a matter of
address space limit...

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
  2017-05-10  8:08                           ` Al Viro
@ 2017-05-10  8:14                             ` Christoph Hellwig
  2017-05-11  0:18                               ` Andy Lutomirski
  0 siblings, 1 reply; 89+ messages in thread
From: Christoph Hellwig @ 2017-05-10  8:14 UTC (permalink / raw)
  To: Al Viro
  Cc: Mark Rutland, Kernel Hardening, Greg KH, Heiko Carstens, LKML,
	David Howells, Dave Hansen, H . Peter Anvin, Ingo Molnar,
	Pavel Tikhomirov, Peter Zijlstra, linux-s390,
	the arch/x86 maintainers, Russell King, Will Deacon,
	Christoph Hellwig, Christian Borntraeger, René Nyffenegger,
	Catalin Marinas, Paul E . McKenney, Rik van Riel, Kees Cook

On Wed, May 10, 2017 at 09:08:41AM +0100, Al Viro wrote:
> On Wed, May 10, 2017 at 09:37:04AM +0200, Arnd Bergmann wrote:
> 
> > > How about trying to remove all of them?  If we could actually get rid
> > > of all of them, we could drop the arch support, and we'd get faster,
> > > simpler, shorter uaccess code throughout the kernel.
> 
> BTW, not all get_user() under KERNEL_DS are plain loads.  There is an
> exception - probe_kernel_read().

And various calls that looks like opencoded versions, e.g. drivers/dio
or the ELF loader.

But in the long run we'll just need a separate primitive for that,
but that can wait until the set_fs calls outside the core code are
gone.

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
  2017-05-10  8:14                             ` Christoph Hellwig
@ 2017-05-11  0:18                               ` Andy Lutomirski
  0 siblings, 0 replies; 89+ messages in thread
From: Andy Lutomirski @ 2017-05-11  0:18 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Al Viro, Arnd Bergmann, Andy Lutomirski, Ingo Molnar, Greg KH,
	Thomas Garnier, Martin Schwidefsky, Heiko Carstens, Dave Hansen,
	Thomas Gleixner, David Howells, René Nyffenegger,
	Andrew Morton, Paul E . McKenney, Eric W . Biederman,
	Oleg Nesterov, Pavel Tikhomirov, Ingo Molnar, H . Peter Anvin

On Wed, May 10, 2017 at 1:14 AM, Christoph Hellwig <hch@infradead.org> wrote:
> On Wed, May 10, 2017 at 09:08:41AM +0100, Al Viro wrote:
>> On Wed, May 10, 2017 at 09:37:04AM +0200, Arnd Bergmann wrote:
>>
>> > > How about trying to remove all of them?  If we could actually get rid
>> > > of all of them, we could drop the arch support, and we'd get faster,
>> > > simpler, shorter uaccess code throughout the kernel.
>>
>> BTW, not all get_user() under KERNEL_DS are plain loads.  There is an
>> exception - probe_kernel_read().
>
> And various calls that looks like opencoded versions, e.g. drivers/dio
> or the ELF loader.
>
> But in the long run we'll just need a separate primitive for that,
> but that can wait until the set_fs calls outside the core code are
> gone.

I suspect that, on most arches, the primitive is called
__copy_from_user().  We could make the generic code do that except
where overridden.

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
       [not found]                                     ` <CAGXu5jL6PeQmmdxh5h--fgrMK8DW_XZYpNfDOvvv_o9E3-Kxdw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2017-05-11 11:22                                       ` Borislav Petkov
  0 siblings, 0 replies; 89+ messages in thread
From: Borislav Petkov @ 2017-05-11 11:22 UTC (permalink / raw)
  To: Kees Cook, Andy Lutomirski
  Cc: Christoph Hellwig, Andy Lutomirski, Mark Rutland,
	Kernel Hardening, Greg KH, Heiko Carstens, LKML, David Howells,
	Dave Hansen, H . Peter Anvin, Ingo Molnar, Pavel Tikhomirov,
	Peter Zijlstra, linux-s390, the arch/x86 maintainers,
	Russell King, Will Deacon, Christian Borntraeger,
	René Nyffenegger

On Tue, May 09, 2017 at 04:31:00PM -0700, Kees Cook wrote:
> > I don't like silent fixups.  If we want to do this, we should BUG or
> > at least WARN, not just change the addr limit.  But I'm also not
> > convinced it's indicative of an actual bug here.
> 
> Nothing should enter that function with KERNEL_DS set, right?
> 
> BUG_ON(get_fs() != USER_DS);

We're feeling triggerhappy, aren't we? A nice juicy WARN-splat along
with a fixup looks much better than killing the box, to me.

-- 
Regards/Gruss,
    Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
       [not found]                               ` <CAJcbSZFswDWZoK-1UK+xkRMJ4ttSYbtH2Y5WD5_aPR-8ru6t8A-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2017-05-11 23:17                                 ` Thomas Garnier
       [not found]                                   ` <CAJcbSZEoRyewUtBHvqmNZL9FtT_q42Vmmd-EuC50x-ZRASiHHg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  0 siblings, 1 reply; 89+ messages in thread
From: Thomas Garnier @ 2017-05-11 23:17 UTC (permalink / raw)
  To: Greg KH
  Cc: Ingo Molnar, Kees Cook, Daniel Micay, Martin Schwidefsky,
	Heiko Carstens, Dave Hansen, Arnd Bergmann, Thomas Gleixner,
	David Howells, René Nyffenegger, Andrew Morton,
	Paul E . McKenney, Eric W . Biederman, Oleg Nesterov,
	Pavel Tikhomirov, Ingo Molnar, H . Peter Anvin, Andy Lutomirski,
	Paolo Bonzini

On Tue, May 9, 2017 at 7:29 AM, Thomas Garnier <thgarnie-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> wrote:
>
> On Tue, May 9, 2017 at 4:10 AM, Greg KH <greg-U8xfFu+wG4EAvxtiuMwx3w@public.gmane.org> wrote:
> > On Tue, May 09, 2017 at 08:56:19AM +0200, Ingo Molnar wrote:
> >>
> >> * Kees Cook <keescook-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org> wrote:
> >>
> >> > > There's the option of using GCC plugins now that the infrastructure was
> >> > > upstreamed from grsecurity. It can be used as part of the regular build
> >> > > process and as long as the analysis is pretty simple it shouldn't hurt compile
> >> > > time much.
> >> >
> >> > Well, and that the situation may arise due to memory corruption, not from
> >> > poorly-matched set_fs() calls, which static analysis won't help solve. We need
> >> > to catch this bad kernel state because it is a very bad state to run in.
> >>
> >> If memory corruption corrupted the task state into having addr_limit set to
> >> KERNEL_DS then there's already a fair chance that it's game over: it could also
> >> have set *uid to 0, or changed a sensitive PF_ flag, or a number of other
> >> things...
> >>
> >> Furthermore, think about it: there's literally an infinite amount of corrupted
> >> task states that could be a security problem and that could be checked after every
> >> system call. Do we want to check every one of them?
> >
> > Ok, I'm all for not checking lots of stuff all the time, just to protect
> > from crappy drivers that.  Especially as we _can_ audit and run checks
> > on the source code for them in the kernel tree.
> >
> > But, and here's the problem, outside of the desktop/enterprise world,
> > there are a ton of out-of-tree code that is crap.  The number of
> > security/bug fixes and kernel crashes for out-of-tree code in systems
> > like Android phones is just so high it's laughable.
> >
> > When you have a device that is running 3.2 million lines of kernel code,
> > yet the diffstat of the tree compared to mainline adds 3 million lines
> > of code, there is bound to be a ton of issues/problems there.
> >
> > So this is an entirely different thing we need to try to protect
> > ourselves from.  A long time ago I laughed when I saw that Microsoft had
> > to do lots of "hardening" of their kernel to protect themselves from
> > crappy drivers, as I knew we didn't have to do that because we had the
> > source for them and could fix the root issues.  But that has changed and
> > now we don't all have that option.  That code is out-of-tree because the
> > vendor doesn't care, and doesn't want to take any time at all to do
> > anything resembling a real code review[1].
>
> That's a big part of why I thought would be useful. I am less worried
> about edge cases upstream right now than forks with custom codes not
> using set_fs correctly.
>
> >
> > So, how about options like the ones being proposed here, go behind a new
> > config option:
> >         CONFIG_PROTECT_FROM_CRAPPY_DRIVERS
> > that device owners can enable if they do not trust their vendor-provided
> > code (hint, I sure don't.)  That way the "normal" path that all of us
> > are used to running will be fine, but if you want to take the speed hit
> > to try to protect yourself, then you can do that as well.
>
> Maybe another name but why not.

Ingo: Do you want the change as-is? Would you like it to be optional?
What do you think?

>
> >
> > Anyway, just an idea...
> >
> > thanks,
> >
> > greg k-h
> >
> > [1] I am working really hard with lots of vendors to try to fix their
> >     broken development model, but that is going to take years to resolve
> >     as their device pipelines are years long, and changing their
> >     mindsets takes a long time...
>
>
>
> --
> Thomas




-- 
Thomas

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
       [not found]                                   ` <CAJcbSZEoRyewUtBHvqmNZL9FtT_q42Vmmd-EuC50x-ZRASiHHg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2017-05-11 23:44                                     ` Linus Torvalds
       [not found]                                       ` <CA+55aFwvQfs_X+paQF6Luc0Rq+W3J2fKuHRou7=ANcquDdXdDA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
                                                         ` (2 more replies)
  0 siblings, 3 replies; 89+ messages in thread
From: Linus Torvalds @ 2017-05-11 23:44 UTC (permalink / raw)
  To: Thomas Garnier
  Cc: Greg KH, Ingo Molnar, Kees Cook, Daniel Micay,
	Martin Schwidefsky, Heiko Carstens, Dave Hansen, Arnd Bergmann,
	Thomas Gleixner, David Howells, René Nyffenegger,
	Andrew Morton, Paul E . McKenney, Eric W . Biederman,
	Oleg Nesterov, Pavel Tikhomirov, Ingo Molnar, H . Peter Anvin,
	Andy Lutomirski

On Thu, May 11, 2017 at 4:17 PM, Thomas Garnier <thgarnie-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> wrote:
>
> Ingo: Do you want the change as-is? Would you like it to be optional?
> What do you think?

I'm not ingo, but I don't like that patch. It's in the wrong place -
that system call return code is too timing-critical to add address
limit checks.

Now what I think you *could* do is:

 - make "set_fs()" actually set a work flag in the current thread flags

 - do the test in the slow-path (syscall_return_slowpath).

Yes, yes, that ends up being architecture-specific, but it's fairly simple.

And it only slows down the system calls that actually use "set_fs()".
Sure, it will slow those down a fair amount, but they are hopefully a
small subset of all cases.

How does that sound to people?  Thats' where we currently do that

        if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
            WARN(irqs_disabled(), "syscall %ld left IRQs disabled",
regs->orig_ax))
                local_irq_enable();

check too, which is a fairly similar issue.

               Linus

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
       [not found]                                       ` <CA+55aFwvQfs_X+paQF6Luc0Rq+W3J2fKuHRou7=ANcquDdXdDA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2017-05-12  5:28                                         ` Martin Schwidefsky
  2017-05-12  5:34                                           ` Kees Cook
  0 siblings, 1 reply; 89+ messages in thread
From: Martin Schwidefsky @ 2017-05-12  5:28 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Thomas Garnier, Greg KH, Ingo Molnar, Kees Cook, Daniel Micay,
	Heiko Carstens, Dave Hansen, Arnd Bergmann, Thomas Gleixner,
	David Howells, René Nyffenegger, Andrew Morton,
	Paul E . McKenney, Eric W . Biederman, Oleg Nesterov,
	Pavel Tikhomirov, Ingo Molnar, H . Peter Anvin, Andy Lutomirski

On Thu, 11 May 2017 16:44:07 -0700
Linus Torvalds <torvalds-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org> wrote:

> On Thu, May 11, 2017 at 4:17 PM, Thomas Garnier <thgarnie-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> wrote:
> >
> > Ingo: Do you want the change as-is? Would you like it to be optional?
> > What do you think?  
> 
> I'm not ingo, but I don't like that patch. It's in the wrong place -
> that system call return code is too timing-critical to add address
> limit checks.
> 
> Now what I think you *could* do is:
> 
>  - make "set_fs()" actually set a work flag in the current thread flags
> 
>  - do the test in the slow-path (syscall_return_slowpath).
> 
> Yes, yes, that ends up being architecture-specific, but it's fairly simple.
> 
> And it only slows down the system calls that actually use "set_fs()".
> Sure, it will slow those down a fair amount, but they are hopefully a
> small subset of all cases.
> 
> How does that sound to people?  Thats' where we currently do that
> 
>         if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
>             WARN(irqs_disabled(), "syscall %ld left IRQs disabled",
> regs->orig_ax))
>                 local_irq_enable();
> 
> check too, which is a fairly similar issue.

This is exactly what Heiko did for the s390 backend as a result of this
discussion. See the _CIF_ASCE_SECONDARY bit in arch/s390/kernel/entry.S,
for the hot patch the check for the bit is included in the general
_CIF_WORK test. Only the slow patch gets a bit slower.

git commit b5a882fcf146c87cb6b67c6df353e1c042b8773d
"s390: restore address space when returning to user space".

-- 
blue skies,
   Martin.

"Reality continues to ruin my life." - Calvin.

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
  2017-05-12  5:28                                         ` Martin Schwidefsky
@ 2017-05-12  5:34                                           ` Kees Cook
  2017-05-12  5:54                                             ` Martin Schwidefsky
       [not found]                                             ` <CAGXu5j+EatK=DYONRkgovwLgytAnbG8jnAZaMSLckZFNVj3gig-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  0 siblings, 2 replies; 89+ messages in thread
From: Kees Cook @ 2017-05-12  5:34 UTC (permalink / raw)
  To: Martin Schwidefsky
  Cc: Linus Torvalds, Thomas Garnier, Greg KH, Ingo Molnar,
	Daniel Micay, Heiko Carstens, Dave Hansen, Arnd Bergmann,
	Thomas Gleixner, David Howells, René Nyffenegger,
	Andrew Morton, Paul E . McKenney, Eric W . Biederman,
	Oleg Nesterov, Pavel Tikhomirov, Ingo Molnar, H . Peter Anvin,
	Andy Lutomirski

On Thu, May 11, 2017 at 10:28 PM, Martin Schwidefsky
<schwidefsky-tA70FqPdS9bQT0dZR+AlfA@public.gmane.org> wrote:
> On Thu, 11 May 2017 16:44:07 -0700
> Linus Torvalds <torvalds-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org> wrote:
>
>> On Thu, May 11, 2017 at 4:17 PM, Thomas Garnier <thgarnie-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> wrote:
>> >
>> > Ingo: Do you want the change as-is? Would you like it to be optional?
>> > What do you think?
>>
>> I'm not ingo, but I don't like that patch. It's in the wrong place -
>> that system call return code is too timing-critical to add address
>> limit checks.
>>
>> Now what I think you *could* do is:
>>
>>  - make "set_fs()" actually set a work flag in the current thread flags
>>
>>  - do the test in the slow-path (syscall_return_slowpath).
>>
>> Yes, yes, that ends up being architecture-specific, but it's fairly simple.
>>
>> And it only slows down the system calls that actually use "set_fs()".
>> Sure, it will slow those down a fair amount, but they are hopefully a
>> small subset of all cases.
>>
>> How does that sound to people?  Thats' where we currently do that
>>
>>         if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
>>             WARN(irqs_disabled(), "syscall %ld left IRQs disabled",
>> regs->orig_ax))
>>                 local_irq_enable();
>>
>> check too, which is a fairly similar issue.
>
> This is exactly what Heiko did for the s390 backend as a result of this
> discussion. See the _CIF_ASCE_SECONDARY bit in arch/s390/kernel/entry.S,
> for the hot patch the check for the bit is included in the general
> _CIF_WORK test. Only the slow patch gets a bit slower.
>
> git commit b5a882fcf146c87cb6b67c6df353e1c042b8773d
> "s390: restore address space when returning to user space".

If I'm understanding this, it won't catch corruption of addr_limit
during fast-path syscalls, though (i.e. addr_limit changed without a
call to set_fs()). :( This addr_limit corruption is mostly only a risk
archs without THREAD_INFO_IN_TASK, but it would still be nice to catch
unbalanced set_fs() code, so I like the idea. I like getting rid of
addr_limit entirely even more, but that'll take some time. :)

-Kees


-- 
Kees Cook
Pixel Security

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
  2017-05-12  5:34                                           ` Kees Cook
@ 2017-05-12  5:54                                             ` Martin Schwidefsky
  2017-05-12 19:01                                               ` Kees Cook
       [not found]                                             ` <CAGXu5j+EatK=DYONRkgovwLgytAnbG8jnAZaMSLckZFNVj3gig-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  1 sibling, 1 reply; 89+ messages in thread
From: Martin Schwidefsky @ 2017-05-12  5:54 UTC (permalink / raw)
  To: Kees Cook
  Cc: Linus Torvalds, Thomas Garnier, Greg KH, Ingo Molnar,
	Daniel Micay, Heiko Carstens, Dave Hansen, Arnd Bergmann,
	Thomas Gleixner, David Howells, René Nyffenegger,
	Andrew Morton, Paul E . McKenney, Eric W . Biederman,
	Oleg Nesterov, Pavel Tikhomirov, Ingo Molnar, H . Peter Anvin,
	Andy Lutomirski

On Thu, 11 May 2017 22:34:31 -0700
Kees Cook <keescook@chromium.org> wrote:

> On Thu, May 11, 2017 at 10:28 PM, Martin Schwidefsky
> <schwidefsky@de.ibm.com> wrote:
> > On Thu, 11 May 2017 16:44:07 -0700
> > Linus Torvalds <torvalds@linux-foundation.org> wrote:
> >  
> >> On Thu, May 11, 2017 at 4:17 PM, Thomas Garnier <thgarnie@google.com> wrote:  
> >> >
> >> > Ingo: Do you want the change as-is? Would you like it to be optional?
> >> > What do you think?  
> >>
> >> I'm not ingo, but I don't like that patch. It's in the wrong place -
> >> that system call return code is too timing-critical to add address
> >> limit checks.
> >>
> >> Now what I think you *could* do is:
> >>
> >>  - make "set_fs()" actually set a work flag in the current thread flags
> >>
> >>  - do the test in the slow-path (syscall_return_slowpath).
> >>
> >> Yes, yes, that ends up being architecture-specific, but it's fairly simple.
> >>
> >> And it only slows down the system calls that actually use "set_fs()".
> >> Sure, it will slow those down a fair amount, but they are hopefully a
> >> small subset of all cases.
> >>
> >> How does that sound to people?  Thats' where we currently do that
> >>
> >>         if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
> >>             WARN(irqs_disabled(), "syscall %ld left IRQs disabled",
> >> regs->orig_ax))
> >>                 local_irq_enable();
> >>
> >> check too, which is a fairly similar issue.  
> >
> > This is exactly what Heiko did for the s390 backend as a result of this
> > discussion. See the _CIF_ASCE_SECONDARY bit in arch/s390/kernel/entry.S,
> > for the hot patch the check for the bit is included in the general
> > _CIF_WORK test. Only the slow patch gets a bit slower.
> >
> > git commit b5a882fcf146c87cb6b67c6df353e1c042b8773d
> > "s390: restore address space when returning to user space".  
> 
> If I'm understanding this, it won't catch corruption of addr_limit
> during fast-path syscalls, though (i.e. addr_limit changed without a
> call to set_fs()). :( This addr_limit corruption is mostly only a risk
> archs without THREAD_INFO_IN_TASK, but it would still be nice to catch
> unbalanced set_fs() code, so I like the idea. I like getting rid of
> addr_limit entirely even more, but that'll take some time. :)

Well for s390 there is no addr_limit as we use two separate address space
for kernel vs. user. The equivalent to the addr_limit corruption on a
fast-path syscall would be changing CR7 outside of set_fs. This boils
down to the question what we are protection against? Bad code with 
unbalanced set_fs or evil code that changes addr_limit/CR7 outside of
set_fs

-- 
blue skies,
   Martin.

"Reality continues to ruin my life." - Calvin.

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
  2017-05-11 23:44                                     ` Linus Torvalds
       [not found]                                       ` <CA+55aFwvQfs_X+paQF6Luc0Rq+W3J2fKuHRou7=ANcquDdXdDA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2017-05-12  6:13                                       ` Andy Lutomirski
  2017-05-12  6:58                                       ` Ingo Molnar
  2 siblings, 0 replies; 89+ messages in thread
From: Andy Lutomirski @ 2017-05-12  6:13 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Thomas Garnier, Greg KH, Ingo Molnar, Kees Cook, Daniel Micay,
	Martin Schwidefsky, Heiko Carstens, Dave Hansen, Arnd Bergmann,
	Thomas Gleixner, David Howells, René Nyffenegger,
	Andrew Morton, Paul E . McKenney, Eric W . Biederman,
	Oleg Nesterov, Pavel Tikhomirov, Ingo Molnar, H . Peter Anvin

[resending because kernel.org seems to have mangled my SMTP
credentials.  I wonder if this is a common problem.]

On Thu, May 11, 2017 at 4:44 PM, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
> On Thu, May 11, 2017 at 4:17 PM, Thomas Garnier <thgarnie@google.com> wrote:
>>
>> Ingo: Do you want the change as-is? Would you like it to be optional?
>> What do you think?
>
> I'm not ingo, but I don't like that patch. It's in the wrong place -
> that system call return code is too timing-critical to add address
> limit checks.
>
> Now what I think you *could* do is:
>
>  - make "set_fs()" actually set a work flag in the current thread flags
>
>  - do the test in the slow-path (syscall_return_slowpath).
>
> Yes, yes, that ends up being architecture-specific, but it's fairly simple.
>
> And it only slows down the system calls that actually use "set_fs()".
> Sure, it will slow those down a fair amount, but they are hopefully a
> small subset of all cases.
>
> How does that sound to people?  Thats' where we currently do that
>
>         if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
>             WARN(irqs_disabled(), "syscall %ld left IRQs disabled",
> regs->orig_ax))
>                 local_irq_enable();
>
> check too, which is a fairly similar issue.
>

I like this.  It wouldn't help the problem that I suspect is a major
part of the motivation for this patch: a stack overflow could
overwrite addr_limit.  But we fixed that for real already.

Slightly off-topic: I would *love* to see syscall_return_slowpath() or
similar moved or at least mostly moved into generic code.  Aside from
the fact that it used to be written in asm, there's nothing
fundamentally arch-specific about it.

>
> And it only slows down the system calls that actually use "set_fs()".
> Sure, it will slow those down a fair amount, but they are hopefully a
> small subset of all cases.

It won't even slow them down that much.  The slow path is reasonably
fast these days.

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
       [not found]                                             ` <CAGXu5j+EatK=DYONRkgovwLgytAnbG8jnAZaMSLckZFNVj3gig-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2017-05-12  6:57                                               ` Ingo Molnar
  0 siblings, 0 replies; 89+ messages in thread
From: Ingo Molnar @ 2017-05-12  6:57 UTC (permalink / raw)
  To: Kees Cook
  Cc: Martin Schwidefsky, Linus Torvalds, Thomas Garnier, Greg KH,
	Daniel Micay, Heiko Carstens, Dave Hansen, Arnd Bergmann,
	Thomas Gleixner, David Howells, René Nyffenegger,
	Andrew Morton, Paul E . McKenney, Eric W . Biederman,
	Oleg Nesterov, Pavel Tikhomirov, Ingo Molnar, H . Peter Anvin,
	Andy Lutomirski


* Kees Cook <keescook-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org> wrote:

> > git commit b5a882fcf146c87cb6b67c6df353e1c042b8773d
> > "s390: restore address space when returning to user space".
> 
> If I'm understanding this, it won't catch corruption of addr_limit
> during fast-path syscalls, though (i.e. addr_limit changed without a
> call to set_fs()). :(

Nor does it, or the patch you propose, protect against against something 
corrupting task->mm pointer, or the task->*uid values, or any of the myriads of 
security relevant values stored in the task structure!

Making sure API (set_fs()) usage is bug-free and protecting against the effects of 
general data corruption are two unrelated things that should not mixed.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
  2017-05-11 23:44                                     ` Linus Torvalds
       [not found]                                       ` <CA+55aFwvQfs_X+paQF6Luc0Rq+W3J2fKuHRou7=ANcquDdXdDA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  2017-05-12  6:13                                       ` Andy Lutomirski
@ 2017-05-12  6:58                                       ` Ingo Molnar
  2017-05-12 17:05                                         ` Thomas Garnier
  2 siblings, 1 reply; 89+ messages in thread
From: Ingo Molnar @ 2017-05-12  6:58 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Thomas Garnier, Greg KH, Kees Cook, Daniel Micay,
	Martin Schwidefsky, Heiko Carstens, Dave Hansen, Arnd Bergmann,
	Thomas Gleixner, David Howells, René Nyffenegger,
	Andrew Morton, Paul E . McKenney, Eric W . Biederman,
	Oleg Nesterov, Pavel Tikhomirov, Ingo Molnar, H . Peter Anvin,
	Andy Lutomirski


* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> On Thu, May 11, 2017 at 4:17 PM, Thomas Garnier <thgarnie@google.com> wrote:
> >
> > Ingo: Do you want the change as-is? Would you like it to be optional?
> > What do you think?
> 
> I'm not ingo, but I don't like that patch. It's in the wrong place -
> that system call return code is too timing-critical to add address
> limit checks.
> 
> Now what I think you *could* do is:
> 
>  - make "set_fs()" actually set a work flag in the current thread flags
> 
>  - do the test in the slow-path (syscall_return_slowpath).
> 
> Yes, yes, that ends up being architecture-specific, but it's fairly simple.
> 
> And it only slows down the system calls that actually use "set_fs()".
> Sure, it will slow those down a fair amount, but they are hopefully a
> small subset of all cases.
> 
> How does that sound to people?  Thats' where we currently do that
> 
>         if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
>             WARN(irqs_disabled(), "syscall %ld left IRQs disabled",
> regs->orig_ax))
>                 local_irq_enable();
> 
> check too, which is a fairly similar issue.

I really like that idea and I'd be perfectly fine with that solution, because it 
puts the overhead where the problem comes from, and adds an extra incentive for 
code to move away from set_fs() facilities. Win-win.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
       [not found]                       ` <CALCETrUh8NO2scaqEM48K70Fo2+V3=Cpyk4JurCDiCYp4nm_+g-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  2017-05-10  7:37                         ` [kernel-hardening] " Arnd Bergmann
@ 2017-05-12  7:00                         ` Ingo Molnar
  2017-05-12  7:15                           ` Al Viro
  1 sibling, 1 reply; 89+ messages in thread
From: Ingo Molnar @ 2017-05-12  7:00 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Christoph Hellwig, Greg KH, Thomas Garnier, Martin Schwidefsky,
	Heiko Carstens, Dave Hansen, Arnd Bergmann, Thomas Gleixner,
	David Howells, René Nyffenegger, Andrew Morton,
	Paul E . McKenney, Eric W . Biederman, Oleg Nesterov,
	Pavel Tikhomirov, Ingo Molnar, H . Peter Anvin, Paolo Bonzini,
	Rik van Riel


* Andy Lutomirski <luto-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org> wrote:

> On Tue, May 9, 2017 at 1:56 AM, Christoph Hellwig <hch-wEGCiKHe2LqWVfeAwA7xHQ@public.gmane.org> wrote:
> > On Tue, May 09, 2017 at 08:45:22AM +0200, Ingo Molnar wrote:
> >> We only have ~115 code blocks in the kernel that set/restore KERNEL_DS, it would
> >> be a pity to add a runtime check to every system call ...
> >
> > I think we should simply strive to remove all of them that aren't
> > in core scheduler / arch code.  Basically evetyytime we do the
> >
> >         oldfs = get_fs();
> >         set_fs(KERNEL_DS);
> >         ..
> >         set_fs(oldfs);
> >
> > trick we're doing something wrong, and there should always be better
> > ways to archive it.  E.g. using iov_iter with a ITER_KVEC type
> > consistently would already remove most of them.
> 
> How about trying to remove all of them?  If we could actually get rid
> of all of them, we could drop the arch support, and we'd get faster,
> simpler, shorter uaccess code throughout the kernel.

I'm all for that!

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
  2017-05-12  7:00                         ` Ingo Molnar
@ 2017-05-12  7:15                           ` Al Viro
  2017-05-12  7:35                             ` Christoph Hellwig
       [not found]                             ` <20170512071549.GP390-3bDd1+5oDREiFSDQTTA3OLVCufUGDwFn@public.gmane.org>
  0 siblings, 2 replies; 89+ messages in thread
From: Al Viro @ 2017-05-12  7:15 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Andy Lutomirski, Christoph Hellwig, Greg KH, Thomas Garnier,
	Martin Schwidefsky, Heiko Carstens, Dave Hansen, Arnd Bergmann,
	Thomas Gleixner, David Howells, René Nyffenegger,
	Andrew Morton, Paul E . McKenney, Eric W . Biederman,
	Oleg Nesterov, Pavel Tikhomirov, Ingo Molnar, H . Peter Anvin,
	Paolo Bonzini

On Fri, May 12, 2017 at 09:00:12AM +0200, Ingo Molnar wrote:

> > How about trying to remove all of them?  If we could actually get rid
> > of all of them, we could drop the arch support, and we'd get faster,
> > simpler, shorter uaccess code throughout the kernel.
> 
> I'm all for that!

Oh, for...  Ingo, do you really want to go through all ->write() and ->read()
instances, converting all of them to iov_iter?  Or, better yet, deal with
the patch flood from Nick Krause sock puppet brigade?

Folks, seriously, have you even looked through that zoo?  I have, and it's
really, really not fun.  Sure, we can say "fuck 'em, no need to allow
splice() on random crap".  Would be perfectly reasonable, expect that
it's not the only place doing kernel_write() and its ilk...

And converting everything to ->read_iter()/->write_iter() means an insane
amount of code churn, not to mention coping with random bogosities in
semantics.  ->read() and ->write() are going to stay around, pretty
much indefinitely.

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
  2017-05-12  7:15                           ` Al Viro
@ 2017-05-12  7:35                             ` Christoph Hellwig
  2017-05-12  8:07                               ` Christoph Hellwig
       [not found]                             ` <20170512071549.GP390-3bDd1+5oDREiFSDQTTA3OLVCufUGDwFn@public.gmane.org>
  1 sibling, 1 reply; 89+ messages in thread
From: Christoph Hellwig @ 2017-05-12  7:35 UTC (permalink / raw)
  To: Al Viro
  Cc: Mark Rutland, Kernel Hardening, Greg KH, Heiko Carstens, LKML,
	David Howells, Dave Hansen, H . Peter Anvin, Ingo Molnar,
	Pavel Tikhomirov, Peter Zijlstra, linux-s390,
	the arch/x86 maintainers, Russell King, Will Deacon,
	Christoph Hellwig, Christian Borntraeger, René Nyffenegger,
	Catalin Marinas, Paul E . McKenney, Rik van Riel, Kees Cook

On Fri, May 12, 2017 at 08:15:49AM +0100, Al Viro wrote:
> And converting everything to ->read_iter()/->write_iter() means an insane
> amount of code churn, not to mention coping with random bogosities in
> semantics.  ->read() and ->write() are going to stay around, pretty
> much indefinitely.

But I don't think kernel users of them have to.  I've been digging
through the calllers and will send an analysis to the list in a bit.

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
       [not found]                             ` <20170512071549.GP390-3bDd1+5oDREiFSDQTTA3OLVCufUGDwFn@public.gmane.org>
@ 2017-05-12  7:43                               ` Arnd Bergmann
  2017-05-12  8:11                                 ` Christoph Hellwig
  2017-05-12  8:11                                 ` Al Viro
  2017-05-12 23:20                               ` Andy Lutomirski
  1 sibling, 2 replies; 89+ messages in thread
From: Arnd Bergmann @ 2017-05-12  7:43 UTC (permalink / raw)
  To: Al Viro
  Cc: Ingo Molnar, Andy Lutomirski, Christoph Hellwig, Greg KH,
	Thomas Garnier, Martin Schwidefsky, Heiko Carstens, Dave Hansen,
	Thomas Gleixner, David Howells, René Nyffenegger,
	Andrew Morton, Paul E . McKenney, Eric W . Biederman,
	Oleg Nesterov, Pavel Tikhomirov, Ingo Molnar, H . Peter Anvin,
	Paolo Bonzini

On Fri, May 12, 2017 at 9:15 AM, Al Viro <viro-RmSDqhL/yNMiFSDQTTA3OLVCufUGDwFn@public.gmane.org> wrote:
> On Fri, May 12, 2017 at 09:00:12AM +0200, Ingo Molnar wrote:
>
>> > How about trying to remove all of them?  If we could actually get rid
>> > of all of them, we could drop the arch support, and we'd get faster,
>> > simpler, shorter uaccess code throughout the kernel.
>>
>> I'm all for that!
>
> Oh, for...  Ingo, do you really want to go through all ->write() and ->read()
> instances, converting all of them to iov_iter?  Or, better yet, deal with
> the patch flood from Nick Krause sock puppet brigade?

How realistic and how useful would it be to first completely eliminate
the ones that are in loadable modules and then wrapping the definition
in #ifndef MODULE (or even make it an extern function)?

This should be a fairly complete list of the modular users:

drivers/block/drbd/drbd_main.c: set_fs(KERNEL_DS);
drivers/input/serio/hp_sdc.c:   set_fs(KERNEL_DS);
drivers/media/v4l2-core/v4l2-compat-ioctl32.c:          set_fs(KERNEL_DS);
drivers/misc/lkdtm_bugs.c:      set_fs(KERNEL_DS);
drivers/s390/crypto/pkey_api.c: set_fs(KERNEL_DS);
drivers/staging/comedi/drivers/serial2002.c:    set_fs(KERNEL_DS);
drivers/staging/lustre/lnet/libcfs/tracefile.c: set_fs(get_ds());
drivers/staging/media/atomisp/pci/atomisp2/atomisp_compat_ioctl32.c:
 set_fs(KERNEL_DS);
drivers/staging/rtl8723bs/os_dep/osdep_service.c:               oldfs
= get_fs(); set_fs(get_ds());
drivers/usb/gadget/function/f_mass_storage.c:   set_fs(get_ds());
drivers/usb/gadget/function/u_uac1.c:   set_fs(KERNEL_DS);
drivers/vhost/vhost.c:  set_fs(USER_DS);
drivers/video/fbdev/core/fbmem.c:       set_fs(KERNEL_DS);
drivers/video/fbdev/hpfb.c:     set_fs(KERNEL_DS);
fs/autofs4/waitq.c:     set_fs(KERNEL_DS);
fs/binfmt_aout.c:       set_fs(KERNEL_DS);
fs/binfmt_elf.c:                set_fs(USER_DS);
fs/binfmt_elf_fdpic.c:  set_fs(KERNEL_DS);
fs/btrfs/send.c:        set_fs(KERNEL_DS);
fs/ext4/ioctl.c:                set_fs(KERNEL_DS);
fs/nfsd/vfs.c:  set_fs(KERNEL_DS);
net/9p/trans_fd.c:      set_fs(get_ds());
net/ipv6/addrconf.c:                    set_fs(KERNEL_DS);
net/ipv6/exthdrs.c:     set_fs(KERNEL_DS);
net/sunrpc/svcsock.c:   oldfs = get_fs(); set_fs(KERNEL_DS);
sound/core/oss/pcm_oss.c:       set_fs(get_ds());
sound/core/pcm_native.c:        set_fs(get_ds());
sound/drivers/opl3/opl3_oss.c:  set_fs(get_ds());
sound/oss/dmabuf.c:     set_fs(get_ds());
sound/oss/swarm_cs4297a.c:                set_fs(KERNEL_DS);
sound/pci/emu10k1/emufx.c:      set_fs(get_ds());
sound/pci/hda/hda_codec.c:              set_fs(get_ds());

     Arnd

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
  2017-05-12  7:35                             ` Christoph Hellwig
@ 2017-05-12  8:07                               ` Christoph Hellwig
  2017-05-12  8:23                                 ` Greg KH
  0 siblings, 1 reply; 89+ messages in thread
From: Christoph Hellwig @ 2017-05-12  8:07 UTC (permalink / raw)
  To: Al Viro
  Cc: Mark Rutland, Kernel Hardening, Greg KH, Heiko Carstens, LKML,
	David Howells, Dave Hansen, H . Peter Anvin, Ingo Molnar,
	Pavel Tikhomirov, Peter Zijlstra, linux-s390, Richard Weinberger,
	the arch/x86 maintainers, Russell King, Will Deacon,
	Christoph Hellwig, Christian Borntraeger, René Nyffenegger,
	Catalin Marinas, Paul E . McKenney

On Fri, May 12, 2017 at 12:35:21AM -0700, Christoph Hellwig wrote:
> On Fri, May 12, 2017 at 08:15:49AM +0100, Al Viro wrote:
> > And converting everything to ->read_iter()/->write_iter() means an insane
> > amount of code churn, not to mention coping with random bogosities in
> > semantics.  ->read() and ->write() are going to stay around, pretty
> > much indefinitely.
> 
> But I don't think kernel users of them have to.  I've been digging
> through the calllers and will send an analysis to the list in a bit.

Ok, here is the cleaned up list.  The targets for in-kernel I/O
are basically regular files on normal fs, block devices, pipes
and sockets, with a few narrow exceptions and a few unclear cases.

I've added a few maintainers to Cc to clarify, and the list is below:

When looking at kernel_read/kernel_readv/kernel_write/__kernel_write
instances, or vfs_read/vfs_readv/vfs_write/vfs_writev inststances with
set_fs tricks most of them simply require a regular file on a "real"
fs or a block device, pipe, socket:

 - various binary loaders: need to be regular files (+ on a "real" fs)
 - nandsim: expect a regular file or maybe a block device
 - code: expects a regular file
 - ecryptfs: expects a regular file
 - splice: can be a regular file or device file
 - security/keys: regular file
 - cachefiles: needs to be a regular file
 - coredump: usually a regular file, but possibly a pipe
 - acct: regular file
 - target: regular file
 - nfsd: regular file
 - lustre: regular file
 - f_mass_storage: regular file, or maybe block device
 - autofs: pipe
 - btrfs: regular file / pipe / socket
 - ima: regular file on specific file systems

Then there are a few interesting ones with specific targets:

 - sysctl: regular file on procfs
 - mconsole: procfs as far as I can tell.  Might be able to further
   narrow it down?
 - ashmem: regular file on shmem

The nommu mmap code needs to read everything that wants to support
a MAP_PRIVATE mmap, but as far as a I can tell: do a) address limits
not matter for nommu, and b) I have no f***cking idea why it doesn't
use readpage(s) to start with like the MMU code.

And a few that I can't figure:

 - cx25821: no idea what this opens, need to confirm with the maintainers
   (on Cc)
 - 9p: not clear.  looks like it might be sockets/pipes
 
 Last but not least we have a driver that's a complete mess:

 - series2002: /dev/tty* (drivers is in staging and needs to be rewritten
   using the proper in-kernel APIs anyway)

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
  2017-05-12  7:43                               ` [kernel-hardening] " Arnd Bergmann
@ 2017-05-12  8:11                                 ` Christoph Hellwig
  2017-05-12  8:16                                   ` Al Viro
  2017-05-12  8:11                                 ` Al Viro
  1 sibling, 1 reply; 89+ messages in thread
From: Christoph Hellwig @ 2017-05-12  8:11 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Mark Rutland, Kernel Hardening, Greg KH, Heiko Carstens, LKML,
	David Howells, Dave Hansen, H . Peter Anvin, Ingo Molnar,
	Pavel Tikhomirov, Peter Zijlstra, linux-s390,
	the arch/x86 maintainers, Russell King, Will Deacon,
	Christoph Hellwig, Christian Borntraeger, René Nyffenegger,
	Catalin Marinas, Paul E . McKenney, Rik van Riel, Kees Cook

On Fri, May 12, 2017 at 09:43:40AM +0200, Arnd Bergmann wrote:
> How realistic and how useful would it be to first completely eliminate
> the ones that are in loadable modules and then wrapping the definition
> in #ifndef MODULE (or even make it an extern function)?

Should be fairly doable and might be a nice step towards cleaning the
mess up.  In fact with my seres a large part of those are gone, and
most of the remaining handler are ioctl handlers or what seems like
opencoded versions of probe_kernel_read.

But it won't help against exploits modifying addr_limit manually.

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
  2017-05-12  7:43                               ` [kernel-hardening] " Arnd Bergmann
  2017-05-12  8:11                                 ` Christoph Hellwig
@ 2017-05-12  8:11                                 ` Al Viro
       [not found]                                   ` <20170512081154.GQ390-3bDd1+5oDREiFSDQTTA3OLVCufUGDwFn@public.gmane.org>
  1 sibling, 1 reply; 89+ messages in thread
From: Al Viro @ 2017-05-12  8:11 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Ingo Molnar, Andy Lutomirski, Christoph Hellwig, Greg KH,
	Thomas Garnier, Martin Schwidefsky, Heiko Carstens, Dave Hansen,
	Thomas Gleixner, David Howells, René Nyffenegger,
	Andrew Morton, Paul E . McKenney, Eric W . Biederman,
	Oleg Nesterov, Pavel Tikhomirov, Ingo Molnar, H . Peter Anvin,
	Paolo Bonzini

On Fri, May 12, 2017 at 09:43:40AM +0200, Arnd Bergmann wrote:

> How realistic and how useful would it be to first completely eliminate
> the ones that are in loadable modules and then wrapping the definition
> in #ifndef MODULE (or even make it an extern function)?

Eliminate _what_?  ->read() and ->write() instances?

> This should be a fairly complete list of the modular users:
> 
> drivers/block/drbd/drbd_main.c: set_fs(KERNEL_DS);

Ah, set_fs()...  Sure, many of those can be killed off.  Wouldn't be
a bad idea, but I don't understand what difference does modular/built-in
make here...

This one: AFAICS doesn't give a damn about set_fs() at all.

> drivers/input/serio/hp_sdc.c:   set_fs(KERNEL_DS);

Open-coded probe_kernel_read(), apparently.

> drivers/media/v4l2-core/v4l2-compat-ioctl32.c:          set_fs(KERNEL_DS);

massive compat ioctl crap.

> drivers/misc/lkdtm_bugs.c:      set_fs(KERNEL_DS);

insane.

> drivers/s390/crypto/pkey_api.c: set_fs(KERNEL_DS);

No idea.

> drivers/staging/comedi/drivers/serial2002.c:    set_fs(KERNEL_DS);

Open-coded kernel_write(); to some character device, no less...  And similar
for kernel_read(), apparently.

> drivers/staging/lustre/lnet/libcfs/tracefile.c: set_fs(get_ds());

Fuck knows; kernel_write() might do it.  Depends upon what it's writing
to.

You've missed other places in lustre, BTW - including the ioctls on
sockets, etc.

> drivers/staging/media/atomisp/pci/atomisp2/atomisp_compat_ioctl32.c:
>  set_fs(KERNEL_DS);

Compat ioctl crap, again.

> drivers/staging/rtl8723bs/os_dep/osdep_service.c:               oldfs
> = get_fs(); set_fs(get_ds());

Oh, lovely - reading an arbitrary (as in, specified by pathname) file.
Firmware (mis)handling?

> drivers/usb/gadget/function/f_mass_storage.c:   set_fs(get_ds());

No idea.

> drivers/usb/gadget/function/u_uac1.c:   set_fs(KERNEL_DS);

kernel_write(), by the look of it.  Or something similar.

> drivers/vhost/vhost.c:  set_fs(USER_DS);

kernel thread doing use_mm()

> drivers/video/fbdev/core/fbmem.c:       set_fs(KERNEL_DS);

compat ioctl.

> drivers/video/fbdev/hpfb.c:     set_fs(KERNEL_DS);

probe_kernel_read()

> fs/autofs4/waitq.c:     set_fs(KERNEL_DS);

kernel_write()

> fs/binfmt_aout.c:       set_fs(KERNEL_DS);
> fs/binfmt_elf.c:                set_fs(USER_DS);
> fs/binfmt_elf_fdpic.c:  set_fs(KERNEL_DS);

coredump stuff.

> fs/btrfs/send.c:        set_fs(KERNEL_DS);
kernel_write()

Anyway, what's special about modules?  IDGI...

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
  2017-05-12  8:11                                 ` Christoph Hellwig
@ 2017-05-12  8:16                                   ` Al Viro
  0 siblings, 0 replies; 89+ messages in thread
From: Al Viro @ 2017-05-12  8:16 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Arnd Bergmann, Ingo Molnar, Andy Lutomirski, Greg KH,
	Thomas Garnier, Martin Schwidefsky, Heiko Carstens, Dave Hansen,
	Thomas Gleixner, David Howells, René Nyffenegger,
	Andrew Morton, Paul E . McKenney, Eric W . Biederman,
	Oleg Nesterov, Pavel Tikhomirov, Ingo Molnar, H . Peter Anvin,
	Paolo Bonzini

On Fri, May 12, 2017 at 01:11:26AM -0700, Christoph Hellwig wrote:

> But it won't help against exploits modifying addr_limit manually.

Or the ones setting current->cred to that of init.  Your point being?

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
       [not found]                                   ` <20170512081154.GQ390-3bDd1+5oDREiFSDQTTA3OLVCufUGDwFn@public.gmane.org>
@ 2017-05-12  8:20                                     ` Arnd Bergmann
  0 siblings, 0 replies; 89+ messages in thread
From: Arnd Bergmann @ 2017-05-12  8:20 UTC (permalink / raw)
  To: Al Viro
  Cc: Ingo Molnar, Andy Lutomirski, Christoph Hellwig, Greg KH,
	Thomas Garnier, Martin Schwidefsky, Heiko Carstens, Dave Hansen,
	Thomas Gleixner, David Howells, René Nyffenegger,
	Andrew Morton, Paul E . McKenney, Eric W . Biederman,
	Oleg Nesterov, Pavel Tikhomirov, Ingo Molnar, H . Peter Anvin,
	Paolo Bonzini

On Fri, May 12, 2017 at 10:11 AM, Al Viro <viro-RmSDqhL/yNMiFSDQTTA3OLVCufUGDwFn@public.gmane.org> wrote:
> Anyway, what's special about modules?  IDGI...

One of the arguments that came up earlier was code in external modules
being mostly unaudited, sometimes without any source code available
at all but still used in devices.

If modules can't do set_fs() any more, this could eliminate bugs with
unpaired set_fs in those modules.

Limiting factors of course are:

- embedded systems that ship come with their own kernels (as opposed
  to using whatever users have, or relying on binary distros) can just
  make it available to modules again, by reverting the patch

- As Christoph said, they could have an open-coded set_fs in the
  driver

- Whatever other method a clueless driver write might come up with
  isn't necessarily better than set_fs().

     Arnd

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
  2017-05-12  8:07                               ` Christoph Hellwig
@ 2017-05-12  8:23                                 ` Greg KH
  0 siblings, 0 replies; 89+ messages in thread
From: Greg KH @ 2017-05-12  8:23 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Al Viro, Ingo Molnar, Andy Lutomirski, Thomas Garnier,
	Martin Schwidefsky, Heiko Carstens, Dave Hansen, Arnd Bergmann,
	Thomas Gleixner, David Howells, René Nyffenegger,
	Andrew Morton, Paul E . McKenney, Eric W . Biederman,
	Oleg Nesterov, Pavel Tikhomirov, Ingo Molnar, H . Peter Anvin,
	Paolo Bonzini, Rik van Riel, Kees Cook, Josh Poimboeuf

On Fri, May 12, 2017 at 01:07:41AM -0700, Christoph Hellwig wrote:
>  Last but not least we have a driver that's a complete mess:
> 
>  - series2002: /dev/tty* (drivers is in staging and needs to be rewritten
>    using the proper in-kernel APIs anyway)

I agree, that's a mess, and it needs to use the new serdev interface at
the least.  Worse case, we just delete the thing :)

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
  2017-05-12  6:58                                       ` Ingo Molnar
@ 2017-05-12 17:05                                         ` Thomas Garnier
  0 siblings, 0 replies; 89+ messages in thread
From: Thomas Garnier @ 2017-05-12 17:05 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Linus Torvalds, Greg KH, Kees Cook, Daniel Micay,
	Martin Schwidefsky, Heiko Carstens, Dave Hansen, Arnd Bergmann,
	Thomas Gleixner, David Howells, René Nyffenegger,
	Andrew Morton, Paul E . McKenney, Eric W . Biederman,
	Oleg Nesterov, Pavel Tikhomirov, Ingo Molnar, H . Peter Anvin,
	Andy Lutomirski, Paolo Bonzini, Rik van Riel, Josh Poimboeuf

On Thu, May 11, 2017 at 11:58 PM, Ingo Molnar <mingo@kernel.org> wrote:
>
> * Linus Torvalds <torvalds@linux-foundation.org> wrote:
>
>> On Thu, May 11, 2017 at 4:17 PM, Thomas Garnier <thgarnie@google.com> wrote:
>> >
>> > Ingo: Do you want the change as-is? Would you like it to be optional?
>> > What do you think?
>>
>> I'm not ingo, but I don't like that patch. It's in the wrong place -
>> that system call return code is too timing-critical to add address
>> limit checks.
>>
>> Now what I think you *could* do is:
>>
>>  - make "set_fs()" actually set a work flag in the current thread flags
>>
>>  - do the test in the slow-path (syscall_return_slowpath).
>>
>> Yes, yes, that ends up being architecture-specific, but it's fairly simple.
>>
>> And it only slows down the system calls that actually use "set_fs()".
>> Sure, it will slow those down a fair amount, but they are hopefully a
>> small subset of all cases.
>>
>> How does that sound to people?  Thats' where we currently do that
>>
>>         if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
>>             WARN(irqs_disabled(), "syscall %ld left IRQs disabled",
>> regs->orig_ax))
>>                 local_irq_enable();
>>
>> check too, which is a fairly similar issue.
>
> I really like that idea and I'd be perfectly fine with that solution, because it
> puts the overhead where the problem comes from, and adds an extra incentive for
> code to move away from set_fs() facilities. Win-win.

Great, I will adapt the patch for that.

>
> Thanks,
>
>         Ingo



-- 
Thomas

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
  2017-05-12  5:54                                             ` Martin Schwidefsky
@ 2017-05-12 19:01                                               ` Kees Cook
  2017-05-12 19:08                                                 ` Russell King - ARM Linux
       [not found]                                                 ` <CAGXu5jL9vUrn4kpjO+qa4cHmWBypeqP17OGbrMs=5Nz0YpQMZw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  0 siblings, 2 replies; 89+ messages in thread
From: Kees Cook @ 2017-05-12 19:01 UTC (permalink / raw)
  To: Martin Schwidefsky
  Cc: Linus Torvalds, Thomas Garnier, Greg KH, Ingo Molnar,
	Daniel Micay, Heiko Carstens, Dave Hansen, Arnd Bergmann,
	Thomas Gleixner, David Howells, René Nyffenegger,
	Andrew Morton, Paul E . McKenney, Eric W . Biederman,
	Oleg Nesterov, Pavel Tikhomirov, Ingo Molnar, H . Peter Anvin,
	Andy Lutomirski

On Thu, May 11, 2017 at 10:54 PM, Martin Schwidefsky
<schwidefsky-tA70FqPdS9bQT0dZR+AlfA@public.gmane.org> wrote:
> On Thu, 11 May 2017 22:34:31 -0700
> Kees Cook <keescook-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org> wrote:
>
>> On Thu, May 11, 2017 at 10:28 PM, Martin Schwidefsky
>> <schwidefsky-tA70FqPdS9bQT0dZR+AlfA@public.gmane.org> wrote:
>> > On Thu, 11 May 2017 16:44:07 -0700
>> > Linus Torvalds <torvalds-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org> wrote:
>> >
>> >> On Thu, May 11, 2017 at 4:17 PM, Thomas Garnier <thgarnie-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> wrote:
>> >> >
>> >> > Ingo: Do you want the change as-is? Would you like it to be optional?
>> >> > What do you think?
>> >>
>> >> I'm not ingo, but I don't like that patch. It's in the wrong place -
>> >> that system call return code is too timing-critical to add address
>> >> limit checks.
>> >>
>> >> Now what I think you *could* do is:
>> >>
>> >>  - make "set_fs()" actually set a work flag in the current thread flags
>> >>
>> >>  - do the test in the slow-path (syscall_return_slowpath).
>> >>
>> >> Yes, yes, that ends up being architecture-specific, but it's fairly simple.
>> >>
>> >> And it only slows down the system calls that actually use "set_fs()".
>> >> Sure, it will slow those down a fair amount, but they are hopefully a
>> >> small subset of all cases.
>> >>
>> >> How does that sound to people?  Thats' where we currently do that
>> >>
>> >>         if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
>> >>             WARN(irqs_disabled(), "syscall %ld left IRQs disabled",
>> >> regs->orig_ax))
>> >>                 local_irq_enable();
>> >>
>> >> check too, which is a fairly similar issue.
>> >
>> > This is exactly what Heiko did for the s390 backend as a result of this
>> > discussion. See the _CIF_ASCE_SECONDARY bit in arch/s390/kernel/entry.S,
>> > for the hot patch the check for the bit is included in the general
>> > _CIF_WORK test. Only the slow patch gets a bit slower.
>> >
>> > git commit b5a882fcf146c87cb6b67c6df353e1c042b8773d
>> > "s390: restore address space when returning to user space".
>>
>> If I'm understanding this, it won't catch corruption of addr_limit
>> during fast-path syscalls, though (i.e. addr_limit changed without a
>> call to set_fs()). :( This addr_limit corruption is mostly only a risk
>> archs without THREAD_INFO_IN_TASK, but it would still be nice to catch
>> unbalanced set_fs() code, so I like the idea. I like getting rid of
>> addr_limit entirely even more, but that'll take some time. :)
>
> Well for s390 there is no addr_limit as we use two separate address space
> for kernel vs. user. The equivalent to the addr_limit corruption on a
> fast-path syscall would be changing CR7 outside of set_fs. This boils
> down to the question what we are protection against? Bad code with
> unbalanced set_fs or evil code that changes addr_limit/CR7 outside of
> set_fs

Yeah, the risk for "corrupted addr_limit" is mainly a concern for
archs with addr_limit on the kernel stack. If I'm reading things
correctly, that means, from the archs I've been paying closer
attention to, it's an issue for arm, mips, and powerpc:

arch/arm/include/asm/uaccess.h: current_thread_info()->addr_limit = fs;
arch/arm/include/asm/thread_info.h:             (current_stack_pointer
& ~(THREAD_SIZE - 1));

arch/mips/include/asm/uaccess.h:#define set_fs(x)
(current_thread_info()->addr_limit = (x))
arch/mips/kernel/process.c:        * task stacks at THREAD_SIZE - 32

arch/powerpc/include/asm/uaccess.h:#define set_fs(val)
(current->thread.fs = (val))
arch/powerpc/kernel/process.c:          struct pt_regs *regs =
task_stack_page(current) + THREAD_SIZE;

(s390 uses a register, x86 and arm64 implement THREAD_INFO_IN_TASK.)
Targeting addr_limit through arbitrary write attacks isn't too common
since ... it's an arbitrary write. The issue with addr_limit was that
it can live on the kernel stack, which meant all kinds of
stack-related bugs can lead to it getting stomped on.

So, two goals to protect addr_limit:

- get it off the stack to make the difficulty of corruption on par
with other sensitive things that would require an arbitrary write
flaw.

- detect/block unbalanced set_fs() calls.

If we can get the former addressed by the remaining architectures,
then that class of attack will go away. For the latter, it sounds like
Linus's slowpath-exit will work nicely.

To me it looks like he architectures with addr_limit still on the
stack would still benefit from always-check-addr_limit on syscall
exit, but that would be arch-specific anyway.

And then, of course, we've got the parallel task of just removing
set_fs() entirely. :)

-Kees

-- 
Kees Cook
Pixel Security

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
  2017-05-12 19:01                                               ` Kees Cook
@ 2017-05-12 19:08                                                 ` Russell King - ARM Linux
       [not found]                                                 ` <CAGXu5jL9vUrn4kpjO+qa4cHmWBypeqP17OGbrMs=5Nz0YpQMZw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  1 sibling, 0 replies; 89+ messages in thread
From: Russell King - ARM Linux @ 2017-05-12 19:08 UTC (permalink / raw)
  To: Kees Cook
  Cc: Martin Schwidefsky, Linus Torvalds, Thomas Garnier, Greg KH,
	Ingo Molnar, Daniel Micay, Heiko Carstens, Dave Hansen,
	Arnd Bergmann, Thomas Gleixner, David Howells,
	René Nyffenegger, Andrew Morton, Paul E . McKenney,
	Eric W . Biederman, Oleg Nesterov, Pavel Tikhomirov, Ingo Molnar,
	H . Peter Anvin

On Fri, May 12, 2017 at 12:01:59PM -0700, Kees Cook wrote:
> Yeah, the risk for "corrupted addr_limit" is mainly a concern for
> archs with addr_limit on the kernel stack. If I'm reading things
> correctly, that means, from the archs I've been paying closer
> attention to, it's an issue for arm, mips, and powerpc:

I'd first want to uninline everything in uaccess.h first that makes
use of access_ok() - which I think is something that needs to happen
anyway.

-- 
RMK's Patch system: http://www.armlinux.org.uk/developer/patches/
FTTC broadband for 0.8mile line: currently at 9.6Mbps down 400kbps up
according to speedtest.net.

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
       [not found]                                                 ` <CAGXu5jL9vUrn4kpjO+qa4cHmWBypeqP17OGbrMs=5Nz0YpQMZw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2017-05-12 19:08                                                   ` Linus Torvalds
       [not found]                                                     ` <CA+55aFzbiBqsYb7vwO=+L4Vp_GOgPu+DBOrq4fBnyzq5DbBehg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  0 siblings, 1 reply; 89+ messages in thread
From: Linus Torvalds @ 2017-05-12 19:08 UTC (permalink / raw)
  To: Kees Cook
  Cc: Martin Schwidefsky, Thomas Garnier, Greg KH, Ingo Molnar,
	Daniel Micay, Heiko Carstens, Dave Hansen, Arnd Bergmann,
	Thomas Gleixner, David Howells, René Nyffenegger,
	Andrew Morton, Paul E . McKenney, Eric W . Biederman,
	Oleg Nesterov, Pavel Tikhomirov, Ingo Molnar, H . Peter Anvin,
	Andy Lutomirski

On Fri, May 12, 2017 at 12:01 PM, Kees Cook <keescook-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org> wrote:
>
> Yeah, the risk for "corrupted addr_limit" is mainly a concern for
> archs with addr_limit on the kernel stack. If I'm reading things
> correctly, that means, from the archs I've been paying closer
> attention to, it's an issue for arm, mips, and powerpc:

I don't understand why people are looking at addr_limit as some kind
of special thing.

If somebody is smashing the stack and corrupting thread info data, the
game is over. addr_limit is the *least* of your problems, and it's not
even all that likely that it will be increasing (it's much more likely
that it would be overwritten with a smaller value).

Quite frankly, this kind of idiotic discussion just makes me question
the whole idea of the patch.

Any "security" that is this specific is not real security, it's just
masturbatory garbage.

It may be worth checking that people use "set_fs()" properly. But stop
this idiotic crap. It just makes the kernel security people look like
the crazies.

There are enough incompetent crazy security people, don't go there.
The kinds of things it is worth protecting against are the big class
of generic issues, not the kind of "oh, but imagine if a cosmic ray
flips this particular word in memory" kind of crap that ignores all
the other words of memory.

Seriously, Kees. You are just making security people look bad. Stop it.

                       Linus

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
       [not found]                                                     ` <CA+55aFzbiBqsYb7vwO=+L4Vp_GOgPu+DBOrq4fBnyzq5DbBehg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2017-05-12 19:30                                                       ` Kees Cook
       [not found]                                                         ` <CAGXu5j+xmyJ6RhtPw9rUgs7k3sZ1KKWffvyGGG--oPfu9W42ng-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  0 siblings, 1 reply; 89+ messages in thread
From: Kees Cook @ 2017-05-12 19:30 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Martin Schwidefsky, Thomas Garnier, Greg KH, Ingo Molnar,
	Daniel Micay, Heiko Carstens, Dave Hansen, Arnd Bergmann,
	Thomas Gleixner, David Howells, René Nyffenegger,
	Andrew Morton, Paul E . McKenney, Eric W . Biederman,
	Oleg Nesterov, Pavel Tikhomirov, Ingo Molnar, H . Peter Anvin,
	Andy Lutomirski

On Fri, May 12, 2017 at 12:08 PM, Linus Torvalds
<torvalds-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org> wrote:
> On Fri, May 12, 2017 at 12:01 PM, Kees Cook <keescook-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org> wrote:
>> Yeah, the risk for "corrupted addr_limit" is mainly a concern for
>> archs with addr_limit on the kernel stack. If I'm reading things
>> correctly, that means, from the archs I've been paying closer
>> attention to, it's an issue for arm, mips, and powerpc:
>
> I don't understand why people are looking at addr_limit as some kind
> of special thing.
>
> If somebody is smashing the stack and corrupting thread info data, the
> game is over. addr_limit is the *least* of your problems, and it's not
> even all that likely that it will be increasing (it's much more likely
> that it would be overwritten with a smaller value).
>
> Quite frankly, this kind of idiotic discussion just makes me question
> the whole idea of the patch.
>
> Any "security" that is this specific is not real security, it's just
> masturbatory garbage.
>
> It may be worth checking that people use "set_fs()" properly. But stop
> this idiotic crap. It just makes the kernel security people look like
> the crazies.
>
> There are enough incompetent crazy security people, don't go there.
> The kinds of things it is worth protecting against are the big class
> of generic issues, not the kind of "oh, but imagine if a cosmic ray
> flips this particular word in memory" kind of crap that ignores all
> the other words of memory.
>
> Seriously, Kees. You are just making security people look bad. Stop it.

I'm clearly not explaining things well enough. I shouldn't say
"corruption", I should say "malicious manipulation". The methodology
of attacks against the stack are quite different from the other kinds
of attacks like use-after-free, heap overflow, etc. Being able to
exhaust the kernel stack (either due to deep recursion or unbounded
alloca()) means attackers can control a write to addr_limit, and then
leverage that into an actual arbitrary write via subsequent calls to
copy_to_user() pointed at kernel memory. This isn't theoretical, this
is how those attacks are performed. It may sound crazy, but it's real.

With thread_info off the stack, the whole problem goes away. It's
wonderful that this has happened for x86, arm64, and s390. There are
always going to be new methods of attack for everything, but while we
slowly address the design weakness (set_fs()), we can fix the low
hanging fruit too.

-Kees

-- 
Kees Cook
Pixel Security

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
       [not found]                                                         ` <CAGXu5j+xmyJ6RhtPw9rUgs7k3sZ1KKWffvyGGG--oPfu9W42ng-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2017-05-12 20:21                                                           ` Russell King - ARM Linux
  2017-05-12 20:30                                                             ` Peter Zijlstra
  2017-05-12 21:06                                                             ` Al Viro
  0 siblings, 2 replies; 89+ messages in thread
From: Russell King - ARM Linux @ 2017-05-12 20:21 UTC (permalink / raw)
  To: Kees Cook
  Cc: Linus Torvalds, Mark Rutland, Kernel Hardening, Greg KH,
	Heiko Carstens, LKML, David Howells, Dave Hansen,
	H . Peter Anvin, Ingo Molnar, Pavel Tikhomirov, linux-s390,
	the arch/x86 maintainers, Will Deacon, Christian Borntraeger,
	René Nyffenegger, Catalin Marinas, Paul E . McKenney

On Fri, May 12, 2017 at 12:30:02PM -0700, Kees Cook wrote:
> I'm clearly not explaining things well enough. I shouldn't say
> "corruption", I should say "malicious manipulation". The methodology
> of attacks against the stack are quite different from the other kinds
> of attacks like use-after-free, heap overflow, etc. Being able to
> exhaust the kernel stack (either due to deep recursion or unbounded
> alloca())

I really hope we don't have alloca() use in the kernel.  Do you have
evidence to support that assertion?

IMHO alloca() (or similar) should not be present in any kernel code
because we have a limited stack - we have kmalloc() etc for that kind
of thing.

-- 
RMK's Patch system: http://www.armlinux.org.uk/developer/patches/
FTTC broadband for 0.8mile line: currently at 9.6Mbps down 400kbps up
according to speedtest.net.

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
  2017-05-12 20:21                                                           ` Russell King - ARM Linux
@ 2017-05-12 20:30                                                             ` Peter Zijlstra
       [not found]                                                               ` <20170512203044.GI4626-IIpfhp3q70z/8w/KjCw3T+5/BudmfyzbbVWyRVo5IupeoWH0uzbU5w@public.gmane.org>
  2017-05-13  7:21                                                               ` Christoph Hellwig
  2017-05-12 21:06                                                             ` Al Viro
  1 sibling, 2 replies; 89+ messages in thread
From: Peter Zijlstra @ 2017-05-12 20:30 UTC (permalink / raw)
  To: Russell King - ARM Linux
  Cc: Mark Rutland, Brian Gerst, Kernel Hardening, Catalin Marinas,
	Heiko Carstens, Oleg Nesterov, David Howells, Dave Hansen,
	H . Peter Anvin, Ingo Molnar, Pavel Tikhomirov, linux-s390,
	the arch/x86 maintainers, Christian Borntraeger,
	René Nyffenegger, Greg KH, Paul E . McKenney, Rik van Riel,
	Kees Cook, Arnd Bergmann, Will Deacon, Borislav Petkov

On Fri, May 12, 2017 at 09:21:06PM +0100, Russell King - ARM Linux wrote:
> On Fri, May 12, 2017 at 12:30:02PM -0700, Kees Cook wrote:
> > I'm clearly not explaining things well enough. I shouldn't say
> > "corruption", I should say "malicious manipulation". The methodology
> > of attacks against the stack are quite different from the other kinds
> > of attacks like use-after-free, heap overflow, etc. Being able to
> > exhaust the kernel stack (either due to deep recursion or unbounded
> > alloca())
> 
> I really hope we don't have alloca() use in the kernel.  Do you have
> evidence to support that assertion?
> 
> IMHO alloca() (or similar) should not be present in any kernel code
> because we have a limited stack - we have kmalloc() etc for that kind
> of thing.

On stack variable length arrays get implemented by the compiler doing
alloca(), and we sadly have a few of those around.

But yes, fully agreed on the desirability of alloca() and things.

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
       [not found]                                                               ` <20170512203044.GI4626-IIpfhp3q70z/8w/KjCw3T+5/BudmfyzbbVWyRVo5IupeoWH0uzbU5w@public.gmane.org>
@ 2017-05-12 20:45                                                                 ` Russell King - ARM Linux
  2017-05-12 21:00                                                                   ` Kees Cook
  0 siblings, 1 reply; 89+ messages in thread
From: Russell King - ARM Linux @ 2017-05-12 20:45 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Mark Rutland, Brian Gerst, Kernel Hardening, Catalin Marinas,
	Heiko Carstens, Oleg Nesterov, David Howells, Dave Hansen,
	H . Peter Anvin, Ingo Molnar, Pavel Tikhomirov, linux-s390,
	the arch/x86 maintainers, Christian Borntraeger,
	René Nyffenegger, Greg KH, Paul E . McKenney, Rik van Riel,
	Kees Cook

On Fri, May 12, 2017 at 10:30:44PM +0200, Peter Zijlstra wrote:
> On Fri, May 12, 2017 at 09:21:06PM +0100, Russell King - ARM Linux wrote:
> > On Fri, May 12, 2017 at 12:30:02PM -0700, Kees Cook wrote:
> > > I'm clearly not explaining things well enough. I shouldn't say
> > > "corruption", I should say "malicious manipulation". The methodology
> > > of attacks against the stack are quite different from the other kinds
> > > of attacks like use-after-free, heap overflow, etc. Being able to
> > > exhaust the kernel stack (either due to deep recursion or unbounded
> > > alloca())
> > 
> > I really hope we don't have alloca() use in the kernel.  Do you have
> > evidence to support that assertion?
> > 
> > IMHO alloca() (or similar) should not be present in any kernel code
> > because we have a limited stack - we have kmalloc() etc for that kind
> > of thing.
> 
> On stack variable length arrays get implemented by the compiler doing
> alloca(), and we sadly have a few of those around.

I hope their size is appropriately limited, but something tells me it
would be foolish to assume that.

> But yes, fully agreed on the desirability of alloca() and things.

Hmm, I wonder if -fno-builtin-alloca would prevent those... it looks
like it certainly would prevent an explicit alloca() call.

-- 
RMK's Patch system: http://www.armlinux.org.uk/developer/patches/
FTTC broadband for 0.8mile line: currently at 9.6Mbps down 400kbps up
according to speedtest.net.

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
  2017-05-12 20:45                                                                 ` Russell King - ARM Linux
@ 2017-05-12 21:00                                                                   ` Kees Cook
       [not found]                                                                     ` <CAGXu5jL6FPuShBpZfi6+XHqOk4gxocUJRYPHT5oR3HYh3xm+sA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  0 siblings, 1 reply; 89+ messages in thread
From: Kees Cook @ 2017-05-12 21:00 UTC (permalink / raw)
  To: Russell King - ARM Linux
  Cc: Peter Zijlstra, Mark Rutland, Brian Gerst, Kernel Hardening,
	Catalin Marinas, Heiko Carstens, Oleg Nesterov, David Howells,
	Dave Hansen, H . Peter Anvin, Ingo Molnar, Pavel Tikhomirov,
	linux-s390, the arch/x86 maintainers, Christian Borntraeger,
	René Nyffenegger, Greg KH, Paul E . McKenney

On Fri, May 12, 2017 at 1:45 PM, Russell King - ARM Linux
<linux@armlinux.org.uk> wrote:
> On Fri, May 12, 2017 at 10:30:44PM +0200, Peter Zijlstra wrote:
>> On Fri, May 12, 2017 at 09:21:06PM +0100, Russell King - ARM Linux wrote:
>> > On Fri, May 12, 2017 at 12:30:02PM -0700, Kees Cook wrote:
>> > > I'm clearly not explaining things well enough. I shouldn't say
>> > > "corruption", I should say "malicious manipulation". The methodology
>> > > of attacks against the stack are quite different from the other kinds
>> > > of attacks like use-after-free, heap overflow, etc. Being able to
>> > > exhaust the kernel stack (either due to deep recursion or unbounded
>> > > alloca())
>> >
>> > I really hope we don't have alloca() use in the kernel.  Do you have
>> > evidence to support that assertion?
>> >
>> > IMHO alloca() (or similar) should not be present in any kernel code
>> > because we have a limited stack - we have kmalloc() etc for that kind
>> > of thing.
>>
>> On stack variable length arrays get implemented by the compiler doing
>> alloca(), and we sadly have a few of those around.
>
> I hope their size is appropriately limited, but something tells me it
> would be foolish to assume that.
>
>> But yes, fully agreed on the desirability of alloca() and things.
>
> Hmm, I wonder if -fno-builtin-alloca would prevent those... it looks
> like it certainly would prevent an explicit alloca() call.

Building with -Werror=vla is exciting. :)

A lot of it is in crypto (which are relatively static sizes, just
using function callbacks), but there is plenty more.

-Kees

-- 
Kees Cook
Pixel Security

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
       [not found]                                                                     ` <CAGXu5jL6FPuShBpZfi6+XHqOk4gxocUJRYPHT5oR3HYh3xm+sA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2017-05-12 21:04                                                                       ` Kees Cook
  0 siblings, 0 replies; 89+ messages in thread
From: Kees Cook @ 2017-05-12 21:04 UTC (permalink / raw)
  To: Russell King - ARM Linux
  Cc: Peter Zijlstra, Mark Rutland, Brian Gerst, Kernel Hardening,
	Catalin Marinas, Heiko Carstens, Oleg Nesterov, David Howells,
	Dave Hansen, H . Peter Anvin, Ingo Molnar, Pavel Tikhomirov,
	linux-s390, the arch/x86 maintainers, Christian Borntraeger,
	René Nyffenegger, Greg KH, Paul E . McKenney

On Fri, May 12, 2017 at 2:00 PM, Kees Cook <keescook-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org> wrote:
> On Fri, May 12, 2017 at 1:45 PM, Russell King - ARM Linux
> <linux-I+IVW8TIWO2tmTQ+vhA3Yw@public.gmane.org> wrote:
>> On Fri, May 12, 2017 at 10:30:44PM +0200, Peter Zijlstra wrote:
>>> On Fri, May 12, 2017 at 09:21:06PM +0100, Russell King - ARM Linux wrote:
>>> > On Fri, May 12, 2017 at 12:30:02PM -0700, Kees Cook wrote:
>>> > > I'm clearly not explaining things well enough. I shouldn't say
>>> > > "corruption", I should say "malicious manipulation". The methodology
>>> > > of attacks against the stack are quite different from the other kinds
>>> > > of attacks like use-after-free, heap overflow, etc. Being able to
>>> > > exhaust the kernel stack (either due to deep recursion or unbounded
>>> > > alloca())
>>> >
>>> > I really hope we don't have alloca() use in the kernel.  Do you have
>>> > evidence to support that assertion?
>>> >
>>> > IMHO alloca() (or similar) should not be present in any kernel code
>>> > because we have a limited stack - we have kmalloc() etc for that kind
>>> > of thing.
>>>
>>> On stack variable length arrays get implemented by the compiler doing
>>> alloca(), and we sadly have a few of those around.
>>
>> I hope their size is appropriately limited, but something tells me it
>> would be foolish to assume that.
>>
>>> But yes, fully agreed on the desirability of alloca() and things.
>>
>> Hmm, I wonder if -fno-builtin-alloca would prevent those... it looks
>> like it certainly would prevent an explicit alloca() call.
>
> Building with -Werror=vla is exciting. :)
>
> A lot of it is in crypto (which are relatively static sizes, just
> using function callbacks), but there is plenty more.

I meant to also paste an example (which is harmless, I haven't looked
extensively at other examples):

        unsigned long alignmask = crypto_tfm_alg_alignmask(tfm);
        unsigned int size = crypto_tfm_alg_blocksize(tfm);
        u8 buffer[size + alignmask];

Looking at all the places (and having tried to remove a few of these
in pstore), I think it might be quite frustrating to eliminate them
all and then declare VLAs dead. I'm not against trying, though. :)

-Kees

-- 
Kees Cook
Pixel Security

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
  2017-05-12 20:21                                                           ` Russell King - ARM Linux
  2017-05-12 20:30                                                             ` Peter Zijlstra
@ 2017-05-12 21:06                                                             ` Al Viro
       [not found]                                                               ` <20170512210645.GS390-3bDd1+5oDREiFSDQTTA3OLVCufUGDwFn@public.gmane.org>
  1 sibling, 1 reply; 89+ messages in thread
From: Al Viro @ 2017-05-12 21:06 UTC (permalink / raw)
  To: Russell King - ARM Linux
  Cc: Kees Cook, Linus Torvalds, Mark Rutland, Kernel Hardening,
	Greg KH, Heiko Carstens, LKML, David Howells, Dave Hansen,
	H . Peter Anvin, Ingo Molnar, Pavel Tikhomirov, linux-s390,
	the arch/x86 maintainers, Will Deacon, Christian Borntraeger,
	René Nyffenegger, Catalin Marinas, Paul E . McKenney,
	Rik van Riel, Peter Zijlstra

On Fri, May 12, 2017 at 09:21:06PM +0100, Russell King - ARM Linux wrote:
> On Fri, May 12, 2017 at 12:30:02PM -0700, Kees Cook wrote:
> > I'm clearly not explaining things well enough. I shouldn't say
> > "corruption", I should say "malicious manipulation". The methodology
> > of attacks against the stack are quite different from the other kinds
> > of attacks like use-after-free, heap overflow, etc. Being able to
> > exhaust the kernel stack (either due to deep recursion or unbounded
> > alloca())
> 
> I really hope we don't have alloca() use in the kernel.  Do you have
> evidence to support that assertion?
> 
> IMHO alloca() (or similar) should not be present in any kernel code
> because we have a limited stack - we have kmalloc() etc for that kind
> of thing.

No alloca(), but there are VLAs.  Said that, the whole "what if they
can bugger thread_info and/or task_struct and go after set_fs() state"
is idiocy, of course - in that case the box is fucked, no matter what.

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
       [not found]                                                               ` <20170512210645.GS390-3bDd1+5oDREiFSDQTTA3OLVCufUGDwFn@public.gmane.org>
@ 2017-05-12 21:16                                                                 ` Daniel Micay
  2017-05-12 21:17                                                                 ` Kees Cook
  1 sibling, 0 replies; 89+ messages in thread
From: Daniel Micay @ 2017-05-12 21:16 UTC (permalink / raw)
  To: Al Viro, Russell King - ARM Linux
  Cc: Kees Cook, Linus Torvalds, Mark Rutland, Kernel Hardening,
	Greg KH, Heiko Carstens, LKML, David Howells, Dave Hansen,
	H . Peter Anvin, Ingo Molnar, Pavel Tikhomirov, linux-s390,
	the arch/x86 maintainers, Will Deacon, Christian Borntraeger,
	René Nyffenegger, Catalin Marinas

On Fri, 2017-05-12 at 22:06 +0100, Al Viro wrote:
> On Fri, May 12, 2017 at 09:21:06PM +0100, Russell King - ARM Linux
> wrote:
> > On Fri, May 12, 2017 at 12:30:02PM -0700, Kees Cook wrote:
> > > I'm clearly not explaining things well enough. I shouldn't say
> > > "corruption", I should say "malicious manipulation". The
> > > methodology
> > > of attacks against the stack are quite different from the other
> > > kinds
> > > of attacks like use-after-free, heap overflow, etc. Being able to
> > > exhaust the kernel stack (either due to deep recursion or
> > > unbounded
> > > alloca())
> > 
> > I really hope we don't have alloca() use in the kernel.  Do you have
> > evidence to support that assertion?
> > 
> > IMHO alloca() (or similar) should not be present in any kernel code
> > because we have a limited stack - we have kmalloc() etc for that
> > kind
> > of thing.
> 
> No alloca(), but there are VLAs.  Said that, the whole "what if they
> can bugger thread_info and/or task_struct and go after set_fs() state"
> is idiocy, of course - in that case the box is fucked, no matter what.

VMAP_STACK + -fstack-check would prevent exploiting even an unbounded
VLA / alloca size vs. it being an arbitrary write. -fstack-check
guarantees that there's one byte per page as the stack grows, although
there are some unfortunate GCC bugs making it less than perfect right
now... but they recently started caring about it more including making
it near zero overhead as it was always supposed to be.

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
       [not found]                                                               ` <20170512210645.GS390-3bDd1+5oDREiFSDQTTA3OLVCufUGDwFn@public.gmane.org>
  2017-05-12 21:16                                                                 ` [kernel-hardening] " Daniel Micay
@ 2017-05-12 21:17                                                                 ` Kees Cook
       [not found]                                                                   ` <CAGXu5jJu=VTqp2tzkPB4RAVxdGC+_SSQwrUwdzWpu24AA-zEcg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  1 sibling, 1 reply; 89+ messages in thread
From: Kees Cook @ 2017-05-12 21:17 UTC (permalink / raw)
  To: Al Viro
  Cc: Russell King - ARM Linux, Linus Torvalds, Mark Rutland,
	Kernel Hardening, Greg KH, Heiko Carstens, LKML, David Howells,
	Dave Hansen, H . Peter Anvin, Ingo Molnar, Pavel Tikhomirov,
	linux-s390, the arch/x86 maintainers, Will Deacon,
	Christian Borntraeger, René Nyffenegger, Catalin Marinas

On Fri, May 12, 2017 at 2:06 PM, Al Viro <viro-RmSDqhL/yNMiFSDQTTA3OLVCufUGDwFn@public.gmane.org> wrote:
> On Fri, May 12, 2017 at 09:21:06PM +0100, Russell King - ARM Linux wrote:
>> On Fri, May 12, 2017 at 12:30:02PM -0700, Kees Cook wrote:
>> > I'm clearly not explaining things well enough. I shouldn't say
>> > "corruption", I should say "malicious manipulation". The methodology
>> > of attacks against the stack are quite different from the other kinds
>> > of attacks like use-after-free, heap overflow, etc. Being able to
>> > exhaust the kernel stack (either due to deep recursion or unbounded
>> > alloca())
>>
>> I really hope we don't have alloca() use in the kernel.  Do you have
>> evidence to support that assertion?
>>
>> IMHO alloca() (or similar) should not be present in any kernel code
>> because we have a limited stack - we have kmalloc() etc for that kind
>> of thing.
>
> No alloca(), but there are VLAs.  Said that, the whole "what if they
> can bugger thread_info and/or task_struct and go after set_fs() state"
> is idiocy, of course - in that case the box is fucked, no matter what.

Two things are at risk from stack exhaustion: thread_info (mainly
addr_limit) when on the stack (fixed by THREAD_INFO_IN_TASK), and
overflow into adjacent allocations (fixed by VMAP_STACK). The latter
is fundamentally a heap overflow.

-Kees

-- 
Kees Cook
Pixel Security

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
       [not found]                                                                   ` <CAGXu5jJu=VTqp2tzkPB4RAVxdGC+_SSQwrUwdzWpu24AA-zEcg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2017-05-12 21:23                                                                     ` Daniel Micay
  2017-05-12 21:41                                                                     ` Al Viro
  1 sibling, 0 replies; 89+ messages in thread
From: Daniel Micay @ 2017-05-12 21:23 UTC (permalink / raw)
  To: Kees Cook, Al Viro
  Cc: Russell King - ARM Linux, Linus Torvalds, Mark Rutland,
	Kernel Hardening, Greg KH, Heiko Carstens, LKML, David Howells,
	Dave Hansen, H . Peter Anvin, Ingo Molnar, Pavel Tikhomirov,
	linux-s390, the arch/x86 maintainers, Will Deacon,
	Christian Borntraeger, René Nyffenegger, Catalin Marinas

> overflow into adjacent allocations (fixed by VMAP_STACK).

99% fixed, but it's possible to skip over the guard page without
-fstack-check enabled (plus some edge cases need to be fixed in GCC),
unless VLAs were forbidden in addition to the existing large frame size
warning.

I'm not sure about in-tree code, but Qualcomm had some of these
improperly bounded VLA vulnerabilities in their MSM kernel...

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
       [not found]                                                                   ` <CAGXu5jJu=VTqp2tzkPB4RAVxdGC+_SSQwrUwdzWpu24AA-zEcg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  2017-05-12 21:23                                                                     ` Daniel Micay
@ 2017-05-12 21:41                                                                     ` Al Viro
  2017-05-12 21:47                                                                       ` Rik van Riel
       [not found]                                                                       ` <20170512214144.GT390-3bDd1+5oDREiFSDQTTA3OLVCufUGDwFn@public.gmane.org>
  1 sibling, 2 replies; 89+ messages in thread
From: Al Viro @ 2017-05-12 21:41 UTC (permalink / raw)
  To: Kees Cook
  Cc: Russell King - ARM Linux, Linus Torvalds, Mark Rutland,
	Kernel Hardening, Greg KH, Heiko Carstens, LKML, David Howells,
	Dave Hansen, H . Peter Anvin, Ingo Molnar, Pavel Tikhomirov,
	linux-s390, the arch/x86 maintainers, Will Deacon,
	Christian Borntraeger, René Nyffenegger, Catalin Marinas

On Fri, May 12, 2017 at 02:17:19PM -0700, Kees Cook wrote:

> Two things are at risk from stack exhaustion: thread_info (mainly
> addr_limit) when on the stack (fixed by THREAD_INFO_IN_TASK), and

Really?  Let's take a look at arm, for example:

struct thread_info {
        unsigned long           flags;          /* low level flags */
        int                     preempt_count;  /* 0 => preemptable, <0 => bug */
        mm_segment_t            addr_limit;     /* address limit */
        struct task_struct      *task;          /* main task structure */

and current() is defined as current_thread_info()->task.

Seriously, look at these beasts.  Overwriting ->addr_limit is nowhere near
the top threat.  If attacker can overwrite thread_info, you have lost.

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
  2017-05-12 21:41                                                                     ` Al Viro
@ 2017-05-12 21:47                                                                       ` Rik van Riel
       [not found]                                                                         ` <1494625675.29205.21.camel-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
       [not found]                                                                       ` <20170512214144.GT390-3bDd1+5oDREiFSDQTTA3OLVCufUGDwFn@public.gmane.org>
  1 sibling, 1 reply; 89+ messages in thread
From: Rik van Riel @ 2017-05-12 21:47 UTC (permalink / raw)
  To: Al Viro, Kees Cook
  Cc: Russell King - ARM Linux, Linus Torvalds, Mark Rutland,
	Kernel Hardening, Greg KH, Heiko Carstens, LKML, David Howells,
	Dave Hansen, H . Peter Anvin, Ingo Molnar, Pavel Tikhomirov,
	linux-s390, the arch/x86 maintainers, Will Deacon,
	Christian Borntraeger, René Nyffenegger, Catalin Marinas

On Fri, 2017-05-12 at 22:41 +0100, Al Viro wrote:
> On Fri, May 12, 2017 at 02:17:19PM -0700, Kees Cook wrote:
> 
> > Two things are at risk from stack exhaustion: thread_info (mainly
> > addr_limit) when on the stack (fixed by THREAD_INFO_IN_TASK), and
> 
> Really?  Let's take a look at arm, for example:
> 
> struct thread_info {
>         unsigned long           flags;          /* low level flags */
>         int                     preempt_count;  /* 0 => preemptable,
> <0 => bug */
>         mm_segment_t            addr_limit;     /* address limit */
>         struct task_struct      *task;          /* main task
> structure */
> 
> and current() is defined as current_thread_info()->task.
> 
> Seriously, look at these beasts.  Overwriting ->addr_limit is nowhere
> near
> the top threat.  If attacker can overwrite thread_info, you have
> lost.

That is why THREAD_INFO_IN_TASK exists. It moves
the struct thread_info to a location away from the
stack, which means a stack overflow will not overwrite
the thread_info.

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
       [not found]                                                                       ` <20170512214144.GT390-3bDd1+5oDREiFSDQTTA3OLVCufUGDwFn@public.gmane.org>
@ 2017-05-12 21:50                                                                         ` Kees Cook
  0 siblings, 0 replies; 89+ messages in thread
From: Kees Cook @ 2017-05-12 21:50 UTC (permalink / raw)
  To: Al Viro
  Cc: Russell King - ARM Linux, Linus Torvalds, Mark Rutland,
	Kernel Hardening, Greg KH, Heiko Carstens, LKML, David Howells,
	Dave Hansen, H . Peter Anvin, Ingo Molnar, Pavel Tikhomirov,
	linux-s390, the arch/x86 maintainers, Will Deacon,
	Christian Borntraeger, René Nyffenegger, Catalin Marinas

On Fri, May 12, 2017 at 2:41 PM, Al Viro <viro-RmSDqhL/yNMiFSDQTTA3OLVCufUGDwFn@public.gmane.org> wrote:
> On Fri, May 12, 2017 at 02:17:19PM -0700, Kees Cook wrote:
>
>> Two things are at risk from stack exhaustion: thread_info (mainly
>> addr_limit) when on the stack (fixed by THREAD_INFO_IN_TASK), and
>
> Really?  Let's take a look at arm, for example:
>
> struct thread_info {
>         unsigned long           flags;          /* low level flags */
>         int                     preempt_count;  /* 0 => preemptable, <0 => bug */
>         mm_segment_t            addr_limit;     /* address limit */
>         struct task_struct      *task;          /* main task structure */
>
> and current() is defined as current_thread_info()->task.
>
> Seriously, look at these beasts.  Overwriting ->addr_limit is nowhere near
> the top threat.  If attacker can overwrite thread_info, you have lost.

I don't disagree, but the type of attack is different. If the attacker
overwrites task_struct pointer, then they need to have built an false
one, and that may be made difficult by PAN, or need to know more about
kernel memory layout (rather than only stack depth), etc. Attacking
addr_limit makes it very very easy to upgrade attack capabilities. I'm
not say thread_info shouldn't be moved off the stack.

-Kees

-- 
Kees Cook
Pixel Security

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
       [not found]                                                                         ` <1494625675.29205.21.camel-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
@ 2017-05-12 22:57                                                                           ` Al Viro
  0 siblings, 0 replies; 89+ messages in thread
From: Al Viro @ 2017-05-12 22:57 UTC (permalink / raw)
  To: Rik van Riel
  Cc: Kees Cook, Russell King - ARM Linux, Linus Torvalds,
	Mark Rutland, Kernel Hardening, Greg KH, Heiko Carstens, LKML,
	David Howells, Dave Hansen, H . Peter Anvin, Ingo Molnar,
	Pavel Tikhomirov, linux-s390, the arch/x86 maintainers,
	Will Deacon, Christian Borntraeger, René Nyffenegger

On Fri, May 12, 2017 at 05:47:55PM -0400, Rik van Riel wrote:

> > Seriously, look at these beasts.  Overwriting ->addr_limit is nowhere
> > near
> > the top threat.  If attacker can overwrite thread_info, you have
> > lost.
> 
> That is why THREAD_INFO_IN_TASK exists. It moves
> the struct thread_info to a location away from the
> stack, which means a stack overflow will not overwrite
> the thread_info.

... in which case such attacks on ->addr_limit also become a non-issue.

AFAICS, we are mixing several unrelated issues here:
	* amount of places where set_fs() is called.  Sure, reducing it
is a good idea and we want to move to primitives like kernel_write() et.al.
Fewer users => lower odds of screwing it up.
	* making sure that remaining callers are properly paired.  Ditto.
	* switching to ->read_iter()/->write_iter() where it makes sense.
Again, no problem with that.
	* providing sane environment for places like perf/oprofile.  Again,
a good idea, and set_fs(USER_DS) is only a part of what's needed there.
	* switching _everything_ to ->read_iter()/->write_iter().  Flat-out
insane and AFAICS nobody is signing up for that.
	* getting rid of set_fs() entirely.  I'm afraid that it's not feasible
without the previous one and frankly, I don't see much point.
	* sanity-checking on return to userland.  Maybe useful, maybe not.
	* taking thread_info out of the way of stack overflows.  Reasonable,
but has very little to do with the rest of that.
	* protecting against Lovecraftian horrors slithering in from the outer
space only to commit unspeakable acts against ->addr_limit and ignoring much
tastier targets next to it, but then what do you expect from degenerate
spawn of Great Old Ones - sanity?

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
       [not found]                     ` <20170508204858.GT29622-3bDd1+5oDREiFSDQTTA3OLVCufUGDwFn@public.gmane.org>
@ 2017-05-12 23:15                       ` Andy Lutomirski
  0 siblings, 0 replies; 89+ messages in thread
From: Andy Lutomirski @ 2017-05-12 23:15 UTC (permalink / raw)
  To: Al Viro
  Cc: Jann Horn, Ingo Molnar, Kees Cook, Thomas Garnier,
	Martin Schwidefsky, Heiko Carstens, Dave Hansen, Arnd Bergmann,
	Thomas Gleixner, David Howells, René Nyffenegger,
	Andrew Morton, Paul E . McKenney, Eric W . Biederman,
	Oleg Nesterov, Pavel Tikhomirov, Ingo Molnar, H . Peter Anvin,
	Andy Lutomirski

On Mon, May 8, 2017 at 1:48 PM, Al Viro <viro-RmSDqhL/yNMiFSDQTTA3OLVCufUGDwFn@public.gmane.org> wrote:
> On Mon, May 08, 2017 at 04:06:35PM +0200, Jann Horn wrote:
>
>> I think Kees might be talking about
>> https://bugs.chromium.org/p/project-zero/issues/detail?id=822, fixed in
>> commit e6978e4bf181fb3b5f8cb6f71b4fe30fbf1b655c. The issue was that
>> perf code that can run in pretty much any context called access_ok().
>
> And that commit has *NOT* solved the problem.  perf_callchain_user()
> can be called synchronously, without passing through that code.
> Tracepoint shite...
>
> That set_fs() should be done in get_perf_callchain(), just around the call of
> perf_callchain_user().  Along with pagefault_disable(), actually.
>

Even that's not quite enough because of a different issue: perf nmis
can hit during scheduling or when we're in lazy mm, leading to the
entirely wrong set of page tables being used.  We need
nmi_uaccess_begin() and nmi_uaccess_end(), and the former needs to be
allowed to fail.

AFAIK this isn't presently a security problem because it mainly
affects kernel threads, and you need to be root to profile them, but
maybe there's some race where it does matter.

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
       [not found]                             ` <20170512071549.GP390-3bDd1+5oDREiFSDQTTA3OLVCufUGDwFn@public.gmane.org>
  2017-05-12  7:43                               ` [kernel-hardening] " Arnd Bergmann
@ 2017-05-12 23:20                               ` Andy Lutomirski
  1 sibling, 0 replies; 89+ messages in thread
From: Andy Lutomirski @ 2017-05-12 23:20 UTC (permalink / raw)
  To: Al Viro
  Cc: Ingo Molnar, Andy Lutomirski, Christoph Hellwig, Greg KH,
	Thomas Garnier, Martin Schwidefsky, Heiko Carstens, Dave Hansen,
	Arnd Bergmann, Thomas Gleixner, David Howells,
	René Nyffenegger, Andrew Morton, Paul E . McKenney,
	Eric W . Biederman, Oleg Nesterov, Pavel Tikhomirov, Ingo Molnar,
	H . Peter Anvin

On Fri, May 12, 2017 at 12:15 AM, Al Viro <viro-RmSDqhL/yNMiFSDQTTA3OLVCufUGDwFn@public.gmane.org> wrote:

> Folks, seriously, have you even looked through that zoo?  I have, and it's
> really, really not fun.  Sure, we can say "fuck 'em, no need to allow
> splice() on random crap".  Would be perfectly reasonable, expect that
> it's not the only place doing kernel_write() and its ilk...

Can you clarify this?  I think we really may be able to do exactly
this.  From Christoph's list, there are only two things that need
kernel_read/kernel_write to user-supplied fds that may come from a
variety of sources: splice and exec.  If you're execing a chardev from
a crappy driver, something is seriously wrong.  And returning -EINVAL
from splice() to or from files that use ->read and ->write seems find
(and splice(2) even documents -EINVAL as meaning that the target
doesn't support splicing).

--Andy

^ permalink raw reply	[flat|nested] 89+ messages in thread

* Re: [kernel-hardening] Re: [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode
  2017-05-12 20:30                                                             ` Peter Zijlstra
       [not found]                                                               ` <20170512203044.GI4626-IIpfhp3q70z/8w/KjCw3T+5/BudmfyzbbVWyRVo5IupeoWH0uzbU5w@public.gmane.org>
@ 2017-05-13  7:21                                                               ` Christoph Hellwig
  1 sibling, 0 replies; 89+ messages in thread
From: Christoph Hellwig @ 2017-05-13  7:21 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Mark Rutland, Brian Gerst, Kernel Hardening, Catalin Marinas,
	Heiko Carstens, Oleg Nesterov, David Howells, Dave Hansen,
	H . Peter Anvin, Ingo Molnar, Pavel Tikhomirov, linux-s390,
	the arch/x86 maintainers, Russell King - ARM Linux,
	Christian Borntraeger, René Nyffenegger, Greg KH,
	Paul E . McKenney, Rik van Riel, Kees Cook, Arnd Bergmann,
	Will Deacon

On Fri, May 12, 2017 at 10:30:44PM +0200, Peter Zijlstra wrote:
> On stack variable length arrays get implemented by the compiler doing
> alloca(), and we sadly have a few of those around.

I've just got rid of one of those and I wish they would appear
entirely as they are horrible in so many different ways.  Sparse
warns about them, btw.

^ permalink raw reply	[flat|nested] 89+ messages in thread

end of thread, other threads:[~2017-05-13  7:21 UTC | newest]

Thread overview: 89+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-04-28 15:32 [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode Thomas Garnier
     [not found] ` <20170428153213.137279-1-thgarnie-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org>
2017-04-28 15:32   ` [PATCH v9 2/4] x86/syscalls: Optimize address limit check Thomas Garnier
2017-05-05 22:18   ` [PATCH v9 1/4] syscalls: Verify address limit before returning to user-mode Thomas Garnier
     [not found]     ` <CAJcbSZGQsRVg3QZ9QfLn2HBC+RP-7fUTab0bYDJ455d8y8GyNw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2017-05-08  7:33       ` Ingo Molnar
     [not found]         ` <20170508073352.caqe3fqf7nuxypgi-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2017-05-08  7:52           ` Ingo Molnar
     [not found]             ` <20170508075209.7aluvpwildw325rf-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2017-05-08 15:22               ` [kernel-hardening] " Daniel Micay
2017-05-08 15:26                 ` Kees Cook
2017-05-08 19:51                   ` Thomas Garnier
     [not found]                   ` <CAGXu5jL61K0bRSEg9a_LswNyrt3K1J57REbWVcvAXT54zWwtMA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2017-05-09  6:56                     ` Ingo Molnar
     [not found]                       ` <20170509065619.wmqa6z6w3n6xpvrw-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2017-05-09 11:10                         ` Greg KH
     [not found]                           ` <20170509111007.GA14702-U8xfFu+wG4EAvxtiuMwx3w@public.gmane.org>
2017-05-09 14:29                             ` Thomas Garnier
     [not found]                               ` <CAJcbSZFswDWZoK-1UK+xkRMJ4ttSYbtH2Y5WD5_aPR-8ru6t8A-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2017-05-11 23:17                                 ` Thomas Garnier
     [not found]                                   ` <CAJcbSZEoRyewUtBHvqmNZL9FtT_q42Vmmd-EuC50x-ZRASiHHg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2017-05-11 23:44                                     ` Linus Torvalds
     [not found]                                       ` <CA+55aFwvQfs_X+paQF6Luc0Rq+W3J2fKuHRou7=ANcquDdXdDA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2017-05-12  5:28                                         ` Martin Schwidefsky
2017-05-12  5:34                                           ` Kees Cook
2017-05-12  5:54                                             ` Martin Schwidefsky
2017-05-12 19:01                                               ` Kees Cook
2017-05-12 19:08                                                 ` Russell King - ARM Linux
     [not found]                                                 ` <CAGXu5jL9vUrn4kpjO+qa4cHmWBypeqP17OGbrMs=5Nz0YpQMZw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2017-05-12 19:08                                                   ` Linus Torvalds
     [not found]                                                     ` <CA+55aFzbiBqsYb7vwO=+L4Vp_GOgPu+DBOrq4fBnyzq5DbBehg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2017-05-12 19:30                                                       ` Kees Cook
     [not found]                                                         ` <CAGXu5j+xmyJ6RhtPw9rUgs7k3sZ1KKWffvyGGG--oPfu9W42ng-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2017-05-12 20:21                                                           ` Russell King - ARM Linux
2017-05-12 20:30                                                             ` Peter Zijlstra
     [not found]                                                               ` <20170512203044.GI4626-IIpfhp3q70z/8w/KjCw3T+5/BudmfyzbbVWyRVo5IupeoWH0uzbU5w@public.gmane.org>
2017-05-12 20:45                                                                 ` Russell King - ARM Linux
2017-05-12 21:00                                                                   ` Kees Cook
     [not found]                                                                     ` <CAGXu5jL6FPuShBpZfi6+XHqOk4gxocUJRYPHT5oR3HYh3xm+sA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2017-05-12 21:04                                                                       ` Kees Cook
2017-05-13  7:21                                                               ` Christoph Hellwig
2017-05-12 21:06                                                             ` Al Viro
     [not found]                                                               ` <20170512210645.GS390-3bDd1+5oDREiFSDQTTA3OLVCufUGDwFn@public.gmane.org>
2017-05-12 21:16                                                                 ` [kernel-hardening] " Daniel Micay
2017-05-12 21:17                                                                 ` Kees Cook
     [not found]                                                                   ` <CAGXu5jJu=VTqp2tzkPB4RAVxdGC+_SSQwrUwdzWpu24AA-zEcg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2017-05-12 21:23                                                                     ` Daniel Micay
2017-05-12 21:41                                                                     ` Al Viro
2017-05-12 21:47                                                                       ` Rik van Riel
     [not found]                                                                         ` <1494625675.29205.21.camel-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2017-05-12 22:57                                                                           ` Al Viro
     [not found]                                                                       ` <20170512214144.GT390-3bDd1+5oDREiFSDQTTA3OLVCufUGDwFn@public.gmane.org>
2017-05-12 21:50                                                                         ` Kees Cook
     [not found]                                             ` <CAGXu5j+EatK=DYONRkgovwLgytAnbG8jnAZaMSLckZFNVj3gig-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2017-05-12  6:57                                               ` Ingo Molnar
2017-05-12  6:13                                       ` Andy Lutomirski
2017-05-12  6:58                                       ` Ingo Molnar
2017-05-12 17:05                                         ` Thomas Garnier
2017-05-09 16:30                       ` [kernel-hardening] " Kees Cook
2017-05-08 12:46           ` Greg KH
     [not found]             ` <20170508124621.GA20705-U8xfFu+wG4EAvxtiuMwx3w@public.gmane.org>
2017-05-09  6:45               ` Ingo Molnar
     [not found]                 ` <20170509064522.anusoikaalvlux3w-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2017-05-09  8:56                   ` Christoph Hellwig
2017-05-09 13:00                     ` Andy Lutomirski
2017-05-09 13:02                       ` [kernel-hardening] " Christoph Hellwig
2017-05-09 16:03                         ` Christoph Hellwig
2017-05-09 16:50                           ` Kees Cook
     [not found]                             ` <CAGXu5jKHVMRMKDfn+=kkbm+JkWPhoEtDwKx=QXAYxg1p9bn7PQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2017-05-09 22:52                               ` Andy Lutomirski
     [not found]                                 ` <CALCETrV73=cDvaSLOMvb299yaGNJYME8LC-=P+N6p7R1NN97Yg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2017-05-09 23:31                                   ` Kees Cook
2017-05-10  1:59                                     ` Andy Lutomirski
2017-05-10  7:15                                     ` Christoph Hellwig
     [not found]                                     ` <CAGXu5jL6PeQmmdxh5h--fgrMK8DW_XZYpNfDOvvv_o9E3-Kxdw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2017-05-11 11:22                                       ` Borislav Petkov
2017-05-10  6:46                             ` Christoph Hellwig
     [not found]                           ` <20170509160322.GA15902-wEGCiKHe2LqWVfeAwA7xHQ@public.gmane.org>
2017-05-10  2:11                             ` Al Viro
2017-05-10  2:45                               ` Al Viro
2017-05-10  3:12                                 ` Al Viro
     [not found]                                   ` <20170510031254.GC390-3bDd1+5oDREiFSDQTTA3OLVCufUGDwFn@public.gmane.org>
2017-05-10  3:21                                     ` Al Viro
     [not found]                                       ` <20170510032137.GD390-3bDd1+5oDREiFSDQTTA3OLVCufUGDwFn@public.gmane.org>
2017-05-10  3:39                                         ` Al Viro
2017-05-10  6:54                                           ` Christoph Hellwig
2017-05-10  6:53                                   ` Christoph Hellwig
2017-05-10  7:27                                     ` Al Viro
2017-05-10  7:35                                       ` Christoph Hellwig
2017-05-10  6:49                                 ` Christoph Hellwig
2017-05-10  7:28                             ` Arnd Bergmann
2017-05-10  7:35                               ` Christoph Hellwig
2017-05-09 16:05                       ` Brian Gerst
     [not found]                       ` <CALCETrUh8NO2scaqEM48K70Fo2+V3=Cpyk4JurCDiCYp4nm_+g-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2017-05-10  7:37                         ` [kernel-hardening] " Arnd Bergmann
2017-05-10  8:08                           ` Al Viro
2017-05-10  8:14                             ` Christoph Hellwig
2017-05-11  0:18                               ` Andy Lutomirski
2017-05-12  7:00                         ` Ingo Molnar
2017-05-12  7:15                           ` Al Viro
2017-05-12  7:35                             ` Christoph Hellwig
2017-05-12  8:07                               ` Christoph Hellwig
2017-05-12  8:23                                 ` Greg KH
     [not found]                             ` <20170512071549.GP390-3bDd1+5oDREiFSDQTTA3OLVCufUGDwFn@public.gmane.org>
2017-05-12  7:43                               ` [kernel-hardening] " Arnd Bergmann
2017-05-12  8:11                                 ` Christoph Hellwig
2017-05-12  8:16                                   ` Al Viro
2017-05-12  8:11                                 ` Al Viro
     [not found]                                   ` <20170512081154.GQ390-3bDd1+5oDREiFSDQTTA3OLVCufUGDwFn@public.gmane.org>
2017-05-12  8:20                                     ` Arnd Bergmann
2017-05-12 23:20                               ` Andy Lutomirski
2017-05-08 13:09         ` Kees Cook
2017-05-08 14:02           ` Ingo Molnar
     [not found]             ` <20170508140230.23kxf2kfeazeo4zr-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2017-05-08 14:06               ` Jann Horn
     [not found]                 ` <CAG48ez0Hz=CimkPwuq903tgJkGj8gXUtiQJJb-P2zUes6bd6Hw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2017-05-08 20:48                   ` Al Viro
     [not found]                     ` <20170508204858.GT29622-3bDd1+5oDREiFSDQTTA3OLVCufUGDwFn@public.gmane.org>
2017-05-12 23:15                       ` Andy Lutomirski
2017-05-08 15:24               ` Kees Cook
     [not found]                 ` <CAGXu5jJ4iY7QZ9wRu5dmm7RHtLh_V6TQh4huWwLCYPKOr63aiA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2017-05-09  6:34                   ` Ingo Molnar
2017-04-28 15:32 ` [PATCH v9 3/4] arm/syscalls: Optimize address limit check Thomas Garnier
2017-04-28 15:32 ` [PATCH v9 4/4] arm64/syscalls: " Thomas Garnier

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).