* [PATCH v3 1/4] syscalls: Restore address limit after a syscall
@ 2017-03-11 0:04 ` Thomas Garnier
0 siblings, 0 replies; 46+ messages in thread
From: Thomas Garnier @ 2017-03-11 0:04 UTC (permalink / raw)
To: Martin Schwidefsky, Heiko Carstens, David Howells, Arnd Bergmann,
Al Viro, Dave Hansen, Thomas Garnier, René Nyffenegger,
Andrew Morton, Kees Cook, Paul E . McKenney, Andy Lutomirski,
Ard Biesheuvel, Nicolas Pitre, Petr Mladek,
Sebastian Andrzej Siewior, Sergey Senozhatsky, Helge Deller,
Rik van Riel, Ingo Molnar, John Stultz, Thomas Gleixner
Cc: linux-s390, linux-kernel, linux-api, x86, linux-arm-kernel,
kernel-hardening
This patch ensures a syscall does not return to user-mode with a kernel
address limit. If that happened, a process could corrupt kernel-mode
memory and elevate privileges.
For example, it would mitigate this bug:
- https://bugs.chromium.org/p/project-zero/issues/detail?id=990
If the CONFIG_BUG_ON_DATA_CORRUPTION option is enabled, an incorrect
state will result in a BUG_ON.
The CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE option is also
added so each architecture can optimize this change.
Signed-off-by: Thomas Garnier <thgarnie@google.com>
---
Based on next-20170308
---
arch/s390/Kconfig | 1 +
include/linux/syscalls.h | 18 +++++++++++++++++-
init/Kconfig | 7 +++++++
kernel/sys.c | 8 ++++++++
4 files changed, 33 insertions(+), 1 deletion(-)
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index a2dcef0aacc7..b73f5b87bc99 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -103,6 +103,7 @@ config S390
select ARCH_INLINE_WRITE_UNLOCK_BH
select ARCH_INLINE_WRITE_UNLOCK_IRQ
select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
+ select ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
select ARCH_SAVE_PAGE_KEYS if HIBERNATION
select ARCH_SUPPORTS_ATOMIC_RMW
select ARCH_SUPPORTS_NUMA_BALANCING
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 980c3c9b06f8..e659076adf6c 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -191,6 +191,19 @@ extern struct trace_event_functions exit_syscall_print_funcs;
SYSCALL_METADATA(sname, x, __VA_ARGS__) \
__SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
+asmlinkage void verify_pre_usermode_state(void);
+
+#ifndef CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
+#define __CHECK_USER_CALLER() \
+ bool user_caller = segment_eq(get_fs(), USER_DS)
+#define __VERIFY_PRE_USERMODE_STATE() \
+ if (user_caller) verify_pre_usermode_state()
+#else
+#define __CHECK_USER_CALLER()
+#define __VERIFY_PRE_USERMODE_STATE()
+#endif
+
+
#define __PROTECT(...) asmlinkage_protect(__VA_ARGS__)
#define __SYSCALL_DEFINEx(x, name, ...) \
asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) \
@@ -199,7 +212,10 @@ extern struct trace_event_functions exit_syscall_print_funcs;
asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \
asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \
{ \
- long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__)); \
+ long ret; \
+ __CHECK_USER_CALLER(); \
+ ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__)); \
+ __VERIFY_PRE_USERMODE_STATE(); \
__MAP(x,__SC_TEST,__VA_ARGS__); \
__PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__)); \
return ret; \
diff --git a/init/Kconfig b/init/Kconfig
index c859c993c26f..c4efc3a95e4a 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1929,6 +1929,13 @@ config PROFILING
config TRACEPOINTS
bool
+#
+# Set by each architecture that wants to optimize how verify_pre_usermode_state
+# is called.
+#
+config ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
+ bool
+
source "arch/Kconfig"
endmenu # General setup
diff --git a/kernel/sys.c b/kernel/sys.c
index 196c7134bee6..411163ac9dc3 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2459,3 +2459,11 @@ COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info)
return 0;
}
#endif /* CONFIG_COMPAT */
+
+/* Called before coming back to user-mode */
+asmlinkage void verify_pre_usermode_state(void)
+{
+ if (CHECK_DATA_CORRUPTION(!segment_eq(get_fs(), USER_DS),
+ "incorrect get_fs() on user-mode return"))
+ set_fs(USER_DS);
+}
--
2.12.0.246.ga2ecc84866-goog
^ permalink raw reply related [flat|nested] 46+ messages in thread
* [kernel-hardening] [PATCH v3 1/4] syscalls: Restore address limit after a syscall
@ 2017-03-11 0:04 ` Thomas Garnier
0 siblings, 0 replies; 46+ messages in thread
From: Thomas Garnier @ 2017-03-11 0:04 UTC (permalink / raw)
To: Martin Schwidefsky, Heiko Carstens, David Howells, Arnd Bergmann,
Al Viro, Dave Hansen, Thomas Garnier, René Nyffenegger,
Andrew Morton, Kees Cook, Paul E . McKenney, Andy Lutomirski,
Ard Biesheuvel, Nicolas Pitre, Petr Mladek,
Sebastian Andrzej Siewior, Sergey Senozhatsky, Helge Deller,
Rik van Riel, Ingo Molnar, John Stultz, Thomas Gleixner,
Oleg Nesterov, Stephen Smalley, Pavel Tikhomirov,
Frederic Weisbecker, Stanislav Kinsburskiy, Ingo Molnar,
H . Peter Anvin, Paolo Bonzini, Dmitry Safonov, Borislav Petkov,
Josh Poimboeuf, Brian Gerst, Jan Beulich, Christian Borntraeger,
Fenghua Yu, He Chen, Russell King, Vladimir Murzin, Will Deacon,
Catalin Marinas, Mark Rutland, James Morse, David A . Long,
Pratyush Anand, Laura Abbott, Andre Przywara, Chris Metcalf
Cc: linux-s390, linux-kernel, linux-api, x86, linux-arm-kernel,
kernel-hardening
This patch ensures a syscall does not return to user-mode with a kernel
address limit. If that happened, a process could corrupt kernel-mode
memory and elevate privileges.
For example, it would mitigate this bug:
- https://bugs.chromium.org/p/project-zero/issues/detail?id=990
If the CONFIG_BUG_ON_DATA_CORRUPTION option is enabled, an incorrect
state will result in a BUG_ON.
The CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE option is also
added so each architecture can optimize this change.
Signed-off-by: Thomas Garnier <thgarnie@google.com>
---
Based on next-20170308
---
arch/s390/Kconfig | 1 +
include/linux/syscalls.h | 18 +++++++++++++++++-
init/Kconfig | 7 +++++++
kernel/sys.c | 8 ++++++++
4 files changed, 33 insertions(+), 1 deletion(-)
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index a2dcef0aacc7..b73f5b87bc99 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -103,6 +103,7 @@ config S390
select ARCH_INLINE_WRITE_UNLOCK_BH
select ARCH_INLINE_WRITE_UNLOCK_IRQ
select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
+ select ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
select ARCH_SAVE_PAGE_KEYS if HIBERNATION
select ARCH_SUPPORTS_ATOMIC_RMW
select ARCH_SUPPORTS_NUMA_BALANCING
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 980c3c9b06f8..e659076adf6c 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -191,6 +191,19 @@ extern struct trace_event_functions exit_syscall_print_funcs;
SYSCALL_METADATA(sname, x, __VA_ARGS__) \
__SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
+asmlinkage void verify_pre_usermode_state(void);
+
+#ifndef CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
+#define __CHECK_USER_CALLER() \
+ bool user_caller = segment_eq(get_fs(), USER_DS)
+#define __VERIFY_PRE_USERMODE_STATE() \
+ if (user_caller) verify_pre_usermode_state()
+#else
+#define __CHECK_USER_CALLER()
+#define __VERIFY_PRE_USERMODE_STATE()
+#endif
+
+
#define __PROTECT(...) asmlinkage_protect(__VA_ARGS__)
#define __SYSCALL_DEFINEx(x, name, ...) \
asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) \
@@ -199,7 +212,10 @@ extern struct trace_event_functions exit_syscall_print_funcs;
asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \
asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \
{ \
- long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__)); \
+ long ret; \
+ __CHECK_USER_CALLER(); \
+ ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__)); \
+ __VERIFY_PRE_USERMODE_STATE(); \
__MAP(x,__SC_TEST,__VA_ARGS__); \
__PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__)); \
return ret; \
diff --git a/init/Kconfig b/init/Kconfig
index c859c993c26f..c4efc3a95e4a 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1929,6 +1929,13 @@ config PROFILING
config TRACEPOINTS
bool
+#
+# Set by each architecture that wants to optimize how verify_pre_usermode_state
+# is called.
+#
+config ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
+ bool
+
source "arch/Kconfig"
endmenu # General setup
diff --git a/kernel/sys.c b/kernel/sys.c
index 196c7134bee6..411163ac9dc3 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2459,3 +2459,11 @@ COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info)
return 0;
}
#endif /* CONFIG_COMPAT */
+
+/* Called before coming back to user-mode */
+asmlinkage void verify_pre_usermode_state(void)
+{
+ if (CHECK_DATA_CORRUPTION(!segment_eq(get_fs(), USER_DS),
+ "incorrect get_fs() on user-mode return"))
+ set_fs(USER_DS);
+}
--
2.12.0.246.ga2ecc84866-goog
^ permalink raw reply related [flat|nested] 46+ messages in thread
* [PATCH v3 2/4] x86/syscalls: Specific usage of verify_pre_usermode_state
2017-03-11 0:04 ` [kernel-hardening] " Thomas Garnier
@ 2017-03-11 0:04 ` Thomas Garnier
-1 siblings, 0 replies; 46+ messages in thread
From: Thomas Garnier @ 2017-03-11 0:04 UTC (permalink / raw)
To: Martin Schwidefsky, Heiko Carstens, David Howells, Arnd Bergmann,
Al Viro, Dave Hansen, Thomas Garnier, René Nyffenegger,
Andrew Morton, Kees Cook, Paul E . McKenney, Andy Lutomirski,
Ard Biesheuvel, Nicolas Pitre, Petr Mladek,
Sebastian Andrzej Siewior, Sergey Senozhatsky, Helge Deller,
Rik van Riel, Ingo Molnar, John Stultz, Thomas Gleixner
Cc: linux-s390, linux-kernel, linux-api, x86, linux-arm-kernel,
kernel-hardening
Implement specific usage of verify_pre_usermode_state for user-mode
returns for x86.
---
Based on next-20170308
---
arch/x86/Kconfig | 1 +
arch/x86/entry/common.c | 3 +++
arch/x86/entry/entry_64.S | 19 +++++++++++++++++++
arch/x86/include/asm/pgtable_64_types.h | 11 +++++++++++
arch/x86/include/asm/processor.h | 11 -----------
5 files changed, 34 insertions(+), 11 deletions(-)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 005df7c825f5..6d48e18e6f09 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -63,6 +63,7 @@ config X86
select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
select ARCH_MIGHT_HAVE_PC_PARPORT
select ARCH_MIGHT_HAVE_PC_SERIO
+ select ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
select ARCH_SUPPORTS_ATOMIC_RMW
select ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
select ARCH_SUPPORTS_NUMA_BALANCING if X86_64
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 370c42c7f046..525edbb77f03 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -22,6 +22,7 @@
#include <linux/context_tracking.h>
#include <linux/user-return-notifier.h>
#include <linux/uprobes.h>
+#include <linux/syscalls.h>
#include <asm/desc.h>
#include <asm/traps.h>
@@ -180,6 +181,8 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
struct thread_info *ti = current_thread_info();
u32 cached_flags;
+ verify_pre_usermode_state();
+
if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled()))
local_irq_disable();
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index d2b2a2948ffe..04db589be466 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -218,6 +218,25 @@ entry_SYSCALL_64_fastpath:
testl $_TIF_ALLWORK_MASK, TASK_TI_flags(%r11)
jnz 1f
+ /*
+ * Check user-mode state on fast path return, the same check is done
+ * under the slow path through syscall_return_slowpath.
+ */
+#ifdef CONFIG_BUG_ON_DATA_CORRUPTION
+ call verify_pre_usermode_state
+#else
+ /*
+ * Similar to set_fs(USER_DS) in verify_pre_usermode_state without a
+ * warning.
+ */
+ movq PER_CPU_VAR(current_task), %rax
+ movq $TASK_SIZE_MAX, %rcx
+ cmp %rcx, TASK_addr_limit(%rax)
+ jz 1f
+ movq %rcx, TASK_addr_limit(%rax)
+1:
+#endif
+
LOCKDEP_SYS_EXIT
TRACE_IRQS_ON /* user mode is traced as IRQs on */
movq RIP(%rsp), %rcx
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 3a264200c62f..0fbbb79d058c 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -76,4 +76,15 @@ typedef struct { pteval_t pte; } pte_t;
#define EARLY_DYNAMIC_PAGE_TABLES 64
+/*
+ * User space process size. 47bits minus one guard page. The guard
+ * page is necessary on Intel CPUs: if a SYSCALL instruction is at
+ * the highest possible canonical userspace address, then that
+ * syscall will enter the kernel with a non-canonical return
+ * address, and SYSRET will explode dangerously. We avoid this
+ * particular problem by preventing anything from being mapped
+ * at the maximum canonical address.
+ */
+#define TASK_SIZE_MAX ((_AC(1, UL) << 47) - PAGE_SIZE)
+
#endif /* _ASM_X86_PGTABLE_64_DEFS_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index f385eca5407a..9bc99d37133e 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -829,17 +829,6 @@ static inline void spin_lock_prefetch(const void *x)
#define KSTK_ESP(task) (task_pt_regs(task)->sp)
#else
-/*
- * User space process size. 47bits minus one guard page. The guard
- * page is necessary on Intel CPUs: if a SYSCALL instruction is at
- * the highest possible canonical userspace address, then that
- * syscall will enter the kernel with a non-canonical return
- * address, and SYSRET will explode dangerously. We avoid this
- * particular problem by preventing anything from being mapped
- * at the maximum canonical address.
- */
-#define TASK_SIZE_MAX ((1UL << 47) - PAGE_SIZE)
-
/* This decides where the kernel will search for a free chunk of vm
* space during mmap's.
*/
--
2.12.0.246.ga2ecc84866-goog
^ permalink raw reply related [flat|nested] 46+ messages in thread
* [kernel-hardening] [PATCH v3 2/4] x86/syscalls: Specific usage of verify_pre_usermode_state
@ 2017-03-11 0:04 ` Thomas Garnier
0 siblings, 0 replies; 46+ messages in thread
From: Thomas Garnier @ 2017-03-11 0:04 UTC (permalink / raw)
To: Martin Schwidefsky, Heiko Carstens, David Howells, Arnd Bergmann,
Al Viro, Dave Hansen, Thomas Garnier, René Nyffenegger,
Andrew Morton, Kees Cook, Paul E . McKenney, Andy Lutomirski,
Ard Biesheuvel, Nicolas Pitre, Petr Mladek,
Sebastian Andrzej Siewior, Sergey Senozhatsky, Helge Deller,
Rik van Riel, Ingo Molnar, John Stultz, Thomas Gleixner,
Oleg Nesterov, Stephen Smalley, Pavel Tikhomirov,
Frederic Weisbecker, Stanislav Kinsburskiy, Ingo Molnar,
H . Peter Anvin, Paolo Bonzini, Dmitry Safonov, Borislav Petkov,
Josh Poimboeuf, Brian Gerst, Jan Beulich, Christian Borntraeger,
Fenghua Yu, He Chen, Russell King, Vladimir Murzin, Will Deacon,
Catalin Marinas, Mark Rutland, James Morse, David A . Long,
Pratyush Anand, Laura Abbott, Andre Przywara, Chris Metcalf
Cc: linux-s390, linux-kernel, linux-api, x86, linux-arm-kernel,
kernel-hardening
Implement specific usage of verify_pre_usermode_state for user-mode
returns for x86.
---
Based on next-20170308
---
arch/x86/Kconfig | 1 +
arch/x86/entry/common.c | 3 +++
arch/x86/entry/entry_64.S | 19 +++++++++++++++++++
arch/x86/include/asm/pgtable_64_types.h | 11 +++++++++++
arch/x86/include/asm/processor.h | 11 -----------
5 files changed, 34 insertions(+), 11 deletions(-)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 005df7c825f5..6d48e18e6f09 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -63,6 +63,7 @@ config X86
select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
select ARCH_MIGHT_HAVE_PC_PARPORT
select ARCH_MIGHT_HAVE_PC_SERIO
+ select ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
select ARCH_SUPPORTS_ATOMIC_RMW
select ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
select ARCH_SUPPORTS_NUMA_BALANCING if X86_64
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 370c42c7f046..525edbb77f03 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -22,6 +22,7 @@
#include <linux/context_tracking.h>
#include <linux/user-return-notifier.h>
#include <linux/uprobes.h>
+#include <linux/syscalls.h>
#include <asm/desc.h>
#include <asm/traps.h>
@@ -180,6 +181,8 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
struct thread_info *ti = current_thread_info();
u32 cached_flags;
+ verify_pre_usermode_state();
+
if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled()))
local_irq_disable();
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index d2b2a2948ffe..04db589be466 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -218,6 +218,25 @@ entry_SYSCALL_64_fastpath:
testl $_TIF_ALLWORK_MASK, TASK_TI_flags(%r11)
jnz 1f
+ /*
+ * Check user-mode state on fast path return, the same check is done
+ * under the slow path through syscall_return_slowpath.
+ */
+#ifdef CONFIG_BUG_ON_DATA_CORRUPTION
+ call verify_pre_usermode_state
+#else
+ /*
+ * Similar to set_fs(USER_DS) in verify_pre_usermode_state without a
+ * warning.
+ */
+ movq PER_CPU_VAR(current_task), %rax
+ movq $TASK_SIZE_MAX, %rcx
+ cmp %rcx, TASK_addr_limit(%rax)
+ jz 1f
+ movq %rcx, TASK_addr_limit(%rax)
+1:
+#endif
+
LOCKDEP_SYS_EXIT
TRACE_IRQS_ON /* user mode is traced as IRQs on */
movq RIP(%rsp), %rcx
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 3a264200c62f..0fbbb79d058c 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -76,4 +76,15 @@ typedef struct { pteval_t pte; } pte_t;
#define EARLY_DYNAMIC_PAGE_TABLES 64
+/*
+ * User space process size. 47bits minus one guard page. The guard
+ * page is necessary on Intel CPUs: if a SYSCALL instruction is at
+ * the highest possible canonical userspace address, then that
+ * syscall will enter the kernel with a non-canonical return
+ * address, and SYSRET will explode dangerously. We avoid this
+ * particular problem by preventing anything from being mapped
+ * at the maximum canonical address.
+ */
+#define TASK_SIZE_MAX ((_AC(1, UL) << 47) - PAGE_SIZE)
+
#endif /* _ASM_X86_PGTABLE_64_DEFS_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index f385eca5407a..9bc99d37133e 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -829,17 +829,6 @@ static inline void spin_lock_prefetch(const void *x)
#define KSTK_ESP(task) (task_pt_regs(task)->sp)
#else
-/*
- * User space process size. 47bits minus one guard page. The guard
- * page is necessary on Intel CPUs: if a SYSCALL instruction is at
- * the highest possible canonical userspace address, then that
- * syscall will enter the kernel with a non-canonical return
- * address, and SYSRET will explode dangerously. We avoid this
- * particular problem by preventing anything from being mapped
- * at the maximum canonical address.
- */
-#define TASK_SIZE_MAX ((1UL << 47) - PAGE_SIZE)
-
/* This decides where the kernel will search for a free chunk of vm
* space during mmap's.
*/
--
2.12.0.246.ga2ecc84866-goog
^ permalink raw reply related [flat|nested] 46+ messages in thread
* [PATCH v3 3/4] arm/syscalls: Specific usage of verify_pre_usermode_state
2017-03-11 0:04 ` [kernel-hardening] " Thomas Garnier
@ 2017-03-11 0:05 ` Thomas Garnier
-1 siblings, 0 replies; 46+ messages in thread
From: Thomas Garnier @ 2017-03-11 0:05 UTC (permalink / raw)
To: Martin Schwidefsky, Heiko Carstens, David Howells, Arnd Bergmann,
Al Viro, Dave Hansen, Thomas Garnier, René Nyffenegger,
Andrew Morton, Kees Cook, Paul E . McKenney, Andy Lutomirski,
Ard Biesheuvel, Nicolas Pitre, Petr Mladek,
Sebastian Andrzej Siewior, Sergey Senozhatsky, Helge Deller,
Rik van Riel, Ingo Molnar, John Stultz, Thomas Gleixner
Cc: linux-s390, linux-kernel, linux-api, x86, linux-arm-kernel,
kernel-hardening
Implement specific usage of verify_pre_usermode_state for user-mode
returns for arm.
---
Based on next-20170308
---
arch/arm/Kconfig | 1 +
arch/arm/include/asm/domain.h | 3 +++
arch/arm/kernel/entry-common.S | 32 +++++++++++++++++++++++++++++++-
3 files changed, 35 insertions(+), 1 deletion(-)
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 0d4e71b42c77..704fd8f197fa 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -12,6 +12,7 @@ config ARM
select ARCH_HAVE_CUSTOM_GPIO_H
select ARCH_HAS_GCOV_PROFILE_ALL
select ARCH_MIGHT_HAVE_PC_PARPORT
+ select ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX
select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT if CPU_V7
select ARCH_SUPPORTS_ATOMIC_RMW
diff --git a/arch/arm/include/asm/domain.h b/arch/arm/include/asm/domain.h
index 99d9f630d6b6..fd1cd24ca9fe 100644
--- a/arch/arm/include/asm/domain.h
+++ b/arch/arm/include/asm/domain.h
@@ -58,6 +58,9 @@
#define domain_mask(dom) ((3) << (2 * (dom)))
#define domain_val(dom,type) ((type) << (2 * (dom)))
+#define DOMAIN_KERNEL_MASK domain_mask(DOMAIN_KERNEL)
+#define DOMAIN_CLIENT_VALUE domain_val(DOMAIN_KERNEL, DOMAIN_CLIENT)
+
#ifdef CONFIG_CPU_SW_DOMAIN_PAN
#define DACR_INIT \
(domain_val(DOMAIN_USER, DOMAIN_NOACCESS) | \
diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
index eb5cd77bf1d8..8d9bfe9ff313 100644
--- a/arch/arm/kernel/entry-common.S
+++ b/arch/arm/kernel/entry-common.S
@@ -12,6 +12,7 @@
#include <asm/unistd.h>
#include <asm/ftrace.h>
#include <asm/unwind.h>
+#include <asm/memory.h>
#ifdef CONFIG_AEABI
#include <asm/unistd-oabi.h>
#endif
@@ -27,7 +28,6 @@
#include "entry-header.S"
-
.align 5
#if !(IS_ENABLED(CONFIG_TRACE_IRQFLAGS) || IS_ENABLED(CONFIG_CONTEXT_TRACKING))
/*
@@ -40,9 +40,12 @@ ret_fast_syscall:
UNWIND(.fnstart )
UNWIND(.cantunwind )
disable_irq_notrace @ disable interrupts
+ ldr r2, [tsk, #TI_ADDR_LIMIT]
ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing
tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
bne fast_work_pending
+ cmp r2, #TASK_SIZE
+ blne addr_limit_fail
/* perform architecture specific actions before user return */
arch_ret_to_user r1, lr
@@ -66,6 +69,7 @@ ret_fast_syscall:
UNWIND(.cantunwind )
str r0, [sp, #S_R0 + S_OFF]! @ save returned r0
disable_irq_notrace @ disable interrupts
+ ldr r2, [tsk, #TI_ADDR_LIMIT]
ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing
tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
beq no_work_pending
@@ -82,6 +86,7 @@ slow_work_pending:
mov r2, why @ 'syscall'
bl do_work_pending
cmp r0, #0
+ ldreq r2, [tsk, #TI_ADDR_LIMIT]
beq no_work_pending
movlt scno, #(__NR_restart_syscall - __NR_SYSCALL_BASE)
ldmia sp, {r0 - r6} @ have to reload r0 - r6
@@ -99,9 +104,12 @@ ret_slow_syscall:
disable_irq_notrace @ disable interrupts
ENTRY(ret_to_user_from_irq)
ldr r1, [tsk, #TI_FLAGS]
+ ldr r2, [tsk, #TI_ADDR_LIMIT]
tst r1, #_TIF_WORK_MASK
bne slow_work_pending
no_work_pending:
+ cmp r2, #TASK_SIZE
+ blne addr_limit_fail
asm_trace_hardirqs_on save = 0
/* perform architecture specific actions before user return */
@@ -125,6 +133,28 @@ ENTRY(ret_from_fork)
b ret_slow_syscall
ENDPROC(ret_from_fork)
+addr_limit_fail:
+#ifdef CONFIG_BUG_ON_DATA_CORRUPTION
+ stmfd sp!, {r0, lr}
+ bl verify_pre_usermode_state
+ ldmfd sp!, {r0, lr}
+#else
+ /*
+ * Similar to set_fs(USER_DS) in verify_pre_usermode_state without a
+ * warning.
+ */
+ mov r2, #TASK_SIZE
+ str r2, [tsk, #TI_ADDR_LIMIT]
+#ifdef CONFIG_CPU_USE_DOMAINS
+ /* Switch domain like modify_domain(DOMAIN_KERNEL, DOMAIN_CLIENT) */
+ mrc p15, 0, r2, c3, c0, 0 @ Get domain register
+ bic r2, r2, #DOMAIN_KERNEL_MASK @ Clear the domain part
+ orr r2, r2, #DOMAIN_CLIENT_VALUE @ Enforce DOMAIN_CLIENT
+ mcr p15, 0, r2, c3, c0, 0 @ Set domain register
+#endif
+#endif
+ ret lr
+
/*=============================================================================
* SWI handler
*-----------------------------------------------------------------------------
--
2.12.0.246.ga2ecc84866-goog
^ permalink raw reply related [flat|nested] 46+ messages in thread
* [kernel-hardening] [PATCH v3 3/4] arm/syscalls: Specific usage of verify_pre_usermode_state
@ 2017-03-11 0:05 ` Thomas Garnier
0 siblings, 0 replies; 46+ messages in thread
From: Thomas Garnier @ 2017-03-11 0:05 UTC (permalink / raw)
To: Martin Schwidefsky, Heiko Carstens, David Howells, Arnd Bergmann,
Al Viro, Dave Hansen, Thomas Garnier, René Nyffenegger,
Andrew Morton, Kees Cook, Paul E . McKenney, Andy Lutomirski,
Ard Biesheuvel, Nicolas Pitre, Petr Mladek,
Sebastian Andrzej Siewior, Sergey Senozhatsky, Helge Deller,
Rik van Riel, Ingo Molnar, John Stultz, Thomas Gleixner,
Oleg Nesterov, Stephen Smalley, Pavel Tikhomirov,
Frederic Weisbecker, Stanislav Kinsburskiy, Ingo Molnar,
H . Peter Anvin, Paolo Bonzini, Dmitry Safonov, Borislav Petkov,
Josh Poimboeuf, Brian Gerst, Jan Beulich, Christian Borntraeger,
Fenghua Yu, He Chen, Russell King, Vladimir Murzin, Will Deacon,
Catalin Marinas, Mark Rutland, James Morse, David A . Long,
Pratyush Anand, Laura Abbott, Andre Przywara, Chris Metcalf
Cc: linux-s390, linux-kernel, linux-api, x86, linux-arm-kernel,
kernel-hardening
Implement specific usage of verify_pre_usermode_state for user-mode
returns for arm.
---
Based on next-20170308
---
arch/arm/Kconfig | 1 +
arch/arm/include/asm/domain.h | 3 +++
arch/arm/kernel/entry-common.S | 32 +++++++++++++++++++++++++++++++-
3 files changed, 35 insertions(+), 1 deletion(-)
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 0d4e71b42c77..704fd8f197fa 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -12,6 +12,7 @@ config ARM
select ARCH_HAVE_CUSTOM_GPIO_H
select ARCH_HAS_GCOV_PROFILE_ALL
select ARCH_MIGHT_HAVE_PC_PARPORT
+ select ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX
select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT if CPU_V7
select ARCH_SUPPORTS_ATOMIC_RMW
diff --git a/arch/arm/include/asm/domain.h b/arch/arm/include/asm/domain.h
index 99d9f630d6b6..fd1cd24ca9fe 100644
--- a/arch/arm/include/asm/domain.h
+++ b/arch/arm/include/asm/domain.h
@@ -58,6 +58,9 @@
#define domain_mask(dom) ((3) << (2 * (dom)))
#define domain_val(dom,type) ((type) << (2 * (dom)))
+#define DOMAIN_KERNEL_MASK domain_mask(DOMAIN_KERNEL)
+#define DOMAIN_CLIENT_VALUE domain_val(DOMAIN_KERNEL, DOMAIN_CLIENT)
+
#ifdef CONFIG_CPU_SW_DOMAIN_PAN
#define DACR_INIT \
(domain_val(DOMAIN_USER, DOMAIN_NOACCESS) | \
diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
index eb5cd77bf1d8..8d9bfe9ff313 100644
--- a/arch/arm/kernel/entry-common.S
+++ b/arch/arm/kernel/entry-common.S
@@ -12,6 +12,7 @@
#include <asm/unistd.h>
#include <asm/ftrace.h>
#include <asm/unwind.h>
+#include <asm/memory.h>
#ifdef CONFIG_AEABI
#include <asm/unistd-oabi.h>
#endif
@@ -27,7 +28,6 @@
#include "entry-header.S"
-
.align 5
#if !(IS_ENABLED(CONFIG_TRACE_IRQFLAGS) || IS_ENABLED(CONFIG_CONTEXT_TRACKING))
/*
@@ -40,9 +40,12 @@ ret_fast_syscall:
UNWIND(.fnstart )
UNWIND(.cantunwind )
disable_irq_notrace @ disable interrupts
+ ldr r2, [tsk, #TI_ADDR_LIMIT]
ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing
tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
bne fast_work_pending
+ cmp r2, #TASK_SIZE
+ blne addr_limit_fail
/* perform architecture specific actions before user return */
arch_ret_to_user r1, lr
@@ -66,6 +69,7 @@ ret_fast_syscall:
UNWIND(.cantunwind )
str r0, [sp, #S_R0 + S_OFF]! @ save returned r0
disable_irq_notrace @ disable interrupts
+ ldr r2, [tsk, #TI_ADDR_LIMIT]
ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing
tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
beq no_work_pending
@@ -82,6 +86,7 @@ slow_work_pending:
mov r2, why @ 'syscall'
bl do_work_pending
cmp r0, #0
+ ldreq r2, [tsk, #TI_ADDR_LIMIT]
beq no_work_pending
movlt scno, #(__NR_restart_syscall - __NR_SYSCALL_BASE)
ldmia sp, {r0 - r6} @ have to reload r0 - r6
@@ -99,9 +104,12 @@ ret_slow_syscall:
disable_irq_notrace @ disable interrupts
ENTRY(ret_to_user_from_irq)
ldr r1, [tsk, #TI_FLAGS]
+ ldr r2, [tsk, #TI_ADDR_LIMIT]
tst r1, #_TIF_WORK_MASK
bne slow_work_pending
no_work_pending:
+ cmp r2, #TASK_SIZE
+ blne addr_limit_fail
asm_trace_hardirqs_on save = 0
/* perform architecture specific actions before user return */
@@ -125,6 +133,28 @@ ENTRY(ret_from_fork)
b ret_slow_syscall
ENDPROC(ret_from_fork)
+addr_limit_fail:
+#ifdef CONFIG_BUG_ON_DATA_CORRUPTION
+ stmfd sp!, {r0, lr}
+ bl verify_pre_usermode_state
+ ldmfd sp!, {r0, lr}
+#else
+ /*
+ * Similar to set_fs(USER_DS) in verify_pre_usermode_state without a
+ * warning.
+ */
+ mov r2, #TASK_SIZE
+ str r2, [tsk, #TI_ADDR_LIMIT]
+#ifdef CONFIG_CPU_USE_DOMAINS
+ /* Switch domain like modify_domain(DOMAIN_KERNEL, DOMAIN_CLIENT) */
+ mrc p15, 0, r2, c3, c0, 0 @ Get domain register
+ bic r2, r2, #DOMAIN_KERNEL_MASK @ Clear the domain part
+ orr r2, r2, #DOMAIN_CLIENT_VALUE @ Enforce DOMAIN_CLIENT
+ mcr p15, 0, r2, c3, c0, 0 @ Set domain register
+#endif
+#endif
+ ret lr
+
/*=============================================================================
* SWI handler
*-----------------------------------------------------------------------------
--
2.12.0.246.ga2ecc84866-goog
^ permalink raw reply related [flat|nested] 46+ messages in thread
* [PATCH v3 4/4] arm64/syscalls: Specific usage of verify_pre_usermode_state
2017-03-11 0:04 ` [kernel-hardening] " Thomas Garnier
@ 2017-03-11 0:05 ` Thomas Garnier
-1 siblings, 0 replies; 46+ messages in thread
From: Thomas Garnier @ 2017-03-11 0:05 UTC (permalink / raw)
To: Martin Schwidefsky, Heiko Carstens, David Howells, Arnd Bergmann,
Al Viro, Dave Hansen, Thomas Garnier, René Nyffenegger,
Andrew Morton, Kees Cook, Paul E . McKenney, Andy Lutomirski,
Ard Biesheuvel, Nicolas Pitre, Petr Mladek,
Sebastian Andrzej Siewior, Sergey Senozhatsky, Helge Deller,
Rik van Riel, Ingo Molnar, John Stultz, Thomas Gleixner
Cc: linux-s390, linux-kernel, linux-api, x86, linux-arm-kernel,
kernel-hardening
Implement specific usage of verify_pre_usermode_state for user-mode
returns for arm64.
---
Based on next-20170308
---
arch/arm64/Kconfig | 1 +
arch/arm64/kernel/entry.S | 25 +++++++++++++++++++++++++
2 files changed, 26 insertions(+)
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 896eba61e5ed..da54774838d8 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -24,6 +24,7 @@ config ARM64
select ARCH_WANT_COMPAT_IPC_PARSE_VERSION
select ARCH_WANT_FRAME_POINTERS
select ARCH_HAS_UBSAN_SANITIZE_ALL
+ select ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
select ARM_AMBA
select ARM_ARCH_TIMER
select ARM_GIC
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 43512d4d7df2..685b43771ac9 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -744,6 +744,10 @@ ENDPROC(cpu_switch_to)
ret_fast_syscall:
disable_irq // disable interrupts
str x0, [sp, #S_X0] // returned x0
+ ldr x2, [tsk, #TSK_TI_ADDR_LIMIT] // check addr limit change
+ mov x1, #TASK_SIZE_64
+ cmp x2, x1
+ b.ne addr_limit_fail
ldr x1, [tsk, #TSK_TI_FLAGS] // re-check for syscall tracing
and x2, x1, #_TIF_SYSCALL_WORK
cbnz x2, ret_fast_syscall_trace
@@ -771,6 +775,11 @@ work_pending:
*/
ret_to_user:
disable_irq // disable interrupts
+ ldr x2, [tsk, #TSK_TI_ADDR_LIMIT] // check addr limit change
+ mov x1, #TASK_SIZE_64
+ cmp x2, x1
+ b.ne addr_limit_fail
+
ldr x1, [tsk, #TSK_TI_FLAGS]
and x2, x1, #_TIF_WORK_MASK
cbnz x2, work_pending
@@ -779,6 +788,22 @@ finish_ret_to_user:
kernel_exit 0
ENDPROC(ret_to_user)
+addr_limit_fail:
+#ifdef CONFIG_BUG_ON_DATA_CORRUPTION
+ stp x0, lr, [sp,#-16]!
+ bl verify_pre_usermode_state
+ ldp x0, lr, [sp],#16
+#else
+ /*
+ * Similar to set_fs(USER_DS) in verify_pre_usermode_state without a
+ * warning.
+ */
+ mov x2, #TASK_SIZE_64
+ str x2, [tsk, #TSK_TI_ADDR_LIMIT]
+ ALTERNATIVE(nop, SET_PSTATE_UAO(0), ARM64_HAS_UAO, CONFIG_ARM64_UAO)
+#endif
+ ret lr
+
/*
* This is how we return from a fork.
*/
--
2.12.0.246.ga2ecc84866-goog
^ permalink raw reply related [flat|nested] 46+ messages in thread
* [kernel-hardening] [PATCH v3 4/4] arm64/syscalls: Specific usage of verify_pre_usermode_state
@ 2017-03-11 0:05 ` Thomas Garnier
0 siblings, 0 replies; 46+ messages in thread
From: Thomas Garnier @ 2017-03-11 0:05 UTC (permalink / raw)
To: Martin Schwidefsky, Heiko Carstens, David Howells, Arnd Bergmann,
Al Viro, Dave Hansen, Thomas Garnier, René Nyffenegger,
Andrew Morton, Kees Cook, Paul E . McKenney, Andy Lutomirski,
Ard Biesheuvel, Nicolas Pitre, Petr Mladek,
Sebastian Andrzej Siewior, Sergey Senozhatsky, Helge Deller,
Rik van Riel, Ingo Molnar, John Stultz, Thomas Gleixner,
Oleg Nesterov, Stephen Smalley, Pavel Tikhomirov,
Frederic Weisbecker, Stanislav Kinsburskiy, Ingo Molnar,
H . Peter Anvin, Paolo Bonzini, Dmitry Safonov, Borislav Petkov,
Josh Poimboeuf, Brian Gerst, Jan Beulich, Christian Borntraeger,
Fenghua Yu, He Chen, Russell King, Vladimir Murzin, Will Deacon,
Catalin Marinas, Mark Rutland, James Morse, David A . Long,
Pratyush Anand, Laura Abbott, Andre Przywara, Chris Metcalf
Cc: linux-s390, linux-kernel, linux-api, x86, linux-arm-kernel,
kernel-hardening
Implement specific usage of verify_pre_usermode_state for user-mode
returns for arm64.
---
Based on next-20170308
---
arch/arm64/Kconfig | 1 +
arch/arm64/kernel/entry.S | 25 +++++++++++++++++++++++++
2 files changed, 26 insertions(+)
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 896eba61e5ed..da54774838d8 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -24,6 +24,7 @@ config ARM64
select ARCH_WANT_COMPAT_IPC_PARSE_VERSION
select ARCH_WANT_FRAME_POINTERS
select ARCH_HAS_UBSAN_SANITIZE_ALL
+ select ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
select ARM_AMBA
select ARM_ARCH_TIMER
select ARM_GIC
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 43512d4d7df2..685b43771ac9 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -744,6 +744,10 @@ ENDPROC(cpu_switch_to)
ret_fast_syscall:
disable_irq // disable interrupts
str x0, [sp, #S_X0] // returned x0
+ ldr x2, [tsk, #TSK_TI_ADDR_LIMIT] // check addr limit change
+ mov x1, #TASK_SIZE_64
+ cmp x2, x1
+ b.ne addr_limit_fail
ldr x1, [tsk, #TSK_TI_FLAGS] // re-check for syscall tracing
and x2, x1, #_TIF_SYSCALL_WORK
cbnz x2, ret_fast_syscall_trace
@@ -771,6 +775,11 @@ work_pending:
*/
ret_to_user:
disable_irq // disable interrupts
+ ldr x2, [tsk, #TSK_TI_ADDR_LIMIT] // check addr limit change
+ mov x1, #TASK_SIZE_64
+ cmp x2, x1
+ b.ne addr_limit_fail
+
ldr x1, [tsk, #TSK_TI_FLAGS]
and x2, x1, #_TIF_WORK_MASK
cbnz x2, work_pending
@@ -779,6 +788,22 @@ finish_ret_to_user:
kernel_exit 0
ENDPROC(ret_to_user)
+addr_limit_fail:
+#ifdef CONFIG_BUG_ON_DATA_CORRUPTION
+ stp x0, lr, [sp,#-16]!
+ bl verify_pre_usermode_state
+ ldp x0, lr, [sp],#16
+#else
+ /*
+ * Similar to set_fs(USER_DS) in verify_pre_usermode_state without a
+ * warning.
+ */
+ mov x2, #TASK_SIZE_64
+ str x2, [tsk, #TSK_TI_ADDR_LIMIT]
+ ALTERNATIVE(nop, SET_PSTATE_UAO(0), ARM64_HAS_UAO, CONFIG_ARM64_UAO)
+#endif
+ ret lr
+
/*
* This is how we return from a fork.
*/
--
2.12.0.246.ga2ecc84866-goog
^ permalink raw reply related [flat|nested] 46+ messages in thread
* Re: [PATCH v3 2/4] x86/syscalls: Specific usage of verify_pre_usermode_state
2017-03-11 0:04 ` [kernel-hardening] " Thomas Garnier
@ 2017-03-11 9:42 ` Ingo Molnar
-1 siblings, 0 replies; 46+ messages in thread
From: Ingo Molnar @ 2017-03-11 9:42 UTC (permalink / raw)
To: Thomas Garnier
Cc: Martin Schwidefsky, Heiko Carstens, David Howells, Arnd Bergmann,
Al Viro, Dave Hansen, René Nyffenegger, Andrew Morton,
Kees Cook, Paul E . McKenney, Andy Lutomirski, Ard Biesheuvel,
Nicolas Pitre, Petr Mladek, Sebastian Andrzej Siewior,
Sergey Senozhatsky, Helge Deller, Rik van Riel, John Stultz,
Thomas Gleixner, Oleg Nesterov, Stephen Smalley
* Thomas Garnier <thgarnie@google.com> wrote:
> Implement specific usage of verify_pre_usermode_state for user-mode
> returns for x86.
> ---
> Based on next-20170308
> ---
> arch/x86/Kconfig | 1 +
> arch/x86/entry/common.c | 3 +++
> arch/x86/entry/entry_64.S | 19 +++++++++++++++++++
> arch/x86/include/asm/pgtable_64_types.h | 11 +++++++++++
> arch/x86/include/asm/processor.h | 11 -----------
> 5 files changed, 34 insertions(+), 11 deletions(-)
>
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index 005df7c825f5..6d48e18e6f09 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -63,6 +63,7 @@ config X86
> select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
> select ARCH_MIGHT_HAVE_PC_PARPORT
> select ARCH_MIGHT_HAVE_PC_SERIO
> + select ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
> select ARCH_SUPPORTS_ATOMIC_RMW
> select ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
> select ARCH_SUPPORTS_NUMA_BALANCING if X86_64
> diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
> index 370c42c7f046..525edbb77f03 100644
> --- a/arch/x86/entry/common.c
> +++ b/arch/x86/entry/common.c
> @@ -22,6 +22,7 @@
> #include <linux/context_tracking.h>
> #include <linux/user-return-notifier.h>
> #include <linux/uprobes.h>
> +#include <linux/syscalls.h>
>
> #include <asm/desc.h>
> #include <asm/traps.h>
> @@ -180,6 +181,8 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
> struct thread_info *ti = current_thread_info();
> u32 cached_flags;
>
> + verify_pre_usermode_state();
> +
> if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled()))
> local_irq_disable();
>
> diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> index d2b2a2948ffe..04db589be466 100644
> --- a/arch/x86/entry/entry_64.S
> +++ b/arch/x86/entry/entry_64.S
> @@ -218,6 +218,25 @@ entry_SYSCALL_64_fastpath:
> testl $_TIF_ALLWORK_MASK, TASK_TI_flags(%r11)
> jnz 1f
>
> + /*
> + * Check user-mode state on fast path return, the same check is done
> + * under the slow path through syscall_return_slowpath.
> + */
> +#ifdef CONFIG_BUG_ON_DATA_CORRUPTION
> + call verify_pre_usermode_state
> +#else
> + /*
> + * Similar to set_fs(USER_DS) in verify_pre_usermode_state without a
> + * warning.
> + */
> + movq PER_CPU_VAR(current_task), %rax
> + movq $TASK_SIZE_MAX, %rcx
> + cmp %rcx, TASK_addr_limit(%rax)
> + jz 1f
> + movq %rcx, TASK_addr_limit(%rax)
> +1:
> +#endif
> +
> LOCKDEP_SYS_EXIT
> TRACE_IRQS_ON /* user mode is traced as IRQs on */
> movq RIP(%rsp), %rcx
Ugh, so you call an assembly function just to ... call another function.
Plus why is it in assembly to begin with? Is this some older code that got
written when the x86 entry code was in assembly, and never properly
converted to C?
Thanks,
Ingo
^ permalink raw reply [flat|nested] 46+ messages in thread
* [kernel-hardening] Re: [PATCH v3 2/4] x86/syscalls: Specific usage of verify_pre_usermode_state
@ 2017-03-11 9:42 ` Ingo Molnar
0 siblings, 0 replies; 46+ messages in thread
From: Ingo Molnar @ 2017-03-11 9:42 UTC (permalink / raw)
To: Thomas Garnier
Cc: Martin Schwidefsky, Heiko Carstens, David Howells, Arnd Bergmann,
Al Viro, Dave Hansen, René Nyffenegger, Andrew Morton,
Kees Cook, Paul E . McKenney, Andy Lutomirski, Ard Biesheuvel,
Nicolas Pitre, Petr Mladek, Sebastian Andrzej Siewior,
Sergey Senozhatsky, Helge Deller, Rik van Riel, John Stultz,
Thomas Gleixner, Oleg Nesterov, Stephen Smalley,
Pavel Tikhomirov, Frederic Weisbecker, Stanislav Kinsburskiy,
Ingo Molnar, H . Peter Anvin, Paolo Bonzini, Dmitry Safonov,
Borislav Petkov, Josh Poimboeuf, Brian Gerst, Jan Beulich,
Christian Borntraeger, Fenghua Yu, He Chen, Russell King,
Vladimir Murzin, Will Deacon, Catalin Marinas, Mark Rutland,
James Morse, David A . Long, Pratyush Anand, Laura Abbott,
Andre Przywara, Chris Metcalf, linux-s390, linux-kernel,
linux-api, x86, linux-arm-kernel, kernel-hardening
* Thomas Garnier <thgarnie@google.com> wrote:
> Implement specific usage of verify_pre_usermode_state for user-mode
> returns for x86.
> ---
> Based on next-20170308
> ---
> arch/x86/Kconfig | 1 +
> arch/x86/entry/common.c | 3 +++
> arch/x86/entry/entry_64.S | 19 +++++++++++++++++++
> arch/x86/include/asm/pgtable_64_types.h | 11 +++++++++++
> arch/x86/include/asm/processor.h | 11 -----------
> 5 files changed, 34 insertions(+), 11 deletions(-)
>
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index 005df7c825f5..6d48e18e6f09 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -63,6 +63,7 @@ config X86
> select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
> select ARCH_MIGHT_HAVE_PC_PARPORT
> select ARCH_MIGHT_HAVE_PC_SERIO
> + select ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
> select ARCH_SUPPORTS_ATOMIC_RMW
> select ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
> select ARCH_SUPPORTS_NUMA_BALANCING if X86_64
> diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
> index 370c42c7f046..525edbb77f03 100644
> --- a/arch/x86/entry/common.c
> +++ b/arch/x86/entry/common.c
> @@ -22,6 +22,7 @@
> #include <linux/context_tracking.h>
> #include <linux/user-return-notifier.h>
> #include <linux/uprobes.h>
> +#include <linux/syscalls.h>
>
> #include <asm/desc.h>
> #include <asm/traps.h>
> @@ -180,6 +181,8 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
> struct thread_info *ti = current_thread_info();
> u32 cached_flags;
>
> + verify_pre_usermode_state();
> +
> if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled()))
> local_irq_disable();
>
> diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> index d2b2a2948ffe..04db589be466 100644
> --- a/arch/x86/entry/entry_64.S
> +++ b/arch/x86/entry/entry_64.S
> @@ -218,6 +218,25 @@ entry_SYSCALL_64_fastpath:
> testl $_TIF_ALLWORK_MASK, TASK_TI_flags(%r11)
> jnz 1f
>
> + /*
> + * Check user-mode state on fast path return, the same check is done
> + * under the slow path through syscall_return_slowpath.
> + */
> +#ifdef CONFIG_BUG_ON_DATA_CORRUPTION
> + call verify_pre_usermode_state
> +#else
> + /*
> + * Similar to set_fs(USER_DS) in verify_pre_usermode_state without a
> + * warning.
> + */
> + movq PER_CPU_VAR(current_task), %rax
> + movq $TASK_SIZE_MAX, %rcx
> + cmp %rcx, TASK_addr_limit(%rax)
> + jz 1f
> + movq %rcx, TASK_addr_limit(%rax)
> +1:
> +#endif
> +
> LOCKDEP_SYS_EXIT
> TRACE_IRQS_ON /* user mode is traced as IRQs on */
> movq RIP(%rsp), %rcx
Ugh, so you call an assembly function just to ... call another function.
Plus why is it in assembly to begin with? Is this some older code that got
written when the x86 entry code was in assembly, and never properly
converted to C?
Thanks,
Ingo
^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH v3 2/4] x86/syscalls: Specific usage of verify_pre_usermode_state
2017-03-11 9:42 ` [kernel-hardening] " Ingo Molnar
@ 2017-03-13 15:53 ` Thomas Garnier
-1 siblings, 0 replies; 46+ messages in thread
From: Thomas Garnier @ 2017-03-13 15:53 UTC (permalink / raw)
To: Ingo Molnar
Cc: Martin Schwidefsky, Heiko Carstens, David Howells, Arnd Bergmann,
Al Viro, Dave Hansen, René Nyffenegger, Andrew Morton,
Kees Cook, Paul E . McKenney, Andy Lutomirski, Ard Biesheuvel,
Nicolas Pitre, Petr Mladek, Sebastian Andrzej Siewior,
Sergey Senozhatsky, Helge Deller, Rik van Riel, John Stultz,
Thomas Gleixner, Oleg Nesterov, Stephen Smalley
On Sat, Mar 11, 2017 at 1:42 AM, Ingo Molnar <mingo@kernel.org> wrote:
>
> Ugh, so you call an assembly function just to ... call another function.
>
The verify_pre_usermode_state function is the architecture-independent
checker. By default it is added on each syscall handler, so calling it
here saves code size.
> Plus why is it in assembly to begin with? Is this some older code that got
> written when the x86 entry code was in assembly, and never properly
> converted to C?
I wrote the assembly to make it faster and save a call on each syscall
return. I can just call verify_pre_usermode_state if you prefer.
--
Thomas
^ permalink raw reply [flat|nested] 46+ messages in thread
* [kernel-hardening] Re: [PATCH v3 2/4] x86/syscalls: Specific usage of verify_pre_usermode_state
@ 2017-03-13 15:53 ` Thomas Garnier
0 siblings, 0 replies; 46+ messages in thread
From: Thomas Garnier @ 2017-03-13 15:53 UTC (permalink / raw)
To: Ingo Molnar
Cc: Martin Schwidefsky, Heiko Carstens, David Howells, Arnd Bergmann,
Al Viro, Dave Hansen, René Nyffenegger, Andrew Morton,
Kees Cook, Paul E . McKenney, Andy Lutomirski, Ard Biesheuvel,
Nicolas Pitre, Petr Mladek, Sebastian Andrzej Siewior,
Sergey Senozhatsky, Helge Deller, Rik van Riel, John Stultz,
Thomas Gleixner, Oleg Nesterov, Stephen Smalley,
Pavel Tikhomirov, Frederic Weisbecker, Stanislav Kinsburskiy,
Ingo Molnar, H . Peter Anvin, Paolo Bonzini, Dmitry Safonov,
Borislav Petkov, Josh Poimboeuf, Brian Gerst, Jan Beulich,
Christian Borntraeger, Fenghua Yu, He Chen, Russell King,
Vladimir Murzin, Will Deacon, Catalin Marinas, Mark Rutland,
James Morse, David A . Long, Pratyush Anand, Laura Abbott,
Andre Przywara, Chris Metcalf, linux-s390, LKML, Linux API,
the arch/x86 maintainers, linux-arm-kernel, Kernel Hardening
On Sat, Mar 11, 2017 at 1:42 AM, Ingo Molnar <mingo@kernel.org> wrote:
>
> Ugh, so you call an assembly function just to ... call another function.
>
The verify_pre_usermode_state function is the architecture-independent
checker. By default it is added on each syscall handler, so calling it
here saves code size.
> Plus why is it in assembly to begin with? Is this some older code that got
> written when the x86 entry code was in assembly, and never properly
> converted to C?
I wrote the assembly to make it faster and save a call on each syscall
return. I can just call verify_pre_usermode_state if you prefer.
--
Thomas
^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH v3 2/4] x86/syscalls: Specific usage of verify_pre_usermode_state
@ 2017-03-13 21:48 ` H. Peter Anvin
0 siblings, 0 replies; 46+ messages in thread
From: H. Peter Anvin @ 2017-03-13 21:48 UTC (permalink / raw)
To: Ingo Molnar, Thomas Garnier
Cc: Martin Schwidefsky, Heiko Carstens, David Howells, Arnd Bergmann,
Al Viro, Dave Hansen, René Nyffenegger, Andrew Morton,
Kees Cook, Paul E . McKenney, Andy Lutomirski, Ard Biesheuvel,
Nicolas Pitre, Petr Mladek, Sebastian Andrzej Siewior,
Sergey Senozhatsky, Helge Deller, Rik van Riel, John Stultz,
Thomas Gleixner, Oleg Nesterov, Stephen Smalley,
Pavel Tikhomirov, Frederic Weisbecker, Stanislav.Kinsburskiy
<skinsbursky@virtuozzo.com>,Ingo Molnar <mingo@redhat.com>,Paolo Bonzini <pbonzini@redhat.com>,Dmitry Safonov <dsafonov@virtuozzo.com>,Borislav Petkov <bp@alien8.de>,Josh Poimboeuf <jpoimboe@redhat.com>,Brian Gerst <brgerst@gmail.com>,Jan Beulich <JBeulich@suse.com>,Christian Borntraeger <borntraeger@de.ibm.com>,Fenghua Yu <fenghua.yu@intel.com>,He Chen <he.chen@linux.intel.com>,Russell King <linux@armlinux.org.uk>,Vladimir Murzin <vladimir.murzin@arm.com>,Will Deacon <will.deacon@arm.com>,Catalin Marinas <catalin.marinas@arm.com>,Mark Rutland <mark.rutland@arm.com>,James Morse <james.morse@arm.com>,"David A . Long" <dave.long@linaro.org>,Pratyush Anand <panand@redhat.com>,Laura Abbott <labbott@redhat.com>,Andre Przywara <andre.przywara@arm.com>,Chris Metcalf <cmetcalf@mellanox.com>,linux-s390@vger.kernel.org,linux-kernel@vger.kernel.org,linux-api@vger.kernel.org,x86@kernel.org,linux-arm-kernel@lists.infradead.org,kernel-hardening@lists.openwall.com
From: hpa@zytor.com
Message-ID: <BB78ABF9-382E-43E8-BAC6-1EA6416A30DB@zytor.com>
On March 11, 2017 1:42:00 AM PST, Ingo Molnar <mingo@kernel.org> wrote:
>
>* Thomas Garnier <thgarnie@google.com> wrote:
>
>> Implement specific usage of verify_pre_usermode_state for user-mode
>> returns for x86.
>> ---
>> Based on next-20170308
>> ---
>> arch/x86/Kconfig | 1 +
>> arch/x86/entry/common.c | 3 +++
>> arch/x86/entry/entry_64.S | 19 +++++++++++++++++++
>> arch/x86/include/asm/pgtable_64_types.h | 11 +++++++++++
>> arch/x86/include/asm/processor.h | 11 -----------
>> 5 files changed, 34 insertions(+), 11 deletions(-)
>>
>> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
>> index 005df7c825f5..6d48e18e6f09 100644
>> --- a/arch/x86/Kconfig
>> +++ b/arch/x86/Kconfig
>> @@ -63,6 +63,7 @@ config X86
>> select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
>> select ARCH_MIGHT_HAVE_PC_PARPORT
>> select ARCH_MIGHT_HAVE_PC_SERIO
>> + select ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
>> select ARCH_SUPPORTS_ATOMIC_RMW
>> select ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
>> select ARCH_SUPPORTS_NUMA_BALANCING if X86_64
>> diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
>> index 370c42c7f046..525edbb77f03 100644
>> --- a/arch/x86/entry/common.c
>> +++ b/arch/x86/entry/common.c
>> @@ -22,6 +22,7 @@
>> #include <linux/context_tracking.h>
>> #include <linux/user-return-notifier.h>
>> #include <linux/uprobes.h>
>> +#include <linux/syscalls.h>
>>
>> #include <asm/desc.h>
>> #include <asm/traps.h>
>> @@ -180,6 +181,8 @@ __visible inline void
>prepare_exit_to_usermode(struct pt_regs *regs)
>> struct thread_info *ti = current_thread_info();
>> u32 cached_flags;
>>
>> + verify_pre_usermode_state();
>> +
>> if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled()))
>> local_irq_disable();
>>
>> diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
>> index d2b2a2948ffe..04db589be466 100644
>> --- a/arch/x86/entry/entry_64.S
>> +++ b/arch/x86/entry/entry_64.S
>> @@ -218,6 +218,25 @@ entry_SYSCALL_64_fastpath:
>> testl $_TIF_ALLWORK_MASK, TASK_TI_flags(%r11)
>> jnz 1f
>>
>> + /*
>> + * Check user-mode state on fast path return, the same check is
>done
>> + * under the slow path through syscall_return_slowpath.
>> + */
>> +#ifdef CONFIG_BUG_ON_DATA_CORRUPTION
>> + call verify_pre_usermode_state
>> +#else
>> + /*
>> + * Similar to set_fs(USER_DS) in verify_pre_usermode_state without
>a
>> + * warning.
>> + */
>> + movq PER_CPU_VAR(current_task), %rax
>> + movq $TASK_SIZE_MAX, %rcx
>> + cmp %rcx, TASK_addr_limit(%rax)
>> + jz 1f
>> + movq %rcx, TASK_addr_limit(%rax)
>> +1:
>> +#endif
>> +
>> LOCKDEP_SYS_EXIT
>> TRACE_IRQS_ON /* user mode is traced as IRQs on */
>> movq RIP(%rsp), %rcx
>
>Ugh, so you call an assembly function just to ... call another
>function.
>
>Plus why is it in assembly to begin with? Is this some older code that
>got
>written when the x86 entry code was in assembly, and never properly
>converted to C?
>
>Thanks,
>
> Ingo
The code does a compare to jump around a store. It would be much cleaner and faster to simply clobber the value unconditionally. If there is a test it should be to avoid the function call, not (only) the assignment.
--
Sent from my Android device with K-9 Mail. Please excuse my brevity.
^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH v3 2/4] x86/syscalls: Specific usage of verify_pre_usermode_state
@ 2017-03-13 21:48 ` H. Peter Anvin
0 siblings, 0 replies; 46+ messages in thread
From: H. Peter Anvin @ 2017-03-13 21:48 UTC (permalink / raw)
To: Ingo Molnar, Thomas Garnier
Cc: Martin Schwidefsky, Heiko Carstens, David Howells, Arnd Bergmann,
Al Viro, Dave Hansen, René Nyffenegger, Andrew Morton,
Kees Cook, Paul E . McKenney, Andy Lutomirski, Ard Biesheuvel,
Nicolas Pitre, Petr Mladek, Sebastian Andrzej Siewior,
Sergey Senozhatsky, Helge Deller, Rik van Riel
<skinsbursky-5HdwGun5lf+gSpxsJD1C4w@public.gmane.org>,Ingo Molnar <mingo-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>,Paolo Bonzini <pbonzini-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>,Dmitry Safonov <dsafonov-5HdwGun5lf+gSpxsJD1C4w@public.gmane.org>,Borislav Petkov <bp-Gina5bIWoIWzQB+pC5nmwQ@public.gmane.org>,Josh Poimboeuf <jpoimboe-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>,Brian Gerst <brgerst-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>,Jan Beulich <JBeulich-IBi9RG/b67k@public.gmane.org>,Christian Borntraeger <borntraeger-tA70FqPdS9bQT0dZR+AlfA@public.gmane.org>,Fenghua Yu <fenghua.yu-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>,He Chen <he.chen-VuQAYsv1563Yd54FQh9/CA@public.gmane.org>,Russell King <linux-I+IVW8TIWO2tmTQ+vhA3Yw@public.gmane.org>,Vladimir Murzin <vladimir.murzin-5wv7dgnIgG8@public.gmane.org>,Will Deacon <will.deacon-5wv7dgnIgG8@public.gmane.org>,Catalin Marinas <catalin.marinas-5wv7dgnIgG8@public.gmane.org>,Mark Rutland <mark.rutland-5wv7dgnIgG8@public.gmane.org>,James Morse <james.morse-5wv7dgnIgG8@public.gmane.org>,"David A . Long" <dave.long-QSEj5FYQhm4dnm+yROfE0A@public.gmane.org>,Pratyush Anand <panand-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>,Laura Abbott <labbott-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>,Andre Przywara <andre.przywara-5wv7dgnIgG8@public.gmane.org>,Chris Metcalf <cmetcalf-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>,linux-
s390-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,linux-api-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,x86-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org,linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r@public.gmane.org,kernel-hardening-ZwoEplunGu1jrUoiu81ncdBPR1lH4CV8@public.gmane.org
From: hpa-YMNOUZJC4hwAvxtiuMwx3w@public.gmane.org
Message-ID: <BB78ABF9-382E-43E8-BAC6-1EA6416A30DB-YMNOUZJC4hwAvxtiuMwx3w@public.gmane.org>
On March 11, 2017 1:42:00 AM PST, Ingo Molnar <mingo-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org> wrote:
>
>* Thomas Garnier <thgarnie-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> wrote:
>
>> Implement specific usage of verify_pre_usermode_state for user-mode
>> returns for x86.
>> ---
>> Based on next-20170308
>> ---
>> arch/x86/Kconfig | 1 +
>> arch/x86/entry/common.c | 3 +++
>> arch/x86/entry/entry_64.S | 19 +++++++++++++++++++
>> arch/x86/include/asm/pgtable_64_types.h | 11 +++++++++++
>> arch/x86/include/asm/processor.h | 11 -----------
>> 5 files changed, 34 insertions(+), 11 deletions(-)
>>
>> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
>> index 005df7c825f5..6d48e18e6f09 100644
>> --- a/arch/x86/Kconfig
>> +++ b/arch/x86/Kconfig
>> @@ -63,6 +63,7 @@ config X86
>> select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
>> select ARCH_MIGHT_HAVE_PC_PARPORT
>> select ARCH_MIGHT_HAVE_PC_SERIO
>> + select ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
>> select ARCH_SUPPORTS_ATOMIC_RMW
>> select ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
>> select ARCH_SUPPORTS_NUMA_BALANCING if X86_64
>> diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
>> index 370c42c7f046..525edbb77f03 100644
>> --- a/arch/x86/entry/common.c
>> +++ b/arch/x86/entry/common.c
>> @@ -22,6 +22,7 @@
>> #include <linux/context_tracking.h>
>> #include <linux/user-return-notifier.h>
>> #include <linux/uprobes.h>
>> +#include <linux/syscalls.h>
>>
>> #include <asm/desc.h>
>> #include <asm/traps.h>
>> @@ -180,6 +181,8 @@ __visible inline void
>prepare_exit_to_usermode(struct pt_regs *regs)
>> struct thread_info *ti = current_thread_info();
>> u32 cached_flags;
>>
>> + verify_pre_usermode_state();
>> +
>> if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled()))
>> local_irq_disable();
>>
>> diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
>> index d2b2a2948ffe..04db589be466 100644
>> --- a/arch/x86/entry/entry_64.S
>> +++ b/arch/x86/entry/entry_64.S
>> @@ -218,6 +218,25 @@ entry_SYSCALL_64_fastpath:
>> testl $_TIF_ALLWORK_MASK, TASK_TI_flags(%r11)
>> jnz 1f
>>
>> + /*
>> + * Check user-mode state on fast path return, the same check is
>done
>> + * under the slow path through syscall_return_slowpath.
>> + */
>> +#ifdef CONFIG_BUG_ON_DATA_CORRUPTION
>> + call verify_pre_usermode_state
>> +#else
>> + /*
>> + * Similar to set_fs(USER_DS) in verify_pre_usermode_state without
>a
>> + * warning.
>> + */
>> + movq PER_CPU_VAR(current_task), %rax
>> + movq $TASK_SIZE_MAX, %rcx
>> + cmp %rcx, TASK_addr_limit(%rax)
>> + jz 1f
>> + movq %rcx, TASK_addr_limit(%rax)
>> +1:
>> +#endif
>> +
>> LOCKDEP_SYS_EXIT
>> TRACE_IRQS_ON /* user mode is traced as IRQs on */
>> movq RIP(%rsp), %rcx
>
>Ugh, so you call an assembly function just to ... call another
>function.
>
>Plus why is it in assembly to begin with? Is this some older code that
>got
>written when the x86 entry code was in assembly, and never properly
>converted to C?
>
>Thanks,
>
> Ingo
The code does a compare to jump around a store. It would be much cleaner and faster to simply clobber the value unconditionally. If there is a test it should be to avoid the function call, not (only) the assignment.
--
Sent from my Android device with K-9 Mail. Please excuse my brevity.
^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH v3 2/4] x86/syscalls: Specific usage of verify_pre_usermode_state
2017-03-11 9:42 ` [kernel-hardening] " Ingo Molnar
@ 2017-03-14 0:04 ` H. Peter Anvin
-1 siblings, 0 replies; 46+ messages in thread
From: H. Peter Anvin @ 2017-03-14 0:04 UTC (permalink / raw)
To: Ingo Molnar, Thomas Garnier
Cc: Martin Schwidefsky, Heiko Carstens, David Howells, Arnd Bergmann,
Al Viro, Dave Hansen, René Nyffenegger, Andrew Morton,
Kees Cook, Paul E . McKenney, Andy Lutomirski, Ard Biesheuvel,
Nicolas Pitre, Petr Mladek, Sebastian Andrzej Siewior,
Sergey Senozhatsky, Helge Deller, Rik van Riel, John Stultz
On 03/11/17 01:42, Ingo Molnar wrote:
>>
>> + /*
>> + * Check user-mode state on fast path return, the same check is done
>> + * under the slow path through syscall_return_slowpath.
>> + */
>> +#ifdef CONFIG_BUG_ON_DATA_CORRUPTION
>> + call verify_pre_usermode_state
>> +#else
>> + /*
>> + * Similar to set_fs(USER_DS) in verify_pre_usermode_state without a
>> + * warning.
>> + */
>> + movq PER_CPU_VAR(current_task), %rax
>> + movq $TASK_SIZE_MAX, %rcx
>> + cmp %rcx, TASK_addr_limit(%rax)
>> + jz 1f
>> + movq %rcx, TASK_addr_limit(%rax)
>> +1:
>> +#endif
>> +
How about simply doing...
movq PER_CPU_VAR(current_task), %rax
movq $TASK_SIZE_MAX, %rcx
#ifdef CONFIG_BUG_ON_DATA_CORRUPTION
cmpq %rcx, TASK_addr_limit(%rax)
jne syscall_return_slowpath
#else
movq %rcx, TASK_addr_limit(%rax)
#endif
... and let the slow path take care of BUG. This should be much faster,
even with the BUG, and is simpler to boot.
-hpa
^ permalink raw reply [flat|nested] 46+ messages in thread
* [kernel-hardening] Re: [PATCH v3 2/4] x86/syscalls: Specific usage of verify_pre_usermode_state
@ 2017-03-14 0:04 ` H. Peter Anvin
0 siblings, 0 replies; 46+ messages in thread
From: H. Peter Anvin @ 2017-03-14 0:04 UTC (permalink / raw)
To: Ingo Molnar, Thomas Garnier
Cc: Martin Schwidefsky, Heiko Carstens, David Howells, Arnd Bergmann,
Al Viro, Dave Hansen, René Nyffenegger, Andrew Morton,
Kees Cook, Paul E . McKenney, Andy Lutomirski, Ard Biesheuvel,
Nicolas Pitre, Petr Mladek, Sebastian Andrzej Siewior,
Sergey Senozhatsky, Helge Deller, Rik van Riel, John Stultz,
Thomas Gleixner, Oleg Nesterov, Stephen Smalley,
Pavel Tikhomirov, Frederic Weisbecker, Stanislav Kinsburskiy,
Ingo Molnar, Paolo Bonzini, Dmitry Safonov, Borislav Petkov,
Josh Poimboeuf, Brian Gerst, Jan Beulich, Christian Borntraeger,
Fenghua Yu, He Chen, Russell King, Vladimir Murzin, Will Deacon,
Catalin Marinas, Mark Rutland, James Morse, David A . Long,
Pratyush Anand, Laura Abbott, Andre Przywara, Chris Metcalf,
linux-s390, linux-kernel, linux-api, x86, linux-arm-kernel,
kernel-hardening
On 03/11/17 01:42, Ingo Molnar wrote:
>>
>> + /*
>> + * Check user-mode state on fast path return, the same check is done
>> + * under the slow path through syscall_return_slowpath.
>> + */
>> +#ifdef CONFIG_BUG_ON_DATA_CORRUPTION
>> + call verify_pre_usermode_state
>> +#else
>> + /*
>> + * Similar to set_fs(USER_DS) in verify_pre_usermode_state without a
>> + * warning.
>> + */
>> + movq PER_CPU_VAR(current_task), %rax
>> + movq $TASK_SIZE_MAX, %rcx
>> + cmp %rcx, TASK_addr_limit(%rax)
>> + jz 1f
>> + movq %rcx, TASK_addr_limit(%rax)
>> +1:
>> +#endif
>> +
How about simply doing...
movq PER_CPU_VAR(current_task), %rax
movq $TASK_SIZE_MAX, %rcx
#ifdef CONFIG_BUG_ON_DATA_CORRUPTION
cmpq %rcx, TASK_addr_limit(%rax)
jne syscall_return_slowpath
#else
movq %rcx, TASK_addr_limit(%rax)
#endif
... and let the slow path take care of BUG. This should be much faster,
even with the BUG, and is simpler to boot.
-hpa
^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH v3 2/4] x86/syscalls: Specific usage of verify_pre_usermode_state
2017-03-14 0:04 ` [kernel-hardening] " H. Peter Anvin
@ 2017-03-14 9:40 ` H. Peter Anvin
-1 siblings, 0 replies; 46+ messages in thread
From: H. Peter Anvin @ 2017-03-14 9:40 UTC (permalink / raw)
To: Ingo Molnar, Thomas Garnier
Cc: Martin Schwidefsky, Heiko Carstens, David Howells, Arnd Bergmann,
Al Viro, Dave Hansen, René Nyffenegger, Andrew Morton,
Kees Cook, Paul E . McKenney, Andy Lutomirski, Ard Biesheuvel,
Nicolas Pitre, Petr Mladek, Sebastian Andrzej Siewior,
Sergey Senozhatsky, Helge Deller, Rik van Riel, John Stultz
On 03/13/17 17:04, H. Peter Anvin wrote:
> On 03/11/17 01:42, Ingo Molnar wrote:
>>>
>>> + /*
>>> + * Check user-mode state on fast path return, the same check is done
>>> + * under the slow path through syscall_return_slowpath.
>>> + */
>>> +#ifdef CONFIG_BUG_ON_DATA_CORRUPTION
>>> + call verify_pre_usermode_state
>>> +#else
>>> + /*
>>> + * Similar to set_fs(USER_DS) in verify_pre_usermode_state without a
>>> + * warning.
>>> + */
>>> + movq PER_CPU_VAR(current_task), %rax
>>> + movq $TASK_SIZE_MAX, %rcx
>>> + cmp %rcx, TASK_addr_limit(%rax)
>>> + jz 1f
>>> + movq %rcx, TASK_addr_limit(%rax)
>>> +1:
>>> +#endif
>>> +
>
> How about simply doing...
>
> movq PER_CPU_VAR(current_task), %rax
> movq $TASK_SIZE_MAX, %rcx
> #ifdef CONFIG_BUG_ON_DATA_CORRUPTION
> cmpq %rcx, TASK_addr_limit(%rax)
> jne syscall_return_slowpath
> #else
> movq %rcx, TASK_addr_limit(%rax)
> #endif
>
> ... and let the slow path take care of BUG. This should be much faster,
> even with the BUG, and is simpler to boot.
>
In fact, we could even do the cmpq/jne unconditionally. I'm guessing
the occasional branch mispredict will be offset by occasionally touching
a clean cacheline in the case of an unconditional store.
Since this is something that should never happen, performance doesn't
matter.
-hpa
^ permalink raw reply [flat|nested] 46+ messages in thread
* [kernel-hardening] Re: [PATCH v3 2/4] x86/syscalls: Specific usage of verify_pre_usermode_state
@ 2017-03-14 9:40 ` H. Peter Anvin
0 siblings, 0 replies; 46+ messages in thread
From: H. Peter Anvin @ 2017-03-14 9:40 UTC (permalink / raw)
To: Ingo Molnar, Thomas Garnier
Cc: Martin Schwidefsky, Heiko Carstens, David Howells, Arnd Bergmann,
Al Viro, Dave Hansen, René Nyffenegger, Andrew Morton,
Kees Cook, Paul E . McKenney, Andy Lutomirski, Ard Biesheuvel,
Nicolas Pitre, Petr Mladek, Sebastian Andrzej Siewior,
Sergey Senozhatsky, Helge Deller, Rik van Riel, John Stultz,
Thomas Gleixner, Oleg Nesterov, Stephen Smalley,
Pavel Tikhomirov, Frederic Weisbecker, Stanislav Kinsburskiy,
Ingo Molnar, Paolo Bonzini, Dmitry Safonov, Borislav Petkov,
Josh Poimboeuf, Brian Gerst, Jan Beulich, Christian Borntraeger,
Fenghua Yu, He Chen, Russell King, Vladimir Murzin, Will Deacon,
Catalin Marinas, Mark Rutland, James Morse, David A . Long,
Pratyush Anand, Laura Abbott, Andre Przywara, Chris Metcalf,
linux-s390, linux-kernel, linux-api, x86, linux-arm-kernel,
kernel-hardening
On 03/13/17 17:04, H. Peter Anvin wrote:
> On 03/11/17 01:42, Ingo Molnar wrote:
>>>
>>> + /*
>>> + * Check user-mode state on fast path return, the same check is done
>>> + * under the slow path through syscall_return_slowpath.
>>> + */
>>> +#ifdef CONFIG_BUG_ON_DATA_CORRUPTION
>>> + call verify_pre_usermode_state
>>> +#else
>>> + /*
>>> + * Similar to set_fs(USER_DS) in verify_pre_usermode_state without a
>>> + * warning.
>>> + */
>>> + movq PER_CPU_VAR(current_task), %rax
>>> + movq $TASK_SIZE_MAX, %rcx
>>> + cmp %rcx, TASK_addr_limit(%rax)
>>> + jz 1f
>>> + movq %rcx, TASK_addr_limit(%rax)
>>> +1:
>>> +#endif
>>> +
>
> How about simply doing...
>
> movq PER_CPU_VAR(current_task), %rax
> movq $TASK_SIZE_MAX, %rcx
> #ifdef CONFIG_BUG_ON_DATA_CORRUPTION
> cmpq %rcx, TASK_addr_limit(%rax)
> jne syscall_return_slowpath
> #else
> movq %rcx, TASK_addr_limit(%rax)
> #endif
>
> ... and let the slow path take care of BUG. This should be much faster,
> even with the BUG, and is simpler to boot.
>
In fact, we could even do the cmpq/jne unconditionally. I'm guessing
the occasional branch mispredict will be offset by occasionally touching
a clean cacheline in the case of an unconditional store.
Since this is something that should never happen, performance doesn't
matter.
-hpa
^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH v3 2/4] x86/syscalls: Specific usage of verify_pre_usermode_state
2017-03-14 9:40 ` [kernel-hardening] " H. Peter Anvin
@ 2017-03-14 15:17 ` Thomas Garnier
-1 siblings, 0 replies; 46+ messages in thread
From: Thomas Garnier @ 2017-03-14 15:17 UTC (permalink / raw)
To: H. Peter Anvin
Cc: Ingo Molnar, Martin Schwidefsky, Heiko Carstens, David Howells,
Arnd Bergmann, Al Viro, Dave Hansen, René Nyffenegger,
Andrew Morton, Kees Cook, Paul E . McKenney, Andy Lutomirski,
Ard Biesheuvel, Nicolas Pitre, Petr Mladek,
Sebastian Andrzej Siewior, Sergey Senozhatsky, Helge Deller,
Rik van Riel, John Stultz, Thomas Gleixner, Oleg Nesterov
On Tue, Mar 14, 2017 at 2:40 AM, H. Peter Anvin <hpa@zytor.com> wrote:
> On 03/13/17 17:04, H. Peter Anvin wrote:
>> On 03/11/17 01:42, Ingo Molnar wrote:
>>>>
>>>> + /*
>>>> + * Check user-mode state on fast path return, the same check is done
>>>> + * under the slow path through syscall_return_slowpath.
>>>> + */
>>>> +#ifdef CONFIG_BUG_ON_DATA_CORRUPTION
>>>> + call verify_pre_usermode_state
>>>> +#else
>>>> + /*
>>>> + * Similar to set_fs(USER_DS) in verify_pre_usermode_state without a
>>>> + * warning.
>>>> + */
>>>> + movq PER_CPU_VAR(current_task), %rax
>>>> + movq $TASK_SIZE_MAX, %rcx
>>>> + cmp %rcx, TASK_addr_limit(%rax)
>>>> + jz 1f
>>>> + movq %rcx, TASK_addr_limit(%rax)
>>>> +1:
>>>> +#endif
>>>> +
>>
>> How about simply doing...
>>
>> movq PER_CPU_VAR(current_task), %rax
>> movq $TASK_SIZE_MAX, %rcx
>> #ifdef CONFIG_BUG_ON_DATA_CORRUPTION
>> cmpq %rcx, TASK_addr_limit(%rax)
>> jne syscall_return_slowpath
>> #else
>> movq %rcx, TASK_addr_limit(%rax)
>> #endif
>>
>> ... and let the slow path take care of BUG. This should be much faster,
>> even with the BUG, and is simpler to boot.
>>
>
> In fact, we could even to the cmpq/jne unconditionally. I'm guessing
> the occasional branch mispredict will be offset by occasionally touching
> a clean cacheline in the case of an unconditional store.
>
> Since this is something that should never happen, performance doesn't
> matter.
Ingo: Which approach do you favor? I want to keep the fast path as
fast as possible obviously.
>
> -hpa
>
>
--
Thomas
^ permalink raw reply [flat|nested] 46+ messages in thread
* [kernel-hardening] Re: [PATCH v3 2/4] x86/syscalls: Specific usage of verify_pre_usermode_state
@ 2017-03-14 15:17 ` Thomas Garnier
0 siblings, 0 replies; 46+ messages in thread
From: Thomas Garnier @ 2017-03-14 15:17 UTC (permalink / raw)
To: H. Peter Anvin
Cc: Ingo Molnar, Martin Schwidefsky, Heiko Carstens, David Howells,
Arnd Bergmann, Al Viro, Dave Hansen, René Nyffenegger,
Andrew Morton, Kees Cook, Paul E . McKenney, Andy Lutomirski,
Ard Biesheuvel, Nicolas Pitre, Petr Mladek,
Sebastian Andrzej Siewior, Sergey Senozhatsky, Helge Deller,
Rik van Riel, John Stultz, Thomas Gleixner, Oleg Nesterov,
Stephen Smalley, Pavel Tikhomirov, Frederic Weisbecker,
Stanislav Kinsburskiy, Ingo Molnar, Paolo Bonzini,
Dmitry Safonov, Borislav Petkov, Josh Poimboeuf, Brian Gerst,
Jan Beulich, Christian Borntraeger, Fenghua Yu, He Chen,
Russell King, Vladimir Murzin, Will Deacon, Catalin Marinas,
Mark Rutland, James Morse, David A . Long, Pratyush Anand,
Laura Abbott, Andre Przywara, Chris Metcalf, linux-s390, LKML,
Linux API, the arch/x86 maintainers, linux-arm-kernel,
Kernel Hardening
On Tue, Mar 14, 2017 at 2:40 AM, H. Peter Anvin <hpa@zytor.com> wrote:
> On 03/13/17 17:04, H. Peter Anvin wrote:
>> On 03/11/17 01:42, Ingo Molnar wrote:
>>>>
>>>> + /*
>>>> + * Check user-mode state on fast path return, the same check is done
>>>> + * under the slow path through syscall_return_slowpath.
>>>> + */
>>>> +#ifdef CONFIG_BUG_ON_DATA_CORRUPTION
>>>> + call verify_pre_usermode_state
>>>> +#else
>>>> + /*
>>>> + * Similar to set_fs(USER_DS) in verify_pre_usermode_state without a
>>>> + * warning.
>>>> + */
>>>> + movq PER_CPU_VAR(current_task), %rax
>>>> + movq $TASK_SIZE_MAX, %rcx
>>>> + cmp %rcx, TASK_addr_limit(%rax)
>>>> + jz 1f
>>>> + movq %rcx, TASK_addr_limit(%rax)
>>>> +1:
>>>> +#endif
>>>> +
>>
>> How about simply doing...
>>
>> movq PER_CPU_VAR(current_task), %rax
>> movq $TASK_SIZE_MAX, %rcx
>> #ifdef CONFIG_BUG_ON_DATA_CORRUPTION
>> cmpq %rcx, TASK_addr_limit(%rax)
>> jne syscall_return_slowpath
>> #else
>> movq %rcx, TASK_addr_limit(%rax)
>> #endif
>>
>> ... and let the slow path take care of BUG. This should be much faster,
>> even with the BUG, and is simpler to boot.
>>
>
> In fact, we could even to the cmpq/jne unconditionally. I'm guessing
> the occasional branch mispredict will be offset by occasionally touching
> a clean cacheline in the case of an unconditional store.
>
> Since this is something that should never happen, performance doesn't
> matter.
Ingo: Which approach do you favor? I want to keep the fast path as
fast as possible obviously.
>
> -hpa
>
>
--
Thomas
^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH v3 2/4] x86/syscalls: Specific usage of verify_pre_usermode_state
2017-03-14 15:17 ` [kernel-hardening] " Thomas Garnier
@ 2017-03-14 15:39 ` Andy Lutomirski
-1 siblings, 0 replies; 46+ messages in thread
From: Andy Lutomirski @ 2017-03-14 15:39 UTC (permalink / raw)
To: Thomas Garnier
Cc: H. Peter Anvin, Ingo Molnar, Martin Schwidefsky, Heiko Carstens,
David Howells, Arnd Bergmann, Al Viro, Dave Hansen,
René Nyffenegger, Andrew Morton, Kees Cook,
Paul E . McKenney, Andy Lutomirski, Ard Biesheuvel,
Nicolas Pitre, Petr Mladek, Sebastian Andrzej Siewior,
Sergey Senozhatsky, Helge Deller, Rik van Riel, John Stultz,
Thomas Gleixner
On Tue, Mar 14, 2017 at 8:17 AM, Thomas Garnier <thgarnie@google.com> wrote:
> On Tue, Mar 14, 2017 at 2:40 AM, H. Peter Anvin <hpa@zytor.com> wrote:
>> On 03/13/17 17:04, H. Peter Anvin wrote:
>>> On 03/11/17 01:42, Ingo Molnar wrote:
>>>>>
>>>>> + /*
>>>>> + * Check user-mode state on fast path return, the same check is done
>>>>> + * under the slow path through syscall_return_slowpath.
>>>>> + */
>>>>> +#ifdef CONFIG_BUG_ON_DATA_CORRUPTION
>>>>> + call verify_pre_usermode_state
>>>>> +#else
>>>>> + /*
>>>>> + * Similar to set_fs(USER_DS) in verify_pre_usermode_state without a
>>>>> + * warning.
>>>>> + */
>>>>> + movq PER_CPU_VAR(current_task), %rax
>>>>> + movq $TASK_SIZE_MAX, %rcx
>>>>> + cmp %rcx, TASK_addr_limit(%rax)
>>>>> + jz 1f
>>>>> + movq %rcx, TASK_addr_limit(%rax)
>>>>> +1:
>>>>> +#endif
>>>>> +
>>>
>>> How about simply doing...
>>>
>>> movq PER_CPU_VAR(current_task), %rax
>>> movq $TASK_SIZE_MAX, %rcx
>>> #ifdef CONFIG_BUG_ON_DATA_CORRUPTION
>>> cmpq %rcx, TASK_addr_limit(%rax)
>>> jne syscall_return_slowpath
>>> #else
>>> movq %rcx, TASK_addr_limit(%rax)
>>> #endif
>>>
>>> ... and let the slow path take care of BUG. This should be much faster,
>>> even with the BUG, and is simpler to boot.
>>>
>>
>> In fact, we could even to the cmpq/jne unconditionally. I'm guessing
>> the occasional branch mispredict will be offset by occasionally touching
>> a clean cacheline in the case of an unconditional store.
>>
>> Since this is something that should never happen, performance doesn't
>> matter.
>
> Ingo: Which approach do you favor? I want to keep the fast path as
> fast as possible obviously.
Even though my name isn't Ingo, Linus keeps trying to get me to be the
actual maintainer of this file. :) How about (sorry about whitespace
damage):
#ifdef CONFIG_BUG_ON_DATA_CORRUPTION
movq PER_CPU_VAR(current_task), %rax
bt $63, TASK_addr_limit(%rax)
jc syscall_return_slowpath
#endif
Now the kernel is totally unchanged if the config option is off and
it's fast and simple if the option is on.
^ permalink raw reply [flat|nested] 46+ messages in thread
* [kernel-hardening] Re: [PATCH v3 2/4] x86/syscalls: Specific usage of verify_pre_usermode_state
@ 2017-03-14 15:39 ` Andy Lutomirski
0 siblings, 0 replies; 46+ messages in thread
From: Andy Lutomirski @ 2017-03-14 15:39 UTC (permalink / raw)
To: Thomas Garnier
Cc: H. Peter Anvin, Ingo Molnar, Martin Schwidefsky, Heiko Carstens,
David Howells, Arnd Bergmann, Al Viro, Dave Hansen,
René Nyffenegger, Andrew Morton, Kees Cook,
Paul E . McKenney, Andy Lutomirski, Ard Biesheuvel,
Nicolas Pitre, Petr Mladek, Sebastian Andrzej Siewior,
Sergey Senozhatsky, Helge Deller, Rik van Riel, John Stultz,
Thomas Gleixner, Oleg Nesterov, Stephen Smalley,
Pavel Tikhomirov, Frederic Weisbecker, Stanislav Kinsburskiy,
Ingo Molnar, Paolo Bonzini, Dmitry Safonov, Borislav Petkov,
Josh Poimboeuf, Brian Gerst, Jan Beulich, Christian Borntraeger,
Fenghua Yu, He Chen, Russell King, Vladimir Murzin, Will Deacon,
Catalin Marinas, Mark Rutland, James Morse, David A . Long,
Pratyush Anand, Laura Abbott, Andre Przywara, Chris Metcalf,
linux-s390, LKML, Linux API, the arch/x86 maintainers,
linux-arm-kernel, Kernel Hardening
On Tue, Mar 14, 2017 at 8:17 AM, Thomas Garnier <thgarnie@google.com> wrote:
> On Tue, Mar 14, 2017 at 2:40 AM, H. Peter Anvin <hpa@zytor.com> wrote:
>> On 03/13/17 17:04, H. Peter Anvin wrote:
>>> On 03/11/17 01:42, Ingo Molnar wrote:
>>>>>
>>>>> + /*
>>>>> + * Check user-mode state on fast path return, the same check is done
>>>>> + * under the slow path through syscall_return_slowpath.
>>>>> + */
>>>>> +#ifdef CONFIG_BUG_ON_DATA_CORRUPTION
>>>>> + call verify_pre_usermode_state
>>>>> +#else
>>>>> + /*
>>>>> + * Similar to set_fs(USER_DS) in verify_pre_usermode_state without a
>>>>> + * warning.
>>>>> + */
>>>>> + movq PER_CPU_VAR(current_task), %rax
>>>>> + movq $TASK_SIZE_MAX, %rcx
>>>>> + cmp %rcx, TASK_addr_limit(%rax)
>>>>> + jz 1f
>>>>> + movq %rcx, TASK_addr_limit(%rax)
>>>>> +1:
>>>>> +#endif
>>>>> +
>>>
>>> How about simply doing...
>>>
>>> movq PER_CPU_VAR(current_task), %rax
>>> movq $TASK_SIZE_MAX, %rcx
>>> #ifdef CONFIG_BUG_ON_DATA_CORRUPTION
>>> cmpq %rcx, TASK_addr_limit(%rax)
>>> jne syscall_return_slowpath
>>> #else
>>> movq %rcx, TASK_addr_limit(%rax)
>>> #endif
>>>
>>> ... and let the slow path take care of BUG. This should be much faster,
>>> even with the BUG, and is simpler to boot.
>>>
>>
>> In fact, we could even to the cmpq/jne unconditionally. I'm guessing
>> the occasional branch mispredict will be offset by occasionally touching
>> a clean cacheline in the case of an unconditional store.
>>
>> Since this is something that should never happen, performance doesn't
>> matter.
>
> Ingo: Which approach do you favor? I want to keep the fast path as
> fast as possible obviously.
Even though my name isn't Ingo, Linus keeps trying to get me to be the
actual maintainer of this file. :) How about (sorry about whitespace
damage):
#ifdef CONFIG_BUG_ON_DATA_CORRUPTION
movq PER_CPU_VAR(current_task), %rax
bt $63, TASK_addr_limit(%rax)
jc syscall_return_slowpath
#endif
Now the kernel is totally unchanged if the config option is off and
it's fast and simple if the option is on.
^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH v3 2/4] x86/syscalls: Specific usage of verify_pre_usermode_state
2017-03-14 15:39 ` [kernel-hardening] " Andy Lutomirski
@ 2017-03-14 16:29 ` Thomas Garnier
-1 siblings, 0 replies; 46+ messages in thread
From: Thomas Garnier @ 2017-03-14 16:29 UTC (permalink / raw)
To: Andy Lutomirski
Cc: H. Peter Anvin, Ingo Molnar, Martin Schwidefsky, Heiko Carstens,
David Howells, Arnd Bergmann, Al Viro, Dave Hansen,
René Nyffenegger, Andrew Morton, Kees Cook,
Paul E . McKenney, Andy Lutomirski, Ard Biesheuvel,
Nicolas Pitre, Petr Mladek, Sebastian Andrzej Siewior,
Sergey Senozhatsky, Helge Deller, Rik van Riel, John Stultz,
Thomas Gleixner
On Tue, Mar 14, 2017 at 8:39 AM, Andy Lutomirski <luto@amacapital.net> wrote:
> Even though my name isn't Ingo, Linus keeps trying to get me to be the
> actual maintainer of this file. :) How about (sorry about whitespace
> damage):
:D
>
> #ifdef CONFIG_BUG_ON_DATA_CORRUPTION
> movq PER_CPU_VAR(current_task), %rax
> bt $63, TASK_addr_limit(%rax)
> jc syscall_return_slowpath
> #endif
>
> Now the kernel is totally unchanged if the config option is off and
> it's fast and simple if the option is on.
I like using bt for fast comparison.
We want to enforce the address limit by default, not only when
CONFIG_BUG_ON_DATA_CORRUPTION is enabled. I tested this one:
/* Check user-mode state on fast path return. */
movq PER_CPU_VAR(current_task), %rax
btq $63, TASK_addr_limit(%rax)
jnc 1f
#ifdef CONFIG_BUG_ON_DATA_CORRUPTION
call syscall_return_slowpath
jmp return_from_SYSCALL_64
#else
movq $TASK_SIZE_MAX, %rcx
movq %rcx, TASK_addr_limit(%rax)
#endif
1:
I saw that syscall_return_slowpath is supposed to be called not jumped
to. I could just call verify_pre_usermode_state that would be about
the same.
If we want to avoid if/def then I guess this one is the best I can think of:
/* Check user-mode state on fast path return. */
movq PER_CPU_VAR(current_task), %rax
btq $63, TASK_addr_limit(%rax)
jnc 1f
call verify_pre_usermode_state
1:
The check is fast and the call will happen only on corruption.
What do you think?
--
Thomas
^ permalink raw reply [flat|nested] 46+ messages in thread
* [kernel-hardening] Re: [PATCH v3 2/4] x86/syscalls: Specific usage of verify_pre_usermode_state
@ 2017-03-14 16:29 ` Thomas Garnier
0 siblings, 0 replies; 46+ messages in thread
From: Thomas Garnier @ 2017-03-14 16:29 UTC (permalink / raw)
To: Andy Lutomirski
Cc: H. Peter Anvin, Ingo Molnar, Martin Schwidefsky, Heiko Carstens,
David Howells, Arnd Bergmann, Al Viro, Dave Hansen,
René Nyffenegger, Andrew Morton, Kees Cook,
Paul E . McKenney, Andy Lutomirski, Ard Biesheuvel,
Nicolas Pitre, Petr Mladek, Sebastian Andrzej Siewior,
Sergey Senozhatsky, Helge Deller, Rik van Riel, John Stultz,
Thomas Gleixner, Oleg Nesterov, Stephen Smalley,
Pavel Tikhomirov, Frederic Weisbecker, Stanislav Kinsburskiy,
Ingo Molnar, Paolo Bonzini, Dmitry Safonov, Borislav Petkov,
Josh Poimboeuf, Brian Gerst, Jan Beulich, Christian Borntraeger,
Fenghua Yu, He Chen, Russell King, Vladimir Murzin, Will Deacon,
Catalin Marinas, Mark Rutland, James Morse, David A . Long,
Pratyush Anand, Laura Abbott, Andre Przywara, Chris Metcalf,
linux-s390, LKML, Linux API, the arch/x86 maintainers,
linux-arm-kernel, Kernel Hardening
On Tue, Mar 14, 2017 at 8:39 AM, Andy Lutomirski <luto@amacapital.net> wrote:
> Even though my name isn't Ingo, Linus keeps trying to get me to be the
> actual maintainer of this file. :) How about (sorry about whitespace
> damage):
:D
>
> #ifdef CONFIG_BUG_ON_DATA_CORRUPTION
> movq PER_CPU_VAR(current_task), %rax
> bt $63, TASK_addr_limit(%rax)
> jc syscall_return_slowpath
> #endif
>
> Now the kernel is totally unchanged if the config option is off and
> it's fast and simple if the option is on.
I like using bt for fast comparison.
We want to enforce the address limit by default, not only when
CONFIG_BUG_ON_DATA_CORRUPTION is enabled. I tested this one:
/* Check user-mode state on fast path return. */
movq PER_CPU_VAR(current_task), %rax
btq $63, TASK_addr_limit(%rax)
jnc 1f
#ifdef CONFIG_BUG_ON_DATA_CORRUPTION
call syscall_return_slowpath
jmp return_from_SYSCALL_64
#else
movq $TASK_SIZE_MAX, %rcx
movq %rcx, TASK_addr_limit(%rax)
#endif
1:
I saw that syscall_return_slowpath is supposed to be called not jumped
to. I could just call verify_pre_usermode_state that would be about
the same.
If we want to avoid if/def then I guess this one is the best I can think of:
/* Check user-mode state on fast path return. */
movq PER_CPU_VAR(current_task), %rax
btq $63, TASK_addr_limit(%rax)
jnc 1f
call verify_pre_usermode_state
1:
The check is fast and the call will happen only on corruption.
What do you think?
--
Thomas
^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH v3 2/4] x86/syscalls: Specific usage of verify_pre_usermode_state
2017-03-14 15:39 ` [kernel-hardening] " Andy Lutomirski
@ 2017-03-14 16:30 ` H. Peter Anvin
-1 siblings, 0 replies; 46+ messages in thread
From: H. Peter Anvin @ 2017-03-14 16:30 UTC (permalink / raw)
To: Andy Lutomirski, Thomas Garnier
Cc: Ingo Molnar, Martin Schwidefsky, Heiko Carstens, David Howells,
Arnd Bergmann, Al Viro, Dave Hansen, René Nyffenegger,
Andrew Morton, Kees Cook, Paul E . McKenney, Andy Lutomirski,
Ard Biesheuvel, Nicolas Pitre, Petr Mladek,
Sebastian Andrzej Siewior, Sergey Senozhatsky, Helge Deller,
Rik van Riel
On 03/14/17 08:39, Andy Lutomirski wrote:
>>
>> Ingo: Which approach do you favor? I want to keep the fast path as
>> fast as possible obviously.
>
> Even though my name isn't Ingo, Linus keeps trying to get me to be the
> actual maintainer of this file. :) How about (sorry about whitespace
> damage):
>
> #ifdef CONFIG_BUG_ON_DATA_CORRUPTION
> movq PER_CPU_VAR(current_task), %rax
> bt $63, TASK_addr_limit(%rax)
> jc syscall_return_slowpath
> #endif
>
> Now the kernel is totally unchanged if the config option is off and
> it's fast and simple if the option is on.
>
The idea as far as I understand was that the option was about whether or
not to clobber the broken value or BUG on it, not to remove the check.
My point, though, was that we can bail out to the slow path if there is
a discrepancy and worry about BUG or not there; performance doesn't
matter one iota if this triggers regardless of the remediation.
It isn't clear that using bt would be faster, though; although it saves
an instruction that instruction can be hoisted arbitrarily and so is
extremely likely to be hidden in the pipeline. cmp (which is really a
variant of sub) is one of the basic ALU instructions that are
super-optimized on every CPU, whereas bt is substantially slower on some
implementations.
This version is also "slightly less secure" since it would make it
possible to overwrite the guard page at the end of TASK_SIZE_MAX if one
could figure out a way to put an arbitrary value into this variable, but
I doubt that matters in any way.
-hpa
^ permalink raw reply [flat|nested] 46+ messages in thread
* [kernel-hardening] Re: [PATCH v3 2/4] x86/syscalls: Specific usage of verify_pre_usermode_state
@ 2017-03-14 16:30 ` H. Peter Anvin
0 siblings, 0 replies; 46+ messages in thread
From: H. Peter Anvin @ 2017-03-14 16:30 UTC (permalink / raw)
To: Andy Lutomirski, Thomas Garnier
Cc: Ingo Molnar, Martin Schwidefsky, Heiko Carstens, David Howells,
Arnd Bergmann, Al Viro, Dave Hansen, René Nyffenegger,
Andrew Morton, Kees Cook, Paul E . McKenney, Andy Lutomirski,
Ard Biesheuvel, Nicolas Pitre, Petr Mladek,
Sebastian Andrzej Siewior, Sergey Senozhatsky, Helge Deller,
Rik van Riel, John Stultz, Thomas Gleixner, Oleg Nesterov,
Stephen Smalley, Pavel Tikhomirov, Frederic Weisbecker,
Stanislav Kinsburskiy, Ingo Molnar, Paolo Bonzini,
Dmitry Safonov, Borislav Petkov, Josh Poimboeuf, Brian Gerst,
Jan Beulich, Christian Borntraeger, Fenghua Yu, He Chen,
Russell King, Vladimir Murzin, Will Deacon, Catalin Marinas,
Mark Rutland, James Morse, David A . Long, Pratyush Anand,
Laura Abbott, Andre Przywara, Chris Metcalf, linux-s390, LKML,
Linux API, the arch/x86 maintainers, linux-arm-kernel,
Kernel Hardening
On 03/14/17 08:39, Andy Lutomirski wrote:
>>
>> Ingo: Which approach do you favor? I want to keep the fast path as
>> fast as possible obviously.
>
> Even though my name isn't Ingo, Linus keeps trying to get me to be the
> actual maintainer of this file. :) How about (sorry about whitespace
> damage):
>
> #ifdef CONFIG_BUG_ON_DATA_CORRUPTION
> movq PER_CPU_VAR(current_task), %rax
> bt $63, TASK_addr_limit(%rax)
> jc syscall_return_slowpath
> #endif
>
> Now the kernel is totally unchanged if the config option is off and
> it's fast and simple if the option is on.
>
The idea as far as I understand was that the option was about whether or
not to clobber the broken value or BUG on it, not to remove the check.
My point, though, was that we can bail out to the slow path if there is
a discrepancy and worry about BUG or not there; performance doesn't
matter one iota if this triggers regardless of the remediation.
It isn't clear that using bt would be faster, though; although it saves
an instruction that instruction can be hoisted arbitrarily and so is
extremely likely to be hidden in the pipeline. cmp (which is really a
variant of sub) is one of the basic ALU instructions that are
super-optimized on every CPU, whereas bt is substantially slower on some
implementations.
This version is also "slightly less secure" since it would make it
possible to overwrite the guard page at the end of TASK_SIZE_MAX if one
could figure out a way to put an arbitrary value into this variable, but
I doubt that matters in any way.
-hpa
^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH v3 2/4] x86/syscalls: Specific usage of verify_pre_usermode_state
2017-03-14 16:29 ` [kernel-hardening] " Thomas Garnier
@ 2017-03-14 16:44 ` H. Peter Anvin
-1 siblings, 0 replies; 46+ messages in thread
From: H. Peter Anvin @ 2017-03-14 16:44 UTC (permalink / raw)
To: Thomas Garnier, Andy Lutomirski
Cc: Ingo Molnar, Martin Schwidefsky, Heiko Carstens, David Howells,
Arnd Bergmann, Al Viro, Dave Hansen, René Nyffenegger,
Andrew Morton, Kees Cook, Paul E . McKenney, Andy Lutomirski,
Ard Biesheuvel, Nicolas Pitre, Petr Mladek,
Sebastian Andrzej Siewior, Sergey Senozhatsky, Helge Deller,
Rik van Riel
On 03/14/17 09:29, Thomas Garnier wrote:
>
> We want to enforce the address limit by default, not only when
> CONFIG_BUG_ON_DATA_CORRUPTION is enabled. I tested this one:
>
> /* Check user-mode state on fast path return. */
> movq PER_CPU_VAR(current_task), %rax
> btq $63, TASK_addr_limit(%rax)
> jnc 1f
> #ifdef CONFIG_BUG_ON_DATA_CORRUPTION
> call syscall_return_slowpath
> jmp return_from_SYSCALL_64
> #else
> movq $TASK_SIZE_MAX, %rcx
> movq %rcx, TASK_addr_limit(%rax)
> #endif
> 1:
>
> I saw that syscall_return_slowpath is supposed to be called not jumped
> to. I could just call verify_pre_usermode_state that would be about
> the same.
I wanted to comment on that thing: why on earth isn't
verify_pre_usermode_state() an inline? Making it an out-of-line
function adds pointless extra overhead to the C code when we are talking
about a few instructions.
Second, you never do a branch-around to handle an exceptional condition
on the fast path: you jump *out of line* to handle the special
condition; a forward branch is preferred since it is slightly more
likely to be predicted not taken.
Now, I finally had a chance to actually look at the full file (I was
preoccupied yesterday), and am a bit disappointed, to say the least.
First of all, the jump target you need is only a handful of instructions
further down the code path; you need to do *exactly* what is done when
the test of _TIF_ALLWORK_MASK right above is taken! Not only that, but
you already have PER_CPU_VAR(current_task) in %r11 just ready to be
used! This was all in the three instructions immediately prior to the
code you modified...
So, all you'd need would be:
movq $TASK_SIZE_MAX, %rcx
cmpq %rcx, TASK_addr_limit(%r11)
jne 1f
We even get a short jump instruction!
(Using bt saves one more instruction, but see previous caveats about it.)
-hpa
^ permalink raw reply [flat|nested] 46+ messages in thread
* [kernel-hardening] Re: [PATCH v3 2/4] x86/syscalls: Specific usage of verify_pre_usermode_state
@ 2017-03-14 16:44 ` H. Peter Anvin
0 siblings, 0 replies; 46+ messages in thread
From: H. Peter Anvin @ 2017-03-14 16:44 UTC (permalink / raw)
To: Thomas Garnier, Andy Lutomirski
Cc: Ingo Molnar, Martin Schwidefsky, Heiko Carstens, David Howells,
Arnd Bergmann, Al Viro, Dave Hansen, René Nyffenegger,
Andrew Morton, Kees Cook, Paul E . McKenney, Andy Lutomirski,
Ard Biesheuvel, Nicolas Pitre, Petr Mladek,
Sebastian Andrzej Siewior, Sergey Senozhatsky, Helge Deller,
Rik van Riel, John Stultz, Thomas Gleixner, Oleg Nesterov,
Stephen Smalley, Pavel Tikhomirov, Frederic Weisbecker,
Stanislav Kinsburskiy, Ingo Molnar, Paolo Bonzini,
Dmitry Safonov, Borislav Petkov, Josh Poimboeuf, Brian Gerst,
Jan Beulich, Christian Borntraeger, Fenghua Yu, He Chen,
Russell King, Vladimir Murzin, Will Deacon, Catalin Marinas,
Mark Rutland, James Morse, David A . Long, Pratyush Anand,
Laura Abbott, Andre Przywara, Chris Metcalf, linux-s390, LKML,
Linux API, the arch/x86 maintainers, linux-arm-kernel,
Kernel Hardening
On 03/14/17 09:29, Thomas Garnier wrote:
>
> We want to enforce the address limit by default, not only when
> CONFIG_BUG_ON_DATA_CORRUPTION is enabled. I tested this one:
>
> /* Check user-mode state on fast path return. */
> movq PER_CPU_VAR(current_task), %rax
> btq $63, TASK_addr_limit(%rax)
> jnc 1f
> #ifdef CONFIG_BUG_ON_DATA_CORRUPTION
> call syscall_return_slowpath
> jmp return_from_SYSCALL_64
> #else
> movq $TASK_SIZE_MAX, %rcx
> movq %rcx, TASK_addr_limit(%rax)
> #endif
> 1:
>
> I saw that syscall_return_slowpath is supposed to be called not jumped
> to. I could just call verify_pre_usermode_state that would be about
> the same.
I wanted to comment on that thing: why on earth isn't
verify_pre_usermode_state() an inline? Making it an out-of-line
function adds pointless extra overhead to the C code when we are talking
about a few instructions.
Second, you never do a branch-around to handle an exceptional condition
on the fast path: you jump *out of line* to handle the special
condition; a forward branch is preferred since it is slightly more
likely to be predicted not taken.
Now, I finally had a chance to actually look at the full file (I was
preoccupied yesterday), and am a bit disappointed, to say the least.
First of all, the jump target you need is only a handful of instructions
further down the code path; you need to do *exactly* what is done when
the test of _TIF_ALLWORK_MASK right above is taken! Not only that, but
you already have PER_CPU_VAR(current_task) in %r11 just ready to be
used! This was all in the three instructions immediately prior to the
code you modified...
So, all you'd need would be:
movq $TASK_SIZE_MAX, %rcx
cmpq %rcx, TASK_addr_limit(%r11)
jne 1f
We even get a short jump instruction!
(Using bt saves one more instruction, but see previous caveats about it.)
-hpa
^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH v3 2/4] x86/syscalls: Specific usage of verify_pre_usermode_state
2017-03-14 16:44 ` [kernel-hardening] " H. Peter Anvin
@ 2017-03-14 16:51 ` Thomas Garnier
-1 siblings, 0 replies; 46+ messages in thread
From: Thomas Garnier @ 2017-03-14 16:51 UTC (permalink / raw)
To: H. Peter Anvin
Cc: Andy Lutomirski, Ingo Molnar, Martin Schwidefsky, Heiko Carstens,
David Howells, Arnd Bergmann, Al Viro, Dave Hansen,
René Nyffenegger, Andrew Morton, Kees Cook,
Paul E . McKenney, Andy Lutomirski, Ard Biesheuvel,
Nicolas Pitre, Petr Mladek, Sebastian Andrzej Siewior,
Sergey Senozhatsky, Helge Deller, Rik van Riel, John Stultz,
Thomas Gleixner
On Tue, Mar 14, 2017 at 9:44 AM, H. Peter Anvin <hpa@zytor.com> wrote:
> On 03/14/17 09:29, Thomas Garnier wrote:
>>
>> We want to enforce the address limit by default, not only when
>> CONFIG_BUG_ON_DATA_CORRUPTION is enabled. I tested this one:
>>
>> /* Check user-mode state on fast path return. */
>> movq PER_CPU_VAR(current_task), %rax
>> btq $63, TASK_addr_limit(%rax)
>> jnc 1f
>> #ifdef CONFIG_BUG_ON_DATA_CORRUPTION
>> call syscall_return_slowpath
>> jmp return_from_SYSCALL_64
>> #else
>> movq $TASK_SIZE_MAX, %rcx
>> movq %rcx, TASK_addr_limit(%rax)
>> #endif
>> 1:
>>
>> I saw that syscall_return_slowpath is supposed to be called not jumped
>> to. I could just call verify_pre_usermode_state that would be about
>> the same.
>
> I wanted to comment on that thing: why on earth isn't
> verify_pre_usermode_state() an inline? Making it an out-of-line
> function adds pointless extra overhead to the C code when we are talking
> about a few instructions.
Because outside of the arch-specific implementation it is called by each
syscall handler. It will increase the code size a lot.
>
> Second, you never do a branch-around to handle an exceptional condition
> on the fast path: you jump *out of line* to handle the special
> condition; a forward branch is preferred since it is slightly more
> likely to be predicted not taken.
>
> Now, I finally had a chance to actually look at the full file (I was
> preoccupied yesterday), and am a bit disappointed, to say the least.
>
> First of all, the jump target you need is only a handful of instructions
> further down the code path; you need to do *exactly* that is done when
> the test of _TIF_ALLWORK_MASK right above is tested! Not only that, but
> you already have PER_CPU_VAR(current_task) in %r11 just ready to be
> used! This was all in the three instructions immediately prior to the
> code you modified...
Correct.
>
> So, all you'd need would be:
>
> movq $TASK_SIZE_MAX, %rcx
> cmpq %rcx, TASK_addr_limit(%r11)
> jne 1f
>
> We even get a short jump instruction!
>
> (Using bt saves one more instruction, but see previous caveats about it.)
Okay that seems fair to me. Andy what do you think? (Given you
suggested the bt). Ingo?
Thanks for the feedback.
>
> -hpa
>
>
>
>
--
Thomas
^ permalink raw reply [flat|nested] 46+ messages in thread
* [kernel-hardening] Re: [PATCH v3 2/4] x86/syscalls: Specific usage of verify_pre_usermode_state
@ 2017-03-14 16:51 ` Thomas Garnier
0 siblings, 0 replies; 46+ messages in thread
From: Thomas Garnier @ 2017-03-14 16:51 UTC (permalink / raw)
To: H. Peter Anvin
Cc: Andy Lutomirski, Ingo Molnar, Martin Schwidefsky, Heiko Carstens,
David Howells, Arnd Bergmann, Al Viro, Dave Hansen,
René Nyffenegger, Andrew Morton, Kees Cook,
Paul E . McKenney, Andy Lutomirski, Ard Biesheuvel,
Nicolas Pitre, Petr Mladek, Sebastian Andrzej Siewior,
Sergey Senozhatsky, Helge Deller, Rik van Riel, John Stultz,
Thomas Gleixner, Oleg Nesterov, Stephen Smalley,
Pavel Tikhomirov, Frederic Weisbecker, Stanislav Kinsburskiy,
Ingo Molnar, Paolo Bonzini, Dmitry Safonov, Borislav Petkov,
Josh Poimboeuf, Brian Gerst, Jan Beulich, Christian Borntraeger,
Fenghua Yu, He Chen, Russell King, Vladimir Murzin, Will Deacon,
Catalin Marinas, Mark Rutland, James Morse, David A . Long,
Pratyush Anand, Laura Abbott, Andre Przywara, Chris Metcalf,
linux-s390, LKML, Linux API, the arch/x86 maintainers,
linux-arm-kernel, Kernel Hardening
On Tue, Mar 14, 2017 at 9:44 AM, H. Peter Anvin <hpa@zytor.com> wrote:
> On 03/14/17 09:29, Thomas Garnier wrote:
>>
>> We want to enforce the address limit by default, not only when
>> CONFIG_BUG_ON_DATA_CORRUPTION is enabled. I tested this one:
>>
>> /* Check user-mode state on fast path return. */
>> movq PER_CPU_VAR(current_task), %rax
>> btq $63, TASK_addr_limit(%rax)
>> jnc 1f
>> #ifdef CONFIG_BUG_ON_DATA_CORRUPTION
>> call syscall_return_slowpath
>> jmp return_from_SYSCALL_64
>> #else
>> movq $TASK_SIZE_MAX, %rcx
>> movq %rcx, TASK_addr_limit(%rax)
>> #endif
>> 1:
>>
>> I saw that syscall_return_slowpath is supposed to be called not jumped
>> to. I could just call verify_pre_usermode_state that would be about
>> the same.
>
> I wanted to comment on that thing: why on earth isn't
> verify_pre_usermode_state() an inline? Making it an out-of-line
> function adds pointless extra overhead to the C code when we are talking
> about a few instructions.
Because outside of the arch-specific implementation it is called by each
syscall handler. It will increase the code size a lot.
>
> Second, you never do a branch-around to handle an exceptional condition
> on the fast path: you jump *out of line* to handle the special
> condition; a forward branch is preferred since it is slightly more
> likely to be predicted not taken.
>
> Now, I finally had a chance to actually look at the full file (I was
> preoccupied yesterday), and am a bit disappointed, to say the least.
>
> First of all, the jump target you need is only a handful of instructions
> further down the code path; you need to do *exactly* that is done when
> the test of _TIF_ALLWORK_MASK right above is tested! Not only that, but
> you already have PER_CPU_VAR(current_task) in %r11 just ready to be
> used! This was all in the three instructions immediately prior to the
> code you modified...
Correct.
>
> So, all you'd need would be:
>
> movq $TASK_SIZE_MAX, %rcx
> cmpq %rcx, TASK_addr_limit(%r11)
> jne 1f
>
> We even get a short jump instruction!
>
> (Using bt saves one more instruction, but see previous caveats about it.)
Okay that seems fair to me. Andy what do you think? (Given you
suggested the bt). Ingo?
Thanks for the feedback.
>
> -hpa
>
>
>
>
--
Thomas
^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH v3 2/4] x86/syscalls: Specific usage of verify_pre_usermode_state
2017-03-14 16:51 ` [kernel-hardening] " Thomas Garnier
@ 2017-03-14 17:53 ` H. Peter Anvin
-1 siblings, 0 replies; 46+ messages in thread
From: H. Peter Anvin @ 2017-03-14 17:53 UTC (permalink / raw)
To: Thomas Garnier
Cc: Andy Lutomirski, Ingo Molnar, Martin Schwidefsky, Heiko Carstens,
David Howells, Arnd Bergmann, Al Viro, Dave Hansen,
René Nyffenegger, Andrew Morton, Kees Cook,
Paul E . McKenney, Andy Lutomirski, Ard Biesheuvel,
Nicolas Pitre, Petr Mladek, Sebastian Andrzej Siewior,
Sergey Senozhatsky, Helge Deller
On 03/14/17 09:51, Thomas Garnier wrote:
>>
>> I wanted to comment on that thing: why on earth isn't
>> verify_pre_usermode_state() an inline? Making it an out-of-line
>> function adds pointless extra overhead to the C code when we are talking
>> about a few instructions.
>
> Because outside of arch specific implementation it is called by each
> syscall handler. it will increase the code size a lot.
>
Don't assume that. On a lot of architectures a function call can be
more expensive than a simple compare and branch, because the compiler
has to assume a whole bunch of registers are lost at that point.
Either way, don't penalize the common architectures for it. Not okay.
-hpa
^ permalink raw reply [flat|nested] 46+ messages in thread
* [kernel-hardening] Re: [PATCH v3 2/4] x86/syscalls: Specific usage of verify_pre_usermode_state
@ 2017-03-14 17:53 ` H. Peter Anvin
0 siblings, 0 replies; 46+ messages in thread
From: H. Peter Anvin @ 2017-03-14 17:53 UTC (permalink / raw)
To: Thomas Garnier
Cc: Andy Lutomirski, Ingo Molnar, Martin Schwidefsky, Heiko Carstens,
David Howells, Arnd Bergmann, Al Viro, Dave Hansen,
René Nyffenegger, Andrew Morton, Kees Cook,
Paul E . McKenney, Andy Lutomirski, Ard Biesheuvel,
Nicolas Pitre, Petr Mladek, Sebastian Andrzej Siewior,
Sergey Senozhatsky, Helge Deller, Rik van Riel, John Stultz,
Thomas Gleixner, Oleg Nesterov, Stephen Smalley,
Pavel Tikhomirov, Frederic Weisbecker, Stanislav Kinsburskiy,
Ingo Molnar, Paolo Bonzini, Dmitry Safonov, Borislav Petkov,
Josh Poimboeuf, Brian Gerst, Jan Beulich, Christian Borntraeger,
Fenghua Yu, He Chen, Russell King, Vladimir Murzin, Will Deacon,
Catalin Marinas, Mark Rutland, James Morse, David A . Long,
Pratyush Anand, Laura Abbott, Andre Przywara, Chris Metcalf,
linux-s390, LKML, Linux API, the arch/x86 maintainers,
linux-arm-kernel, Kernel Hardening
On 03/14/17 09:51, Thomas Garnier wrote:
>>
>> I wanted to comment on that thing: why on earth isn't
>> verify_pre_usermode_state() an inline? Making it an out-of-line
>> function adds pointless extra overhead to the C code when we are talking
>> about a few instructions.
>
> Because outside of arch specific implementation it is called by each
> syscall handler. it will increase the code size a lot.
>
Don't assume that. On a lot of architectures a function call can be
more expensive than a simple compare and branch, because the compiler
has to assume a whole bunch of registers are lost at that point.
Either way, don't penalize the common architectures for it. Not okay.
-hpa
^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH v3 2/4] x86/syscalls: Specific usage of verify_pre_usermode_state
2017-03-14 17:53 ` [kernel-hardening] " H. Peter Anvin
@ 2017-03-15 17:43 ` Thomas Garnier
-1 siblings, 0 replies; 46+ messages in thread
From: Thomas Garnier @ 2017-03-15 17:43 UTC (permalink / raw)
To: H. Peter Anvin
Cc: Andy Lutomirski, Ingo Molnar, Martin Schwidefsky, Heiko Carstens,
David Howells, Arnd Bergmann, Al Viro, Dave Hansen,
René Nyffenegger, Andrew Morton, Kees Cook,
Paul E . McKenney, Andy Lutomirski, Ard Biesheuvel,
Nicolas Pitre, Petr Mladek, Sebastian Andrzej Siewior,
Sergey Senozhatsky, Helge Deller, Rik van Riel, John Stultz,
Thomas Gleixner
Thanks for the feedback. I will look into inlining by default (looking
at code size on different arch), the updated patch for x86 in the
meantime:
===========
Implement specific usage of verify_pre_usermode_state for user-mode
returns for x86.
---
Based on next-20170308
---
arch/x86/Kconfig | 1 +
arch/x86/entry/common.c | 3 +++
arch/x86/entry/entry_64.S | 8 ++++++++
arch/x86/include/asm/pgtable_64_types.h | 11 +++++++++++
arch/x86/include/asm/processor.h | 11 -----------
5 files changed, 23 insertions(+), 11 deletions(-)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 005df7c825f5..6d48e18e6f09 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -63,6 +63,7 @@ config X86
select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
select ARCH_MIGHT_HAVE_PC_PARPORT
select ARCH_MIGHT_HAVE_PC_SERIO
+ select ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
select ARCH_SUPPORTS_ATOMIC_RMW
select ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
select ARCH_SUPPORTS_NUMA_BALANCING if X86_64
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 370c42c7f046..525edbb77f03 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -22,6 +22,7 @@
#include <linux/context_tracking.h>
#include <linux/user-return-notifier.h>
#include <linux/uprobes.h>
+#include <linux/syscalls.h>
#include <asm/desc.h>
#include <asm/traps.h>
@@ -180,6 +181,8 @@ __visible inline void
prepare_exit_to_usermode(struct pt_regs *regs)
struct thread_info *ti = current_thread_info();
u32 cached_flags;
+ verify_pre_usermode_state();
+
if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled()))
local_irq_disable();
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index d2b2a2948ffe..c079b010205c 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -218,6 +218,14 @@ entry_SYSCALL_64_fastpath:
testl $_TIF_ALLWORK_MASK, TASK_TI_flags(%r11)
jnz 1f
+ /*
+ * If address limit is not based on user-mode, jump to slow path for
+ * additional security checks.
+ */
+ movq $TASK_SIZE_MAX, %rcx
+ cmp %rcx, TASK_addr_limit(%r11)
+ jnz 1f
+
LOCKDEP_SYS_EXIT
TRACE_IRQS_ON /* user mode is traced as IRQs on */
movq RIP(%rsp), %rcx
diff --git a/arch/x86/include/asm/pgtable_64_types.h
b/arch/x86/include/asm/pgtable_64_types.h
index 3a264200c62f..0fbbb79d058c 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -76,4 +76,15 @@ typedef struct { pteval_t pte; } pte_t;
#define EARLY_DYNAMIC_PAGE_TABLES 64
+/*
+ * User space process size. 47bits minus one guard page. The guard
+ * page is necessary on Intel CPUs: if a SYSCALL instruction is at
+ * the highest possible canonical userspace address, then that
+ * syscall will enter the kernel with a non-canonical return
+ * address, and SYSRET will explode dangerously. We avoid this
+ * particular problem by preventing anything from being mapped
+ * at the maximum canonical address.
+ */
+#define TASK_SIZE_MAX ((_AC(1, UL) << 47) - PAGE_SIZE)
+
#endif /* _ASM_X86_PGTABLE_64_DEFS_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index f385eca5407a..9bc99d37133e 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -829,17 +829,6 @@ static inline void spin_lock_prefetch(const void *x)
#define KSTK_ESP(task) (task_pt_regs(task)->sp)
#else
-/*
- * User space process size. 47bits minus one guard page. The guard
- * page is necessary on Intel CPUs: if a SYSCALL instruction is at
- * the highest possible canonical userspace address, then that
- * syscall will enter the kernel with a non-canonical return
- * address, and SYSRET will explode dangerously. We avoid this
- * particular problem by preventing anything from being mapped
- * at the maximum canonical address.
- */
-#define TASK_SIZE_MAX ((1UL << 47) - PAGE_SIZE)
-
/* This decides where the kernel will search for a free chunk of vm
* space during mmap's.
*/
--
2.12.0.367.g23dc2f6d3c-goog
On Tue, Mar 14, 2017 at 10:53 AM, H. Peter Anvin <hpa@zytor.com> wrote:
> On 03/14/17 09:51, Thomas Garnier wrote:
>>>
>>> I wanted to comment on that thing: why on earth isn't
>>> verify_pre_usermode_state() an inline? Making it an out-of-line
>>> function adds pointless extra overhead to the C code when we are talking
>>> about a few instructions.
>>
>> Because outside of arch specific implementation it is called by each
>> syscall handler. it will increase the code size a lot.
>>
>
> Don't assume that. On a lot of architectures a function call can be
> more expensive than a simple compare and branch, because the compiler
> has to assume a whole bunch of registers are lost at that point.
>
> Either way, don't penalize the common architectures for it. Not okay.
>
> -hpa
>
--
Thomas
^ permalink raw reply related [flat|nested] 46+ messages in thread
* [kernel-hardening] Re: [PATCH v3 2/4] x86/syscalls: Specific usage of verify_pre_usermode_state
@ 2017-03-15 17:43 ` Thomas Garnier
0 siblings, 0 replies; 46+ messages in thread
From: Thomas Garnier @ 2017-03-15 17:43 UTC (permalink / raw)
To: H. Peter Anvin
Cc: Andy Lutomirski, Ingo Molnar, Martin Schwidefsky, Heiko Carstens,
David Howells, Arnd Bergmann, Al Viro, Dave Hansen,
René Nyffenegger, Andrew Morton, Kees Cook,
Paul E . McKenney, Andy Lutomirski, Ard Biesheuvel,
Nicolas Pitre, Petr Mladek, Sebastian Andrzej Siewior,
Sergey Senozhatsky, Helge Deller, Rik van Riel, John Stultz,
Thomas Gleixner, Oleg Nesterov, Stephen Smalley,
Pavel Tikhomirov, Frederic Weisbecker, Stanislav Kinsburskiy,
Ingo Molnar, Paolo Bonzini, Dmitry Safonov, Borislav Petkov,
Josh Poimboeuf, Brian Gerst, Jan Beulich, Christian Borntraeger,
Fenghua Yu, He Chen, Russell King, Vladimir Murzin, Will Deacon,
Catalin Marinas, Mark Rutland, James Morse, David A . Long,
Pratyush Anand, Laura Abbott, Andre Przywara, Chris Metcalf,
linux-s390, LKML, Linux API, the arch/x86 maintainers,
linux-arm-kernel, Kernel Hardening
Thanks for the feedback. I will look into inlining by default (looking
at code size on different arch), the updated patch for x86 in the
meantime:
===========
Implement specific usage of verify_pre_usermode_state for user-mode
returns for x86.
---
Based on next-20170308
---
arch/x86/Kconfig | 1 +
arch/x86/entry/common.c | 3 +++
arch/x86/entry/entry_64.S | 8 ++++++++
arch/x86/include/asm/pgtable_64_types.h | 11 +++++++++++
arch/x86/include/asm/processor.h | 11 -----------
5 files changed, 23 insertions(+), 11 deletions(-)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 005df7c825f5..6d48e18e6f09 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -63,6 +63,7 @@ config X86
select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
select ARCH_MIGHT_HAVE_PC_PARPORT
select ARCH_MIGHT_HAVE_PC_SERIO
+ select ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
select ARCH_SUPPORTS_ATOMIC_RMW
select ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
select ARCH_SUPPORTS_NUMA_BALANCING if X86_64
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 370c42c7f046..525edbb77f03 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -22,6 +22,7 @@
#include <linux/context_tracking.h>
#include <linux/user-return-notifier.h>
#include <linux/uprobes.h>
+#include <linux/syscalls.h>
#include <asm/desc.h>
#include <asm/traps.h>
@@ -180,6 +181,8 @@ __visible inline void
prepare_exit_to_usermode(struct pt_regs *regs)
struct thread_info *ti = current_thread_info();
u32 cached_flags;
+ verify_pre_usermode_state();
+
if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled()))
local_irq_disable();
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index d2b2a2948ffe..c079b010205c 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -218,6 +218,14 @@ entry_SYSCALL_64_fastpath:
testl $_TIF_ALLWORK_MASK, TASK_TI_flags(%r11)
jnz 1f
+ /*
+ * If address limit is not based on user-mode, jump to slow path for
+ * additional security checks.
+ */
+ movq $TASK_SIZE_MAX, %rcx
+ cmp %rcx, TASK_addr_limit(%r11)
+ jnz 1f
+
LOCKDEP_SYS_EXIT
TRACE_IRQS_ON /* user mode is traced as IRQs on */
movq RIP(%rsp), %rcx
diff --git a/arch/x86/include/asm/pgtable_64_types.h
b/arch/x86/include/asm/pgtable_64_types.h
index 3a264200c62f..0fbbb79d058c 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -76,4 +76,15 @@ typedef struct { pteval_t pte; } pte_t;
#define EARLY_DYNAMIC_PAGE_TABLES 64
+/*
+ * User space process size. 47bits minus one guard page. The guard
+ * page is necessary on Intel CPUs: if a SYSCALL instruction is at
+ * the highest possible canonical userspace address, then that
+ * syscall will enter the kernel with a non-canonical return
+ * address, and SYSRET will explode dangerously. We avoid this
+ * particular problem by preventing anything from being mapped
+ * at the maximum canonical address.
+ */
+#define TASK_SIZE_MAX ((_AC(1, UL) << 47) - PAGE_SIZE)
+
#endif /* _ASM_X86_PGTABLE_64_DEFS_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index f385eca5407a..9bc99d37133e 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -829,17 +829,6 @@ static inline void spin_lock_prefetch(const void *x)
#define KSTK_ESP(task) (task_pt_regs(task)->sp)
#else
-/*
- * User space process size. 47bits minus one guard page. The guard
- * page is necessary on Intel CPUs: if a SYSCALL instruction is at
- * the highest possible canonical userspace address, then that
- * syscall will enter the kernel with a non-canonical return
- * address, and SYSRET will explode dangerously. We avoid this
- * particular problem by preventing anything from being mapped
- * at the maximum canonical address.
- */
-#define TASK_SIZE_MAX ((1UL << 47) - PAGE_SIZE)
-
/* This decides where the kernel will search for a free chunk of vm
* space during mmap's.
*/
--
2.12.0.367.g23dc2f6d3c-goog
On Tue, Mar 14, 2017 at 10:53 AM, H. Peter Anvin <hpa@zytor.com> wrote:
> On 03/14/17 09:51, Thomas Garnier wrote:
>>>
>>> I wanted to comment on that thing: why on earth isn't
>>> verify_pre_usermode_state() an inline? Making it an out-of-line
>>> function adds pointless extra overhead to the C code when we are talking
>>> about a few instructions.
>>
>> Because outside of arch specific implementation it is called by each
>> syscall handler. it will increase the code size a lot.
>>
>
> Don't assume that. On a lot of architectures a function call can be
> more expensive than a simple compare and branch, because the compiler
> has to assume a whole bunch of registers are lost at that point.
>
> Either way, don't penalize the common architectures for it. Not okay.
>
> -hpa
>
--
Thomas
^ permalink raw reply related [flat|nested] 46+ messages in thread
* Re: [PATCH v3 2/4] x86/syscalls: Specific usage of verify_pre_usermode_state
2017-03-15 17:43 ` [kernel-hardening] " Thomas Garnier
@ 2017-03-22 19:15 ` Thomas Garnier
-1 siblings, 0 replies; 46+ messages in thread
From: Thomas Garnier @ 2017-03-22 19:15 UTC (permalink / raw)
To: H. Peter Anvin
Cc: Andy Lutomirski, Ingo Molnar, Martin Schwidefsky, Heiko Carstens,
David Howells, Arnd Bergmann, Al Viro, Dave Hansen,
René Nyffenegger, Andrew Morton, Kees Cook,
Paul E . McKenney, Andy Lutomirski, Ard Biesheuvel,
Nicolas Pitre, Petr Mladek, Sebastian Andrzej Siewior,
Sergey Senozhatsky, Helge Deller, Rik van Riel, John Stultz,
Thomas Gleixner
On Wed, Mar 15, 2017 at 10:43 AM, Thomas Garnier <thgarnie@google.com> wrote:
> Thanks for the feedback. I will look into inlining by default (looking
> at code size on different arch), the updated patch for x86 in the
> meantime:
I did a couple of checks and it doesn't seem worth it. I will send a v4
with the change below for additional feedback.
> ===========
>
> Implement specific usage of verify_pre_usermode_state for user-mode
> returns for x86.
> ---
> Based on next-20170308
> ---
> arch/x86/Kconfig | 1 +
> arch/x86/entry/common.c | 3 +++
> arch/x86/entry/entry_64.S | 8 ++++++++
> arch/x86/include/asm/pgtable_64_types.h | 11 +++++++++++
> arch/x86/include/asm/processor.h | 11 -----------
> 5 files changed, 23 insertions(+), 11 deletions(-)
>
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index 005df7c825f5..6d48e18e6f09 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -63,6 +63,7 @@ config X86
> select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
> select ARCH_MIGHT_HAVE_PC_PARPORT
> select ARCH_MIGHT_HAVE_PC_SERIO
> + select ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
> select ARCH_SUPPORTS_ATOMIC_RMW
> select ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
> select ARCH_SUPPORTS_NUMA_BALANCING if X86_64
> diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
> index 370c42c7f046..525edbb77f03 100644
> --- a/arch/x86/entry/common.c
> +++ b/arch/x86/entry/common.c
> @@ -22,6 +22,7 @@
> #include <linux/context_tracking.h>
> #include <linux/user-return-notifier.h>
> #include <linux/uprobes.h>
> +#include <linux/syscalls.h>
>
> #include <asm/desc.h>
> #include <asm/traps.h>
> @@ -180,6 +181,8 @@ __visible inline void
> prepare_exit_to_usermode(struct pt_regs *regs)
> struct thread_info *ti = current_thread_info();
> u32 cached_flags;
>
> + verify_pre_usermode_state();
> +
> if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled()))
> local_irq_disable();
>
> diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> index d2b2a2948ffe..c079b010205c 100644
> --- a/arch/x86/entry/entry_64.S
> +++ b/arch/x86/entry/entry_64.S
> @@ -218,6 +218,14 @@ entry_SYSCALL_64_fastpath:
> testl $_TIF_ALLWORK_MASK, TASK_TI_flags(%r11)
> jnz 1f
>
> + /*
> + * If address limit is not based on user-mode, jump to slow path for
> + * additional security checks.
> + */
> + movq $TASK_SIZE_MAX, %rcx
> + cmp %rcx, TASK_addr_limit(%r11)
> + jnz 1f
> +
> LOCKDEP_SYS_EXIT
> TRACE_IRQS_ON /* user mode is traced as IRQs on */
> movq RIP(%rsp), %rcx
> diff --git a/arch/x86/include/asm/pgtable_64_types.h
> b/arch/x86/include/asm/pgtable_64_types.h
> index 3a264200c62f..0fbbb79d058c 100644
> --- a/arch/x86/include/asm/pgtable_64_types.h
> +++ b/arch/x86/include/asm/pgtable_64_types.h
> @@ -76,4 +76,15 @@ typedef struct { pteval_t pte; } pte_t;
>
> #define EARLY_DYNAMIC_PAGE_TABLES 64
>
> +/*
> + * User space process size. 47bits minus one guard page. The guard
> + * page is necessary on Intel CPUs: if a SYSCALL instruction is at
> + * the highest possible canonical userspace address, then that
> + * syscall will enter the kernel with a non-canonical return
> + * address, and SYSRET will explode dangerously. We avoid this
> + * particular problem by preventing anything from being mapped
> + * at the maximum canonical address.
> + */
> +#define TASK_SIZE_MAX ((_AC(1, UL) << 47) - PAGE_SIZE)
> +
> #endif /* _ASM_X86_PGTABLE_64_DEFS_H */
> diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
> index f385eca5407a..9bc99d37133e 100644
> --- a/arch/x86/include/asm/processor.h
> +++ b/arch/x86/include/asm/processor.h
> @@ -829,17 +829,6 @@ static inline void spin_lock_prefetch(const void *x)
> #define KSTK_ESP(task) (task_pt_regs(task)->sp)
>
> #else
> -/*
> - * User space process size. 47bits minus one guard page. The guard
> - * page is necessary on Intel CPUs: if a SYSCALL instruction is at
> - * the highest possible canonical userspace address, then that
> - * syscall will enter the kernel with a non-canonical return
> - * address, and SYSRET will explode dangerously. We avoid this
> - * particular problem by preventing anything from being mapped
> - * at the maximum canonical address.
> - */
> -#define TASK_SIZE_MAX ((1UL << 47) - PAGE_SIZE)
> -
> /* This decides where the kernel will search for a free chunk of vm
> * space during mmap's.
> */
> --
> 2.12.0.367.g23dc2f6d3c-goog
>
>
> On Tue, Mar 14, 2017 at 10:53 AM, H. Peter Anvin <hpa@zytor.com> wrote:
>> On 03/14/17 09:51, Thomas Garnier wrote:
>>>>
>>>> I wanted to comment on that thing: why on earth isn't
>>>> verify_pre_usermode_state() an inline? Making it an out-of-line
>>>> function adds pointless extra overhead to the C code when we are talking
>>>> about a few instructions.
>>>
>>> Because outside of arch specific implementation it is called by each
>>> syscall handler. it will increase the code size a lot.
>>>
>>
>> Don't assume that. On a lot of architectures a function call can be
>> more expensive than a simple compare and branch, because the compiler
>> has to assume a whole bunch of registers are lost at that point.
>>
>> Either way, don't penalize the common architectures for it. Not okay.
>>
>> -hpa
>>
>
>
>
> --
> Thomas
--
Thomas
^ permalink raw reply [flat|nested] 46+ messages in thread
* [kernel-hardening] Re: [PATCH v3 2/4] x86/syscalls: Specific usage of verify_pre_usermode_state
@ 2017-03-22 19:15 ` Thomas Garnier
0 siblings, 0 replies; 46+ messages in thread
From: Thomas Garnier @ 2017-03-22 19:15 UTC (permalink / raw)
To: H. Peter Anvin
Cc: Andy Lutomirski, Ingo Molnar, Martin Schwidefsky, Heiko Carstens,
David Howells, Arnd Bergmann, Al Viro, Dave Hansen,
René Nyffenegger, Andrew Morton, Kees Cook,
Paul E . McKenney, Andy Lutomirski, Ard Biesheuvel,
Nicolas Pitre, Petr Mladek, Sebastian Andrzej Siewior,
Sergey Senozhatsky, Helge Deller, Rik van Riel, John Stultz,
Thomas Gleixner, Oleg Nesterov, Stephen Smalley,
Pavel Tikhomirov, Frederic Weisbecker, Stanislav Kinsburskiy,
Ingo Molnar, Paolo Bonzini, Dmitry Safonov, Borislav Petkov,
Josh Poimboeuf, Brian Gerst, Jan Beulich, Christian Borntraeger,
Fenghua Yu, He Chen, Russell King, Vladimir Murzin, Will Deacon,
Catalin Marinas, Mark Rutland, James Morse, David A . Long,
Pratyush Anand, Laura Abbott, Andre Przywara, Chris Metcalf,
linux-s390, LKML, Linux API, the arch/x86 maintainers,
linux-arm-kernel, Kernel Hardening
On Wed, Mar 15, 2017 at 10:43 AM, Thomas Garnier <thgarnie@google.com> wrote:
> Thanks for the feedback. I will look into inlining by default (looking
> at code size on different arch), the updated patch for x86 in the
> meantime:
I did a couple of checks and it doesn't seem worth it. I will send a v4
with the change below for additional feedback.
> ===========
>
> Implement specific usage of verify_pre_usermode_state for user-mode
> returns for x86.
> ---
> Based on next-20170308
> ---
> arch/x86/Kconfig | 1 +
> arch/x86/entry/common.c | 3 +++
> arch/x86/entry/entry_64.S | 8 ++++++++
> arch/x86/include/asm/pgtable_64_types.h | 11 +++++++++++
> arch/x86/include/asm/processor.h | 11 -----------
> 5 files changed, 23 insertions(+), 11 deletions(-)
>
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index 005df7c825f5..6d48e18e6f09 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -63,6 +63,7 @@ config X86
> select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
> select ARCH_MIGHT_HAVE_PC_PARPORT
> select ARCH_MIGHT_HAVE_PC_SERIO
> + select ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE
> select ARCH_SUPPORTS_ATOMIC_RMW
> select ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
> select ARCH_SUPPORTS_NUMA_BALANCING if X86_64
> diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
> index 370c42c7f046..525edbb77f03 100644
> --- a/arch/x86/entry/common.c
> +++ b/arch/x86/entry/common.c
> @@ -22,6 +22,7 @@
> #include <linux/context_tracking.h>
> #include <linux/user-return-notifier.h>
> #include <linux/uprobes.h>
> +#include <linux/syscalls.h>
>
> #include <asm/desc.h>
> #include <asm/traps.h>
> @@ -180,6 +181,8 @@ __visible inline void
> prepare_exit_to_usermode(struct pt_regs *regs)
> struct thread_info *ti = current_thread_info();
> u32 cached_flags;
>
> + verify_pre_usermode_state();
> +
> if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled()))
> local_irq_disable();
>
> diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> index d2b2a2948ffe..c079b010205c 100644
> --- a/arch/x86/entry/entry_64.S
> +++ b/arch/x86/entry/entry_64.S
> @@ -218,6 +218,14 @@ entry_SYSCALL_64_fastpath:
> testl $_TIF_ALLWORK_MASK, TASK_TI_flags(%r11)
> jnz 1f
>
> + /*
> + * If address limit is not based on user-mode, jump to slow path for
> + * additional security checks.
> + */
> + movq $TASK_SIZE_MAX, %rcx
> + cmp %rcx, TASK_addr_limit(%r11)
> + jnz 1f
> +
> LOCKDEP_SYS_EXIT
> TRACE_IRQS_ON /* user mode is traced as IRQs on */
> movq RIP(%rsp), %rcx
> diff --git a/arch/x86/include/asm/pgtable_64_types.h
> b/arch/x86/include/asm/pgtable_64_types.h
> index 3a264200c62f..0fbbb79d058c 100644
> --- a/arch/x86/include/asm/pgtable_64_types.h
> +++ b/arch/x86/include/asm/pgtable_64_types.h
> @@ -76,4 +76,15 @@ typedef struct { pteval_t pte; } pte_t;
>
> #define EARLY_DYNAMIC_PAGE_TABLES 64
>
> +/*
> + * User space process size. 47bits minus one guard page. The guard
> + * page is necessary on Intel CPUs: if a SYSCALL instruction is at
> + * the highest possible canonical userspace address, then that
> + * syscall will enter the kernel with a non-canonical return
> + * address, and SYSRET will explode dangerously. We avoid this
> + * particular problem by preventing anything from being mapped
> + * at the maximum canonical address.
> + */
> +#define TASK_SIZE_MAX ((_AC(1, UL) << 47) - PAGE_SIZE)
> +
> #endif /* _ASM_X86_PGTABLE_64_DEFS_H */
> diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
> index f385eca5407a..9bc99d37133e 100644
> --- a/arch/x86/include/asm/processor.h
> +++ b/arch/x86/include/asm/processor.h
> @@ -829,17 +829,6 @@ static inline void spin_lock_prefetch(const void *x)
> #define KSTK_ESP(task) (task_pt_regs(task)->sp)
>
> #else
> -/*
> - * User space process size. 47bits minus one guard page. The guard
> - * page is necessary on Intel CPUs: if a SYSCALL instruction is at
> - * the highest possible canonical userspace address, then that
> - * syscall will enter the kernel with a non-canonical return
> - * address, and SYSRET will explode dangerously. We avoid this
> - * particular problem by preventing anything from being mapped
> - * at the maximum canonical address.
> - */
> -#define TASK_SIZE_MAX ((1UL << 47) - PAGE_SIZE)
> -
> /* This decides where the kernel will search for a free chunk of vm
> * space during mmap's.
> */
> --
> 2.12.0.367.g23dc2f6d3c-goog
>
>
> On Tue, Mar 14, 2017 at 10:53 AM, H. Peter Anvin <hpa@zytor.com> wrote:
>> On 03/14/17 09:51, Thomas Garnier wrote:
>>>>
>>>> I wanted to comment on that thing: why on earth isn't
>>>> verify_pre_usermode_state() an inline? Making it an out-of-line
>>>> function adds pointless extra overhead to the C code when we are talking
>>>> about a few instructions.
>>>
>>> Because outside of arch specific implementation it is called by each
>>> syscall handler. it will increase the code size a lot.
>>>
>>
>> Don't assume that. On a lot of architectures a function call can be
>> more expensive than a simple compare and branch, because the compiler
>> has to assume a whole bunch of registers are lost at that point.
>>
>> Either way, don't penalize the common architectures for it. Not okay.
>>
>> -hpa
>>
>
>
>
> --
> Thomas
--
Thomas
^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH v3 2/4] x86/syscalls: Specific usage of verify_pre_usermode_state
2017-03-22 19:15 ` [kernel-hardening] " Thomas Garnier
@ 2017-03-22 20:21 ` H. Peter Anvin
-1 siblings, 0 replies; 46+ messages in thread
From: H. Peter Anvin @ 2017-03-22 20:21 UTC (permalink / raw)
To: Thomas Garnier
Cc: Andy Lutomirski, Ingo Molnar, Martin Schwidefsky, Heiko Carstens,
David Howells, Arnd Bergmann, Al Viro, Dave Hansen,
René Nyffenegger, Andrew Morton, Kees Cook,
Paul E . McKenney, Andy Lutomirski, Ard Biesheuvel,
Nicolas Pitre, Petr Mladek, Sebastian Andrzej Siewior,
Sergey Senozhatsky, Helge Deller
On 03/22/17 12:15, Thomas Garnier wrote:
> On Wed, Mar 15, 2017 at 10:43 AM, Thomas Garnier <thgarnie@google.com> wrote:
>> Thanks for the feedback. I will look into inlining by default (looking
>> at code size on different arch), the updated patch for x86 in the
>> meantime:
>
> I did couple checks and it doesn't seem worth it. I will send a v4
> with the change below for additional feedback.
Can you specify what that means?
On x86, where there is only one caller of this, it really seems like it
ought to reduce the overhead to almost zero (since it most likely is
hidden in the pipeline.)
I would like to suggest defining it inline if
CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE is set; I really don't
care about an architecture which doesn't have it.
Note that the upcoming 5-level paging (LA57) support will make
TASK_SIZE_MAX dependent on the CPU. The best way to handle that is
probably to tag this immediate with an assembly alternative rather than
loading it from a memory variable (most likely taking a cache hit.)
-hpa
^ permalink raw reply [flat|nested] 46+ messages in thread
* [kernel-hardening] Re: [PATCH v3 2/4] x86/syscalls: Specific usage of verify_pre_usermode_state
@ 2017-03-22 20:21 ` H. Peter Anvin
0 siblings, 0 replies; 46+ messages in thread
From: H. Peter Anvin @ 2017-03-22 20:21 UTC (permalink / raw)
To: Thomas Garnier
Cc: Andy Lutomirski, Ingo Molnar, Martin Schwidefsky, Heiko Carstens,
David Howells, Arnd Bergmann, Al Viro, Dave Hansen,
René Nyffenegger, Andrew Morton, Kees Cook,
Paul E . McKenney, Andy Lutomirski, Ard Biesheuvel,
Nicolas Pitre, Petr Mladek, Sebastian Andrzej Siewior,
Sergey Senozhatsky, Helge Deller, Rik van Riel, John Stultz,
Thomas Gleixner, Oleg Nesterov, Stephen Smalley,
Pavel Tikhomirov, Frederic Weisbecker, Stanislav Kinsburskiy,
Ingo Molnar, Paolo Bonzini, Dmitry Safonov, Borislav Petkov,
Josh Poimboeuf, Brian Gerst, Jan Beulich, Christian Borntraeger,
Fenghua Yu, He Chen, Russell King, Vladimir Murzin, Will Deacon,
Catalin Marinas, Mark Rutland, James Morse, David A . Long,
Pratyush Anand, Laura Abbott, Andre Przywara, Chris Metcalf,
linux-s390, LKML, Linux API, the arch/x86 maintainers,
linux-arm-kernel, Kernel Hardening
On 03/22/17 12:15, Thomas Garnier wrote:
> On Wed, Mar 15, 2017 at 10:43 AM, Thomas Garnier <thgarnie@google.com> wrote:
>> Thanks for the feedback. I will look into inlining by default (looking
>> at code size on different arch), the updated patch for x86 in the
>> meantime:
>
> I did couple checks and it doesn't seem worth it. I will send a v4
> with the change below for additional feedback.
Can you specify what that means?
On x86, where there is only one caller of this, it really seems like it
ought to reduce the overhead to almost zero (since it most likely is
hidden in the pipeline.)
I would like to suggest defining it inline if
CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE is set; I really don't
care about an architecture which doesn't have it.
Note that the upcoming 5-level paging (LA57) support will make
TASK_SIZE_MAX dependent on the CPU. The best way to handle that is
probably to tag this immediate with an assembly alternative rather than
loading it from a memory variable (most likely taking a cache hit.)
-hpa
^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH v3 2/4] x86/syscalls: Specific usage of verify_pre_usermode_state
2017-03-22 20:21 ` [kernel-hardening] " H. Peter Anvin
@ 2017-03-22 20:41 ` Thomas Garnier
-1 siblings, 0 replies; 46+ messages in thread
From: Thomas Garnier @ 2017-03-22 20:41 UTC (permalink / raw)
To: H. Peter Anvin
Cc: Andy Lutomirski, Ingo Molnar, Martin Schwidefsky, Heiko Carstens,
David Howells, Arnd Bergmann, Al Viro, Dave Hansen,
René Nyffenegger, Andrew Morton, Kees Cook,
Paul E . McKenney, Andy Lutomirski, Ard Biesheuvel,
Nicolas Pitre, Petr Mladek, Sebastian Andrzej Siewior,
Sergey Senozhatsky, Helge Deller, Rik van Riel, John Stultz,
Thomas Gleixner
On Wed, Mar 22, 2017 at 1:21 PM, H. Peter Anvin <hpa@zytor.com> wrote:
> On 03/22/17 12:15, Thomas Garnier wrote:
>> On Wed, Mar 15, 2017 at 10:43 AM, Thomas Garnier <thgarnie@google.com> wrote:
>>> Thanks for the feedback. I will look into inlining by default (looking
>>> at code size on different arch), the updated patch for x86 in the
>>> meantime:
>>
>> I did couple checks and it doesn't seem worth it. I will send a v4
>> with the change below for additional feedback.
>
> Can you specify what that means?
If I set inline by default, the compiler chose not to inline it on
x86. If I force inline the size impact was actually bigger (without
the architecture specific code).
>
> On x86, where there is only one caller of this, it really seems like it
> ought to reduce the overhead to almost zero (since it most likely is
> hidden in the pipeline.)
>
> I would like to suggest defining it inline if
> CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE is set; I really don't
> care about an architecture which doesn't have it.
But if there is only one caller, isn't the compiler supposed to
inline the function based on optimization options?
The assembly will call it too, so I would need an inline and a
non-inline based on the caller.
>
> Note that the upcoming 5-level paging (LA57) support will make
> TASK_SIZE_MAX dependent on the CPU. The best way to handle that is
> probably to tag this immediate with an assembly alternative rather than
> loading it from a memory variable (most likely taking a cache hit.)
>
> -hpa
>
--
Thomas
^ permalink raw reply [flat|nested] 46+ messages in thread
* [kernel-hardening] Re: [PATCH v3 2/4] x86/syscalls: Specific usage of verify_pre_usermode_state
@ 2017-03-22 20:41 ` Thomas Garnier
0 siblings, 0 replies; 46+ messages in thread
From: Thomas Garnier @ 2017-03-22 20:41 UTC (permalink / raw)
To: H. Peter Anvin
Cc: Andy Lutomirski, Ingo Molnar, Martin Schwidefsky, Heiko Carstens,
David Howells, Arnd Bergmann, Al Viro, Dave Hansen,
René Nyffenegger, Andrew Morton, Kees Cook,
Paul E . McKenney, Andy Lutomirski, Ard Biesheuvel,
Nicolas Pitre, Petr Mladek, Sebastian Andrzej Siewior,
Sergey Senozhatsky, Helge Deller, Rik van Riel, John Stultz,
Thomas Gleixner, Oleg Nesterov, Stephen Smalley,
Pavel Tikhomirov, Frederic Weisbecker, Stanislav Kinsburskiy,
Ingo Molnar, Paolo Bonzini, Dmitry Safonov, Borislav Petkov,
Josh Poimboeuf, Brian Gerst, Jan Beulich, Christian Borntraeger,
Fenghua Yu, He Chen, Russell King, Vladimir Murzin, Will Deacon,
Catalin Marinas, Mark Rutland, James Morse, David A . Long,
Pratyush Anand, Laura Abbott, Andre Przywara, Chris Metcalf,
linux-s390, LKML, Linux API, the arch/x86 maintainers,
linux-arm-kernel, Kernel Hardening
On Wed, Mar 22, 2017 at 1:21 PM, H. Peter Anvin <hpa@zytor.com> wrote:
> On 03/22/17 12:15, Thomas Garnier wrote:
>> On Wed, Mar 15, 2017 at 10:43 AM, Thomas Garnier <thgarnie@google.com> wrote:
>>> Thanks for the feedback. I will look into inlining by default (looking
>>> at code size on different arch), the updated patch for x86 in the
>>> meantime:
>>
>> I did a couple of checks and it doesn't seem worth it. I will send a v4
>> with the change below for additional feedback.
>
> Can you specify what that means?
If I set inline by default, the compiler chose not to inline it on
x86. If I force inline the size impact was actually bigger (without
the architecture specific code).
>
> On x86, where there is only one caller of this, it really seems like it
> ought to reduce the overhead to almost zero (since it most likely is
> hidden in the pipeline.)
>
> I would like to suggest defining it inline if
> CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE is set; I really don't
> care about an architecture which doesn't have it.
But if there is only one caller, isn't the compiler supposed to
inline the function based on options?
The assembly will call it too, so I would need an inline and a
non-inline based on the caller.
>
> Note that the upcoming 5-level paging (LA57) support will make
> TASK_SIZE_MAX dependent on the CPU. The best way to handle that is
> probably to tag this immediate with an assembly alternative rather than
> loading it from a memory variable (most likely taking a cache hit.)
>
> -hpa
>
--
Thomas
^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH v3 2/4] x86/syscalls: Specific usage of verify_pre_usermode_state
2017-03-22 20:41 ` [kernel-hardening] " Thomas Garnier
@ 2017-03-22 20:49 ` H. Peter Anvin
-1 siblings, 0 replies; 46+ messages in thread
From: H. Peter Anvin @ 2017-03-22 20:49 UTC (permalink / raw)
To: Thomas Garnier
Cc: Andy Lutomirski, Ingo Molnar, Martin Schwidefsky, Heiko Carstens,
David Howells, Arnd Bergmann, Al Viro, Dave Hansen,
René Nyffenegger, Andrew Morton, Kees Cook,
Paul E . McKenney, Andy Lutomirski, Ard Biesheuvel,
Nicolas Pitre, Petr Mladek, Sebastian Andrzej Siewior,
Sergey Senozhatsky, Helge Deller
On 03/22/17 13:41, Thomas Garnier wrote:
>>> with the change below for additional feedback.
>>
>> Can you specify what that means?
>
> If I set inline by default, the compiler chose not to inline it on
> x86. If I force inline the size impact was actually bigger (without
> the architecture specific code).
>
That's utterly bizarre. Something strange is going on there. I suspect
the right thing to do is to out-of-line the error case only, but even
that seems strange. It should be something like four instructions inline.
>>
>> On x86, where there is only one caller of this, it really seems like it
>> ought to reduce the overhead to almost zero (since it most likely is
>> hidden in the pipeline.)
>>
>> I would like to suggest defining it inline if
>> CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE is set; I really don't
>> care about an architecture which doesn't have it.
>
> But if there is only one caller, isn't the compiler supposed to
> inline the function based on options?
If it is marked static in the same file, yes, but you have it in a
different file from what I can tell.
> The assembly will call it too, so I would need an inline and a
> non-inline based on the caller.
Where? I don't see that anywhere, at least for x86.
-hpa
^ permalink raw reply [flat|nested] 46+ messages in thread
* [kernel-hardening] Re: [PATCH v3 2/4] x86/syscalls: Specific usage of verify_pre_usermode_state
@ 2017-03-22 20:49 ` H. Peter Anvin
0 siblings, 0 replies; 46+ messages in thread
From: H. Peter Anvin @ 2017-03-22 20:49 UTC (permalink / raw)
To: Thomas Garnier
Cc: Andy Lutomirski, Ingo Molnar, Martin Schwidefsky, Heiko Carstens,
David Howells, Arnd Bergmann, Al Viro, Dave Hansen,
René Nyffenegger, Andrew Morton, Kees Cook,
Paul E . McKenney, Andy Lutomirski, Ard Biesheuvel,
Nicolas Pitre, Petr Mladek, Sebastian Andrzej Siewior,
Sergey Senozhatsky, Helge Deller, Rik van Riel, John Stultz,
Thomas Gleixner, Oleg Nesterov, Stephen Smalley,
Pavel Tikhomirov, Frederic Weisbecker, Stanislav Kinsburskiy,
Ingo Molnar, Paolo Bonzini, Dmitry Safonov, Borislav Petkov,
Josh Poimboeuf, Brian Gerst, Jan Beulich, Christian Borntraeger,
Fenghua Yu, He Chen, Russell King, Vladimir Murzin, Will Deacon,
Catalin Marinas, Mark Rutland, James Morse, David A . Long,
Pratyush Anand, Laura Abbott, Andre Przywara, Chris Metcalf,
linux-s390, LKML, Linux API, the arch/x86 maintainers,
linux-arm-kernel, Kernel Hardening
On 03/22/17 13:41, Thomas Garnier wrote:
>>> with the change below for additional feedback.
>>
>> Can you specify what that means?
>
> If I set inline by default, the compiler chose not to inline it on
> x86. If I force inline the size impact was actually bigger (without
> the architecture specific code).
>
That's utterly bizarre. Something strange is going on there. I suspect
the right thing to do is to out-of-line the error case only, but even
that seems strange. It should be something like four instructions inline.
>>
>> On x86, where there is only one caller of this, it really seems like it
>> ought to reduce the overhead to almost zero (since it most likely is
>> hidden in the pipeline.)
>>
>> I would like to suggest defining it inline if
>> CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE is set; I really don't
>> care about an architecture which doesn't have it.
>
> But if there is only one caller, isn't the compiler supposed to
> inline the function based on options?
If it is marked static in the same file, yes, but you have it in a
different file from what I can tell.
> The assembly will call it too, so I would need an inline and a
> non-inline based on the caller.
Where? I don't see that anywhere, at least for x86.
-hpa
^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH v3 2/4] x86/syscalls: Specific usage of verify_pre_usermode_state
2017-03-22 20:49 ` [kernel-hardening] " H. Peter Anvin
@ 2017-03-22 21:11 ` Thomas Garnier
-1 siblings, 0 replies; 46+ messages in thread
From: Thomas Garnier @ 2017-03-22 21:11 UTC (permalink / raw)
To: H. Peter Anvin
Cc: Andy Lutomirski, Ingo Molnar, Martin Schwidefsky, Heiko Carstens,
David Howells, Arnd Bergmann, Al Viro, Dave Hansen,
René Nyffenegger, Andrew Morton, Kees Cook,
Paul E . McKenney, Andy Lutomirski, Ard Biesheuvel,
Nicolas Pitre, Petr Mladek, Sebastian Andrzej Siewior,
Sergey Senozhatsky, Helge Deller, Rik van Riel, John Stultz,
Thomas Gleixner
On Wed, Mar 22, 2017 at 1:49 PM, H. Peter Anvin <hpa@zytor.com> wrote:
> On 03/22/17 13:41, Thomas Garnier wrote:
>>>> with the change below for additional feedback.
>>>
>>> Can you specify what that means?
>>
>> If I set inline by default, the compiler chose not to inline it on
>> x86. If I force inline the size impact was actually bigger (without
>> the architecture specific code).
>>
>
> That's utterly bizarre. Something strange is going on there. I suspect
> the right thing to do is to out-of-line the error case only, but even
> that seems strange. It should be something like four instructions inline.
>
The compiler seemed to often inline other functions called by the
syscall handlers. I assume the growth was due to changes in code
optimization because the function is much larger at the end.
>>>
>>> On x86, where there is only one caller of this, it really seems like it
>>> ought to reduce the overhead to almost zero (since it most likely is
>>> hidden in the pipeline.)
>>>
>>> I would like to suggest defining it inline if
>>> CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE is set; I really don't
>>> care about an architecture which doesn't have it.
>>
>> But if there is only one caller, isn't the compiler supposed to
>> inline the function based on options?
>
> If it is marked static in the same file, yes, but you have it in a
> different file from what I can tell.
If we do global optimization, it should. Having it as a static inline
makes it easier on all types of builds.
>
>> The assembly will call it too, so I would need an inline and a
>> non-inline based on the caller.
>
> Where? I don't see that anywhere, at least for x86.
After the latest changes on x86, yes. On arm/arm64, we call it with
the CHECK_DATA_CORRUPTION config.
>
> -hpa
>
--
Thomas
^ permalink raw reply [flat|nested] 46+ messages in thread
* [kernel-hardening] Re: [PATCH v3 2/4] x86/syscalls: Specific usage of verify_pre_usermode_state
@ 2017-03-22 21:11 ` Thomas Garnier
0 siblings, 0 replies; 46+ messages in thread
From: Thomas Garnier @ 2017-03-22 21:11 UTC (permalink / raw)
To: H. Peter Anvin
Cc: Andy Lutomirski, Ingo Molnar, Martin Schwidefsky, Heiko Carstens,
David Howells, Arnd Bergmann, Al Viro, Dave Hansen,
René Nyffenegger, Andrew Morton, Kees Cook,
Paul E . McKenney, Andy Lutomirski, Ard Biesheuvel,
Nicolas Pitre, Petr Mladek, Sebastian Andrzej Siewior,
Sergey Senozhatsky, Helge Deller, Rik van Riel, John Stultz,
Thomas Gleixner, Oleg Nesterov, Stephen Smalley,
Pavel Tikhomirov, Frederic Weisbecker, Stanislav Kinsburskiy,
Ingo Molnar, Paolo Bonzini, Dmitry Safonov, Borislav Petkov,
Josh Poimboeuf, Brian Gerst, Jan Beulich, Christian Borntraeger,
Fenghua Yu, He Chen, Russell King, Vladimir Murzin, Will Deacon,
Catalin Marinas, Mark Rutland, James Morse, David A . Long,
Pratyush Anand, Laura Abbott, Andre Przywara, Chris Metcalf,
linux-s390, LKML, Linux API, the arch/x86 maintainers,
linux-arm-kernel, Kernel Hardening
On Wed, Mar 22, 2017 at 1:49 PM, H. Peter Anvin <hpa@zytor.com> wrote:
> On 03/22/17 13:41, Thomas Garnier wrote:
>>>> with the change below for additional feedback.
>>>
>>> Can you specify what that means?
>>
>> If I set inline by default, the compiler chose not to inline it on
>> x86. If I force inline the size impact was actually bigger (without
>> the architecture specific code).
>>
>
> That's utterly bizarre. Something strange is going on there. I suspect
> the right thing to do is to out-of-line the error case only, but even
> that seems strange. It should be something like four instructions inline.
>
The compiler seemed to often inline other functions called by the
syscall handlers. I assume the growth was due to changes in code
optimization because the function is much larger at the end.
>>>
>>> On x86, where there is only one caller of this, it really seems like it
>>> ought to reduce the overhead to almost zero (since it most likely is
>>> hidden in the pipeline.)
>>>
>>> I would like to suggest defining it inline if
>>> CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE is set; I really don't
>>> care about an architecture which doesn't have it.
>>
>> But if there is only one caller, isn't the compiler supposed to
>> inline the function based on options?
>
> If it is marked static in the same file, yes, but you have it in a
> different file from what I can tell.
If we do global optimization, it should. Having it as a static inline
makes it easier on all types of builds.
>
>> The assembly will call it too, so I would need an inline and a
>> non-inline based on the caller.
>
> Where? I don't see that anywhere, at least for x86.
After the latest changes on x86, yes. On arm/arm64, we call it with
the CHECK_DATA_CORRUPTION config.
>
> -hpa
>
--
Thomas
^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH v3 2/4] x86/syscalls: Specific usage of verify_pre_usermode_state
2017-03-22 21:11 ` [kernel-hardening] " Thomas Garnier
(?)
(?)
@ 2017-03-23 19:14 ` H. Peter Anvin
-1 siblings, 0 replies; 46+ messages in thread
From: H. Peter Anvin @ 2017-03-23 19:14 UTC (permalink / raw)
To: Thomas Garnier
Cc: Andy Lutomirski, Ingo Molnar, Martin Schwidefsky, Heiko Carstens,
David Howells, Arnd Bergmann, Al Viro, Dave Hansen,
René Nyffenegger, Andrew Morton, Kees Cook,
Paul E . McKenney, Andy Lutomirski, Ard Biesheuvel,
Nicolas Pitre, Petr Mladek, Sebastian Andrzej Siewior,
Sergey Senozhatsky, Helge Deller, Rik van Riel, John Stultz,
Thomas Gleixner, Oleg Nesterov, Stephen Smalley,
Pavel Tikhomirov, Frederic
Weisbecker <fweisbec@gmail.com>,Stanislav Kinsburskiy <skinsbursky@virtuozzo.com>,Ingo Molnar <mingo@redhat.com>,Paolo Bonzini <pbonzini@redhat.com>,Dmitry Safonov <dsafonov@virtuozzo.com>,Borislav Petkov <bp@alien8.de>,Josh Poimboeuf <jpoimboe@redhat.com>,Brian Gerst <brgerst@gmail.com>,Jan Beulich <JBeulich@suse.com>,Christian Borntraeger <borntraeger@de.ibm.com>,Fenghua Yu <fenghua.yu@intel.com>,He Chen <he.chen@linux.intel.com>,Russell King <linux@armlinux.org.uk>,Vladimir Murzin <vladimir.murzin@arm.com>,Will Deacon <will.deacon@arm.com>,Catalin Marinas <catalin.marinas@arm.com>,Mark Rutland <mark.rutland@arm.com>,James Morse <james.morse@arm.com>,"David A . Long" <dave.long@linaro.org>,Pratyush Anand <panand@redhat.com>,Laura Abbott <labbott@redhat.com>,Andre Przywara <andre.przywara@arm.com>,Chris Metcalf <cmetcalf@mellanox.com>,linux-s390 <linux-s390@vger.kernel.org>,LKML <linux-kernel@vger.kernel.org>,Linux API <linux-api@vger.kernel.org>,the arch/x86 maintainers
<x86@kernel.org>,"linux-arm-kernel@lists.infradead.org" <linux-arm-kernel@lists.infradead.org>,Kernel Hardening <kernel-hardening@lists.openwall.com>
From: hpa@zytor.com
Message-ID: <E0C934E7-6789-4598-AF55-468C84B8568B@zytor.com>
On March 22, 2017 2:11:12 PM PDT, Thomas Garnier <thgarnie@google.com> wrote:
>On Wed, Mar 22, 2017 at 1:49 PM, H. Peter Anvin <hpa@zytor.com> wrote:
>> On 03/22/17 13:41, Thomas Garnier wrote:
>>>>> with the change below for additional feedback.
>>>>
>>>> Can you specify what that means?
>>>
>>> If I set inline by default, the compiler chose not to inline it on
>>> x86. If I force inline the size impact was actually bigger (without
>>> the architecture specific code).
>>>
>>
>> That's utterly bizarre. Something strange is going on there. I
>suspect
>> the right thing to do is to out-of-line the error case only, but even
>> that seems strange. It should be something like four instructions
>inline.
>>
>
>The compiler seemed to often inline other functions called by the
>syscall handlers. I assume the growth was due to changes in code
>optimization because the function is much larger at the end.
>
>>>>
>>>> On x86, where there is only one caller of this, it really seems
>like it
>>>> ought to reduce the overhead to almost zero (since it most likely
>is
>>>> hidden in the pipeline.)
>>>>
>>>> I would like to suggest defining it inline if
>>>> CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE is set; I really
>don't
>>>> care about an architecture which doesn't have it.
>>>
>>> But if there is only one caller, isn't the compiler supposed to
>>> inline the function based on options?
>>
>> If it is marked static in the same file, yes, but you have it in a
>> different file from what I can tell.
>
>If we do global optimization, it should. Having it as a static inline
>makes it easier on all types of builds.
>
>>
>>> The assembly will call it too, so I would need an inline and a
>>> non-inline based on the caller.
>>
>> Where? I don't see that anywhere, at least for x86.
>
>After the latest changes on x86, yes. On arm/arm64, we call it with
>the CHECK_DATA_CORRUPTION config.
>
>>
>> -hpa
>>
If we do global optimization, yes, but global optimization (generally called link-time optimization, LTO, on Linux) is very much the exception and not the rule for the Linux kernel at this time.
--
Sent from my Android device with K-9 Mail. Please excuse my brevity.
^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH v3 2/4] x86/syscalls: Specific usage of verify_pre_usermode_state
[not found] ` <CAJcbSZEouZ2v+q_i-3Xiba2FNT18ipKwF09838vvfSCwEi7e4Q-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2017-03-23 19:14 ` H. Peter Anvin
0 siblings, 0 replies; 46+ messages in thread
From: H. Peter Anvin @ 2017-03-23 19:14 UTC (permalink / raw)
To: Thomas Garnier
Cc: Andy Lutomirski, Ingo Molnar, Martin Schwidefsky, Heiko Carstens,
David Howells, Arnd Bergmann, Al Viro, Dave Hansen,
René Nyffenegger, Andrew Morton, Kees Cook,
Paul E . McKenney, Andy Lutomirski, Ard Biesheuvel,
Nicolas Pitre, Petr Mladek, Sebastian Andrzej Siewior,
Sergey Senozhatsky
Weisbecker <fweisbec-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>,Stanislav Kinsburskiy <skinsbursky-5HdwGun5lf+gSpxsJD1C4w@public.gmane.org>,Ingo Molnar <mingo-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>,Paolo Bonzini <pbonzini-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>,Dmitry Safonov <dsafonov-5HdwGun5lf+gSpxsJD1C4w@public.gmane.org>,Borislav Petkov <bp-Gina5bIWoIWzQB+pC5nmwQ@public.gmane.org>,Josh Poimboeuf <jpoimboe-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>,Brian Gerst <brgerst-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>,Jan Beulich <JBeulich-IBi9RG/b67k@public.gmane.org>,Christian Borntraeger <borntraeger-tA70FqPdS9bQT0dZR+AlfA@public.gmane.org>,Fenghua Yu <fenghua.yu-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>,He Chen <he.chen-VuQAYsv1563Yd54FQh9/CA@public.gmane.org>,Russell King <linux-I+IVW8TIWO2tmTQ+vhA3Yw@public.gmane.org>,Vladimir Murzin <vladimir.murzin-5wv7dgnIgG8@public.gmane.org>,Will Deacon <will.deacon-5wv7dgnIgG8@public.gmane.org>,Catalin Marinas <catalin.marinas-5wv7dgnIgG8@public.gmane.org>,Mark Rutland <mark.rutland-5wv7dgnIgG8@public.gmane.org>,James Morse <james.morse-5wv7dgnIgG8@public.gmane.org>,"David A . Long" <dave.long-QSEj5FYQhm4dnm+yROfE0A@public.gmane.org>,Pratyush Anand <panand-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>,Laura Abbott <labbott-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>,Andre Przywara <andre.przywara
@arm.com>,Chris Metcalf <cmetcalf-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>,linux-s390 <linux-s390-u79uwXL29TY76Z2rM5mHXA@public.gmane.org>,LKML <linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org>,Linux API <linux-api-u79uwXL29TY76Z2rM5mHXA@public.gmane.org>,the arch/x86 maintainers
<x86-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>,"linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r@public.gmane.org" <linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r@public.gmane.org>,Kernel Hardening <kernel-hardening-ZwoEplunGu1jrUoiu81ncdBPR1lH4CV8@public.gmane.org>
From: hpa-YMNOUZJC4hwAvxtiuMwx3w@public.gmane.org
Message-ID: <E0C934E7-6789-4598-AF55-468C84B8568B-YMNOUZJC4hwAvxtiuMwx3w@public.gmane.org>
On March 22, 2017 2:11:12 PM PDT, Thomas Garnier <thgarnie-hpIqsD4AKlfQT0dZR+AlfA@public.gmane.org> wrote:
>On Wed, Mar 22, 2017 at 1:49 PM, H. Peter Anvin <hpa-YMNOUZJC4hwAvxtiuMwx3w@public.gmane.org> wrote:
>> On 03/22/17 13:41, Thomas Garnier wrote:
>>>>> with the change below for additional feedback.
>>>>
>>>> Can you specify what that means?
>>>
>>> If I set inline by default, the compiler chose not to inline it on
>>> x86. If I force inline the size impact was actually bigger (without
>>> the architecture specific code).
>>>
>>
>> That's utterly bizarre. Something strange is going on there. I
>suspect
>> the right thing to do is to out-of-line the error case only, but even
>> that seems strange. It should be something like four instructions
>inline.
>>
>
>The compiler seemed to often inline other functions called by the
>syscall handlers. I assume the growth was due to changes in code
>optimization because the function is much larger at the end.
>
>>>>
>>>> On x86, where there is only one caller of this, it really seems
>like it
>>>> ought to reduce the overhead to almost zero (since it most likely
>is
>>>> hidden in the pipeline.)
>>>>
>>>> I would like to suggest defining it inline if
>>>> CONFIG_ARCH_NO_SYSCALL_VERIFY_PRE_USERMODE_STATE is set; I really
>don't
>>>> care about an architecture which doesn't have it.
>>>
>>> But if there is only one caller, isn't the compiler supposed to
>>> inline the function based on options?
>>
>> If it is marked static in the same file, yes, but you have it in a
>> different file from what I can tell.
>
>If we do global optimization, it should. Having it as a static inline
>makes it easier on all types of builds.
>
>>
>>> The assembly will call it too, so I would need an inline and a
>>> non-inline based on the caller.
>>
>> Where? I don't see that anywhere, at least for x86.
>
>After the latest changes on x86, yes. On arm/arm64, we call it with
>the CHECK_DATA_CORRUPTION config.
>
>>
>> -hpa
>>
If we do global optimization, yes, but global optimization (generally called link-time optimization, LTO, on Linux) is very much the exception and not the rule for the Linux kernel at this time.
--
Sent from my Android device with K-9 Mail. Please excuse my brevity.
^ permalink raw reply [flat|nested] 46+ messages in thread
end of thread, other threads:[~2017-03-23 19:39 UTC | newest]
Thread overview: 46+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-03-11 0:04 [PATCH v3 1/4] syscalls: Restore address limit after a syscall Thomas Garnier
2017-03-11 0:04 ` [kernel-hardening] " Thomas Garnier
2017-03-11 0:04 ` [PATCH v3 2/4] x86/syscalls: Specific usage of verify_pre_usermode_state Thomas Garnier
2017-03-11 0:04 ` [kernel-hardening] " Thomas Garnier
2017-03-11 9:42 ` Ingo Molnar
2017-03-11 9:42 ` [kernel-hardening] " Ingo Molnar
2017-03-13 15:53 ` Thomas Garnier
2017-03-13 15:53 ` [kernel-hardening] " Thomas Garnier
2017-03-13 21:48 ` H. Peter Anvin
2017-03-13 21:48 ` H. Peter Anvin
2017-03-14 0:04 ` H. Peter Anvin
2017-03-14 0:04 ` [kernel-hardening] " H. Peter Anvin
2017-03-14 9:40 ` H. Peter Anvin
2017-03-14 9:40 ` [kernel-hardening] " H. Peter Anvin
2017-03-14 15:17 ` Thomas Garnier
2017-03-14 15:17 ` [kernel-hardening] " Thomas Garnier
2017-03-14 15:39 ` Andy Lutomirski
2017-03-14 15:39 ` [kernel-hardening] " Andy Lutomirski
2017-03-14 16:29 ` Thomas Garnier
2017-03-14 16:29 ` [kernel-hardening] " Thomas Garnier
2017-03-14 16:44 ` H. Peter Anvin
2017-03-14 16:44 ` [kernel-hardening] " H. Peter Anvin
2017-03-14 16:51 ` Thomas Garnier
2017-03-14 16:51 ` [kernel-hardening] " Thomas Garnier
2017-03-14 17:53 ` H. Peter Anvin
2017-03-14 17:53 ` [kernel-hardening] " H. Peter Anvin
2017-03-15 17:43 ` Thomas Garnier
2017-03-15 17:43 ` [kernel-hardening] " Thomas Garnier
2017-03-22 19:15 ` Thomas Garnier
2017-03-22 19:15 ` [kernel-hardening] " Thomas Garnier
2017-03-22 20:21 ` H. Peter Anvin
2017-03-22 20:21 ` [kernel-hardening] " H. Peter Anvin
2017-03-22 20:41 ` Thomas Garnier
2017-03-22 20:41 ` [kernel-hardening] " Thomas Garnier
2017-03-22 20:49 ` H. Peter Anvin
2017-03-22 20:49 ` [kernel-hardening] " H. Peter Anvin
2017-03-22 21:11 ` Thomas Garnier
2017-03-22 21:11 ` [kernel-hardening] " Thomas Garnier
[not found] ` <CAJcbSZEouZ2v+q_i-3Xiba2FNT18ipKwF09838vvfSCwEi7e4Q-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2017-03-23 19:14 ` H. Peter Anvin
2017-03-23 19:14 ` H. Peter Anvin
2017-03-14 16:30 ` H. Peter Anvin
2017-03-14 16:30 ` [kernel-hardening] " H. Peter Anvin
2017-03-11 0:05 ` [PATCH v3 3/4] arm/syscalls: " Thomas Garnier
2017-03-11 0:05 ` [kernel-hardening] " Thomas Garnier
2017-03-11 0:05 ` [PATCH v3 4/4] arm64/syscalls: " Thomas Garnier
2017-03-11 0:05 ` [kernel-hardening] " Thomas Garnier
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.