All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC] x86: restrict pid namespaces to 32 or 64 bit syscalls
@ 2011-08-12 15:03 ` Vasiliy Kulikov
  0 siblings, 0 replies; 48+ messages in thread
From: Vasiliy Kulikov @ 2011-08-12 15:03 UTC (permalink / raw)
  To: Thomas Gleixner, Ingo Molnar
  Cc: H. Peter Anvin, James Morris, kernel-hardening, x86,
	linux-kernel, linux-security-module

This patch allows x86-64 systems with 32 bit syscalls support to lock a
pid namespace to 32 or 64 bitness syscalls/tasks.  By denying rarely
used compatibility syscalls it reduces an attack surface for 32 bit
containers.

A new sysctl, abi.bitness_locked, is introduced.  If set to 1, it locks
all tasks inside the current pid namespace to the bitness of the init
task (pid_ns->child_reaper).  After that:

1) a task trying to do a syscall of other bitness would get a signal as
if the corresponding syscall is not enabled (IDT entry/MSR is not
initialized).

2) loading ELF binaries of another bitness is prohibited (as if the
corresponding CONFIG_BINFMT_*=N).

If any task in the namespace differs in bitness, the lock attempt fails.


In this version of the patch the locking is done via sysctl.  In the
future I plan to do it via prctl() to handle the case of a container
root compromise.  For now, the lock can be configured by init scripts,
which parse /etc/sysctl.conf and set the sysctl variable.  But if
/sbin/init is compromised, the malicious code would be able to make
arbitrary syscalls.  So it should be possible to lock the container
before init is executed.

( The asm stubs for denied syscalls might be buggy, if so - please
ignore them :) it is just a PoC. )

Questions/thoughts:

The patch adds a check to the syscall entry code.  Is it a significant
slowdown for fast syscalls?  If so, would it be better to move the
check into the scheduler code and enable/disable the corresponding
interrupt/MSRs on each task switch?


Signed-off-by: Vasiliy Kulikov <segoon@openwall.com>
---
 arch/x86/ia32/ia32entry.S          |   33 +++++
 arch/x86/include/asm/elf.h         |    5 +-
 arch/x86/include/asm/thread_info.h |   13 ++-
 arch/x86/kernel/Makefile           |    1 +
 arch/x86/kernel/entry_64.S         |   12 ++-
 arch/x86/kernel/syscall_restrict.c |  229 ++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/traps.c            |    2 +-
 kernel/fork.c                      |    5 +
 8 files changed, 293 insertions(+), 7 deletions(-)

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index a0e866d..5bc1882 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -151,6 +151,8 @@ ENTRY(ia32_sysenter_target)
  	.quad 1b,ia32_badarg
  	.previous	
 	GET_THREAD_INFO(%r10)
+	testl  $_TIF_SYSCALL32_DENIED,TI_flags(%r10)
+	jnz ia32_denied_sysenter
 	orl    $TS_COMPAT,TI_status(%r10)
 	testl  $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
 	CFI_REMEMBER_STATE
@@ -310,6 +312,8 @@ ENTRY(ia32_cstar_target)
 	.quad 1b,ia32_badarg
 	.previous	
 	GET_THREAD_INFO(%r10)
+	testl  $_TIF_SYSCALL32_DENIED,TI_flags(%r10)
+	jnz ia32_denied_syscall
 	orl   $TS_COMPAT,TI_status(%r10)
 	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
 	CFI_REMEMBER_STATE
@@ -421,6 +425,8 @@ ENTRY(ia32_syscall)
 	   this could be a problem. */
 	SAVE_ARGS 0,1,0
 	GET_THREAD_INFO(%r10)
+	testl  $_TIF_SYSCALL32_DENIED,TI_flags(%r10)
+	jnz ia32_denied_int
 	orl   $TS_COMPAT,TI_status(%r10)
 	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
 	jnz ia32_tracesys
@@ -453,6 +459,33 @@ ia32_badsys:
 	movq $-ENOSYS,%rax
 	jmp ia32_sysret
 
+ia32_denied_sysenter:
+	SAVE_REST
+	CLEAR_RREGS
+	movq	%rsp,%rdi        /* &pt_regs -> arg1 */
+	call	do_ia32_denied_sysenter
+	LOAD_ARGS32 ARGOFFSET  /* reload args from stack in case ptrace changed it */
+	RESTORE_REST
+	jmp	int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */
+
+ia32_denied_syscall:
+	SAVE_REST
+	CLEAR_RREGS
+	movq	%rsp,%rdi        /* &pt_regs -> arg1 */
+	movq $-ENOSYS,%rax
+	LOAD_ARGS32 ARGOFFSET  /* reload args from stack in case ptrace changed it */
+	RESTORE_REST
+	jmp	int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */
+
+ia32_denied_int:
+	SAVE_REST
+	CLEAR_RREGS
+	movq	%rsp,%rdi        /* &pt_regs -> arg1 */
+	call	do_ia32_denied_int
+	LOAD_ARGS32 ARGOFFSET  /* reload args from stack in case ptrace changed it */
+	RESTORE_REST
+	jmp	int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */
+
 quiet_ni_syscall:
 	movq $-ENOSYS,%rax
 	ret
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index f2ad216..fb054c7 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -153,9 +153,10 @@ do {						\
  * This is used to ensure we don't load something for the wrong architecture.
  */
 #define elf_check_arch(x)			\
-	((x)->e_machine == EM_X86_64)
+	((x)->e_machine == EM_X86_64 && !test_thread_flag(TIF_SYSCALL64_DENIED))
 
-#define compat_elf_check_arch(x)	elf_check_arch_ia32(x)
+#define compat_elf_check_arch(x)		\
+	(elf_check_arch_ia32(x) && !test_thread_flag(TIF_SYSCALL32_DENIED))
 
 static inline void elf_common_init(struct thread_struct *t,
 				   struct pt_regs *regs, const u16 ds)
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index a1fe5c1..1e93040 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -95,6 +95,8 @@ struct thread_info {
 #define TIF_BLOCKSTEP		25	/* set when we want DEBUGCTLMSR_BTF */
 #define TIF_LAZY_MMU_UPDATES	27	/* task is updating the mmu lazily */
 #define TIF_SYSCALL_TRACEPOINT	28	/* syscall tracepoint instrumentation */
+#define TIF_SYSCALL32_DENIED	29	/* 32 bit syscalls are denied */
+#define TIF_SYSCALL64_DENIED	30	/* 64 bit syscalls are denied */
 
 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
@@ -117,6 +119,8 @@ struct thread_info {
 #define _TIF_BLOCKSTEP		(1 << TIF_BLOCKSTEP)
 #define _TIF_LAZY_MMU_UPDATES	(1 << TIF_LAZY_MMU_UPDATES)
 #define _TIF_SYSCALL_TRACEPOINT	(1 << TIF_SYSCALL_TRACEPOINT)
+#define _TIF_SYSCALL32_DENIED	(1 << TIF_SYSCALL32_DENIED)
+#define _TIF_SYSCALL64_DENIED	(1 << TIF_SYSCALL64_DENIED)
 
 /* work to do in syscall_trace_enter() */
 #define _TIF_WORK_SYSCALL_ENTRY	\
@@ -259,9 +263,14 @@ static inline void set_restore_sigmask(void)
 	ti->status |= TS_RESTORE_SIGMASK;
 	set_bit(TIF_SIGPENDING, (unsigned long *)&ti->flags);
 }
-#endif	/* !__ASSEMBLY__ */
 
-#ifndef __ASSEMBLY__
+#ifdef CONFIG_IA32_EMULATION
+#define __HAVE_ARCH_POST_FORK
+
+extern void arch_post_fork(struct task_struct *task);
+
+#endif /* CONFIG_IA32_EMULATION */
+
 extern void arch_task_cache_init(void);
 extern void free_thread_info(struct thread_info *ti);
 extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src);
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 0410557..a200ff3 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -86,6 +86,7 @@ obj-$(CONFIG_DOUBLEFAULT) 	+= doublefault_32.o
 obj-$(CONFIG_KGDB)		+= kgdb.o
 obj-$(CONFIG_VM86)		+= vm86_32.o
 obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o
+obj-$(CONFIG_SYSCTL)		+= syscall_restrict.o
 
 obj-$(CONFIG_HPET_TIMER) 	+= hpet.o
 obj-$(CONFIG_APB_TIMER)		+= apb_timer.o
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index e13329d..b184a45 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -474,6 +474,8 @@ ENTRY(system_call_after_swapgs)
 	movq  %rcx,RIP-ARGOFFSET(%rsp)
 	CFI_REL_OFFSET rip,RIP-ARGOFFSET
 	GET_THREAD_INFO(%rcx)
+	testl $_TIF_SYSCALL64_DENIED,TI_flags(%rcx)
+	jnz denied_sys
 	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx)
 	jnz tracesys
 system_call_fastpath:
@@ -539,8 +541,14 @@ sysret_signal:
 	jmp int_check_syscall_exit_work
 
 badsys:
-	movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
-	jmp ret_from_sys_call
+	SAVE_REST
+	movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
+	FIXUP_TOP_OF_STACK %rdi
+	movq	%rsp,%rdi        /* &pt_regs -> arg1 */
+	call do_denied_syscall
+	LOAD_ARGS ARGOFFSET, 1
+	RESTORE_REST
+	jmp	int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */
 
 #ifdef CONFIG_AUDITSYSCALL
 	/*
diff --git a/arch/x86/kernel/syscall_restrict.c b/arch/x86/kernel/syscall_restrict.c
new file mode 100644
index 0000000..a676f22
--- /dev/null
+++ b/arch/x86/kernel/syscall_restrict.c
@@ -0,0 +1,229 @@
+#include <linux/thread_info.h>
+#include <linux/pid_namespace.h>
+#include <linux/sysctl.h>
+#include <linux/kprobes.h>
+#include <asm/kdebug.h>
+#include <linux/kdebug.h>
+
+#ifdef CONFIG_IA32_EMULATION
+
+void __kprobes
+do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
+	long error_code, siginfo_t *info);
+
+asmlinkage
+void do_ia32_denied_sysenter(struct pt_regs *regs)
+{
+	current->thread.error_code = 0;
+	current->thread.trap_no = 13;
+
+	if (printk_ratelimit()) {
+		pr_err("%s[%d] attempt to use denied 32-bit sysenter, ip:%lx sp:%lx",
+			current->comm, task_pid_nr(current),
+			regs->ip, regs->sp);
+		print_vma_addr(" in ", regs->ip);
+		printk("\n");
+	}
+
+	force_sig(SIGSEGV, current);
+	return;
+
+}
+
+asmlinkage
+void do_ia32_denied_int(struct pt_regs *regs)
+{
+	if (printk_ratelimit()) {
+		pr_err("%s[%d] attempt to use denied 32-bit int80h, ip :%lx sp:%lx",
+			current->comm, task_pid_nr(current),
+			regs->ip, regs->sp);
+		print_vma_addr(" in ", regs->ip);
+		printk("\n");
+	}
+
+	do_trap(11, SIGBUS, "segment not present", regs, 0, NULL);
+}
+
+asmlinkage
+void do_denied_syscall(struct pt_regs *regs)
+{
+	siginfo_t info = {
+		.si_signo = SIGILL,
+		.si_errno = 0,
+		.si_code = ILL_ILLOPN,
+		.si_addr = (void __user *)regs->ip
+	};
+
+	if (printk_ratelimit()) {
+		pr_err("%s[%d] attempt to use denied 64-bit syscall, ip:%lx sp:%lx",
+			current->comm, task_pid_nr(current),
+			regs->ip, regs->sp);
+		print_vma_addr(" in ", regs->ip);
+		printk("\n");
+	}
+
+
+	do_trap(6, SIGILL, "invalid opcode", regs, 0, &info);
+}
+
+static int task_get_bitness(struct task_struct *task)
+{
+	if (test_ti_thread_flag(task_thread_info(task), TIF_IA32))
+		return 32;
+	else
+		return 64;
+}
+
+static bool pidns_locked(struct pid_namespace *pid_ns)
+{
+	struct thread_info *ti = task_thread_info(pid_ns->child_reaper);
+
+	return test_ti_thread_flag(ti, TIF_SYSCALL32_DENIED) ||
+	       test_ti_thread_flag(ti, TIF_SYSCALL64_DENIED);
+}
+
+static int bits_to_flags(int bits)
+{
+	if (bits == 32)
+		return TIF_SYSCALL64_DENIED;
+	else
+		return TIF_SYSCALL32_DENIED;
+}
+
+void arch_post_fork(struct task_struct *task)
+{
+	int clear_bit_nr;
+
+	if (!pidns_locked(current->nsproxy->pid_ns))
+		return;
+
+	clear_bit_nr = bits_to_flags(task_get_bitness(current));
+	set_tsk_thread_flag(task, clear_bit_nr);
+}
+
+/* Called under rcu_read_lock and write_lock_irq(tasklist) */
+static int __pidns_may_lock_bitness(struct pid_namespace *pid_ns, int bits)
+{
+	struct task_struct *task;
+	int old_bits;
+	int nr;
+
+	for (nr = next_pidmap(pid_ns, 0); nr > 0; nr = next_pidmap(pid_ns, nr)) {
+		task = pid_task(find_vpid(nr), PIDTYPE_PID);
+		if (!task)
+			continue;
+
+		old_bits = task_get_bitness(task);
+		if (old_bits != bits) {
+			pr_err("Inconsistent syscall restriction detected! "
+				"Parent ns tries to restrict syscalls to %d "
+				"bits while some task is %d bit.",
+				bits, old_bits);
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+/* Called under rcu_read_lock and write_lock_irq(tasklist) */
+static void __bitness_lock(struct pid_namespace *pid_ns, int bits)
+{
+	u32 clear_bit_nr;
+	struct task_struct *task;
+	int nr;
+
+	clear_bit_nr = bits_to_flags(bits);
+
+	for (nr = next_pidmap(pid_ns, 0); nr > 0; nr = next_pidmap(pid_ns, nr)) {
+		task = pid_task(find_vpid(nr), PIDTYPE_PID);
+		if (task)
+			set_tsk_thread_flag(task, clear_bit_nr);
+	}
+}
+
+static int bitness_lock(struct pid_namespace *pid_ns)
+{
+	int rc, new_bits;
+
+	rcu_read_lock();
+	write_lock_irq(&tasklist_lock);
+
+	new_bits = task_get_bitness(pid_ns->child_reaper);
+	rc = __pidns_may_lock_bitness(pid_ns, new_bits);
+	if (!rc)
+		__bitness_lock(pid_ns, new_bits);
+
+	write_unlock_irq(&tasklist_lock);
+	rcu_read_unlock();
+	return rc;
+}
+
+static int bitness_locked_handler(struct ctl_table *table, int write,
+				void __user *buffer, size_t *lenp,
+				loff_t *ppos)
+{
+	int rc, new_bits, old_bits;
+	struct ctl_table tbl = {
+		.procname	= table->procname,
+		.data		= &new_bits,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+	};
+
+	old_bits = new_bits = pidns_locked(current->nsproxy->pid_ns);
+	rc = proc_dointvec(&tbl, write, buffer, lenp, ppos);
+	if (rc || !write)
+		return rc;
+
+	if (!capable(CAP_SYS_ADMIN) || (new_bits == 0 && old_bits))
+		return -EACCES;
+	if (new_bits && old_bits)
+		return 0;
+	return bitness_lock(current->nsproxy->pid_ns);
+}
+
+static struct ctl_table abi_syscall_restrict[] = {
+	{
+		.procname = "bitness_locked",
+		.mode = 0644,
+		.proc_handler = bitness_locked_handler
+	},
+	{}
+};
+
+#else /* CONFIG_IA32_EMULATION */
+
+static int one = 1;
+
+static struct ctl_table abi_syscall_restrict[] = {
+	{
+		.procname	= "bitness_locked",
+		.data		= &one,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &one,
+		.extra2		= &one,
+	},
+	{}
+};
+
+#endif /* CONFIG_IA32_EMULATION */
+
+
+static struct ctl_table abi_root[] = {
+	{
+		.procname = "abi",
+		.mode = 0555,
+		.child = abi_syscall_restrict
+	},
+	{}
+};
+
+__init int syscall_restrict_init(void)
+{
+	register_sysctl_table(abi_root);
+	return 0;
+}
+device_initcall(syscall_restrict_init);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 9682ec5..a9bf9cf 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -116,7 +116,7 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
 	dec_preempt_count();
 }
 
-static void __kprobes
+void __kprobes
 do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
 	long error_code, siginfo_t *info)
 {
diff --git a/kernel/fork.c b/kernel/fork.c
index e7ceaca..55e4455 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1039,6 +1039,10 @@ static void posix_cpu_timers_init(struct task_struct *tsk)
 	INIT_LIST_HEAD(&tsk->cpu_timers[2]);
 }
 
+#ifndef __HAVE_ARCH_POST_FORK
+#define arch_post_fork(p)
+#endif
+
 /*
  * This creates a new process as a copy of the old one,
  * but does not actually start it yet.
@@ -1374,6 +1378,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	total_forks++;
 	spin_unlock(&current->sighand->siglock);
 	write_unlock_irq(&tasklist_lock);
+	arch_post_fork(p);
 	proc_fork_connector(p);
 	cgroup_post_fork(p);
 	if (clone_flags & CLONE_THREAD)
-- 
Vasiliy

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [kernel-hardening] [RFC] x86: restrict pid namespaces to 32 or 64 bit syscalls
@ 2011-08-12 15:03 ` Vasiliy Kulikov
  0 siblings, 0 replies; 48+ messages in thread
From: Vasiliy Kulikov @ 2011-08-12 15:03 UTC (permalink / raw)
  To: Thomas Gleixner, Ingo Molnar
  Cc: H. Peter Anvin, James Morris, kernel-hardening, x86,
	linux-kernel, linux-security-module

This patch allows x86-64 systems with 32 bit syscalls support to lock a
pid namespace to 32 or 64 bitness syscalls/tasks.  By denying rarely
used compatibility syscalls it reduces an attack surface for 32 bit
containers.

A new sysctl, abi.bitness_locked, is introduced.  If set to 1, it locks
all tasks inside the current pid namespace to the bitness of the init
task (pid_ns->child_reaper).  After that:

1) a task trying to do a syscall of other bitness would get a signal as
if the corresponding syscall is not enabled (IDT entry/MSR is not
initialized).

2) loading ELF binaries of another bitness is prohibited (as if the
corresponding CONFIG_BINFMT_*=N).

If any task in the namespace differs in bitness, the lock attempt fails.


In this version of the patch the locking is done via sysctl.  In the
future I plan to do it via prctl() to handle the case of a container
root compromise.  For now, the lock can be configured by init scripts,
which parse /etc/sysctl.conf and set the sysctl variable.  But if
/sbin/init is compromised, the malicious code would be able to make
arbitrary syscalls.  So it should be possible to lock the container
before init is executed.

( The asm stubs for denied syscalls might be buggy, if so - please
ignore them :) it is just a PoC. )

Questions/thoughts:

The patch adds a check to the syscall entry code.  Is it a significant
slowdown for fast syscalls?  If so, would it be better to move the
check into the scheduler code and enable/disable the corresponding
interrupt/MSRs on each task switch?


Signed-off-by: Vasiliy Kulikov <segoon@openwall.com>
---
 arch/x86/ia32/ia32entry.S          |   33 +++++
 arch/x86/include/asm/elf.h         |    5 +-
 arch/x86/include/asm/thread_info.h |   13 ++-
 arch/x86/kernel/Makefile           |    1 +
 arch/x86/kernel/entry_64.S         |   12 ++-
 arch/x86/kernel/syscall_restrict.c |  229 ++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/traps.c            |    2 +-
 kernel/fork.c                      |    5 +
 8 files changed, 293 insertions(+), 7 deletions(-)

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index a0e866d..5bc1882 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -151,6 +151,8 @@ ENTRY(ia32_sysenter_target)
  	.quad 1b,ia32_badarg
  	.previous	
 	GET_THREAD_INFO(%r10)
+	testl  $_TIF_SYSCALL32_DENIED,TI_flags(%r10)
+	jnz ia32_denied_sysenter
 	orl    $TS_COMPAT,TI_status(%r10)
 	testl  $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
 	CFI_REMEMBER_STATE
@@ -310,6 +312,8 @@ ENTRY(ia32_cstar_target)
 	.quad 1b,ia32_badarg
 	.previous	
 	GET_THREAD_INFO(%r10)
+	testl  $_TIF_SYSCALL32_DENIED,TI_flags(%r10)
+	jnz ia32_denied_syscall
 	orl   $TS_COMPAT,TI_status(%r10)
 	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
 	CFI_REMEMBER_STATE
@@ -421,6 +425,8 @@ ENTRY(ia32_syscall)
 	   this could be a problem. */
 	SAVE_ARGS 0,1,0
 	GET_THREAD_INFO(%r10)
+	testl  $_TIF_SYSCALL32_DENIED,TI_flags(%r10)
+	jnz ia32_denied_int
 	orl   $TS_COMPAT,TI_status(%r10)
 	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
 	jnz ia32_tracesys
@@ -453,6 +459,33 @@ ia32_badsys:
 	movq $-ENOSYS,%rax
 	jmp ia32_sysret
 
+ia32_denied_sysenter:
+	SAVE_REST
+	CLEAR_RREGS
+	movq	%rsp,%rdi        /* &pt_regs -> arg1 */
+	call	do_ia32_denied_sysenter
+	LOAD_ARGS32 ARGOFFSET  /* reload args from stack in case ptrace changed it */
+	RESTORE_REST
+	jmp	int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */
+
+ia32_denied_syscall:
+	SAVE_REST
+	CLEAR_RREGS
+	movq	%rsp,%rdi        /* &pt_regs -> arg1 */
+	movq $-ENOSYS,%rax
+	LOAD_ARGS32 ARGOFFSET  /* reload args from stack in case ptrace changed it */
+	RESTORE_REST
+	jmp	int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */
+
+ia32_denied_int:
+	SAVE_REST
+	CLEAR_RREGS
+	movq	%rsp,%rdi        /* &pt_regs -> arg1 */
+	call	do_ia32_denied_int
+	LOAD_ARGS32 ARGOFFSET  /* reload args from stack in case ptrace changed it */
+	RESTORE_REST
+	jmp	int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */
+
 quiet_ni_syscall:
 	movq $-ENOSYS,%rax
 	ret
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index f2ad216..fb054c7 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -153,9 +153,10 @@ do {						\
  * This is used to ensure we don't load something for the wrong architecture.
  */
 #define elf_check_arch(x)			\
-	((x)->e_machine == EM_X86_64)
+	((x)->e_machine == EM_X86_64 && !test_thread_flag(TIF_SYSCALL64_DENIED))
 
-#define compat_elf_check_arch(x)	elf_check_arch_ia32(x)
+#define compat_elf_check_arch(x)		\
+	(elf_check_arch_ia32(x) && !test_thread_flag(TIF_SYSCALL32_DENIED))
 
 static inline void elf_common_init(struct thread_struct *t,
 				   struct pt_regs *regs, const u16 ds)
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index a1fe5c1..1e93040 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -95,6 +95,8 @@ struct thread_info {
 #define TIF_BLOCKSTEP		25	/* set when we want DEBUGCTLMSR_BTF */
 #define TIF_LAZY_MMU_UPDATES	27	/* task is updating the mmu lazily */
 #define TIF_SYSCALL_TRACEPOINT	28	/* syscall tracepoint instrumentation */
+#define TIF_SYSCALL32_DENIED	29	/* 32 bit syscalls are denied */
+#define TIF_SYSCALL64_DENIED	30	/* 64 bit syscalls are denied */
 
 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
@@ -117,6 +119,8 @@ struct thread_info {
 #define _TIF_BLOCKSTEP		(1 << TIF_BLOCKSTEP)
 #define _TIF_LAZY_MMU_UPDATES	(1 << TIF_LAZY_MMU_UPDATES)
 #define _TIF_SYSCALL_TRACEPOINT	(1 << TIF_SYSCALL_TRACEPOINT)
+#define _TIF_SYSCALL32_DENIED	(1 << TIF_SYSCALL32_DENIED)
+#define _TIF_SYSCALL64_DENIED	(1 << TIF_SYSCALL64_DENIED)
 
 /* work to do in syscall_trace_enter() */
 #define _TIF_WORK_SYSCALL_ENTRY	\
@@ -259,9 +263,14 @@ static inline void set_restore_sigmask(void)
 	ti->status |= TS_RESTORE_SIGMASK;
 	set_bit(TIF_SIGPENDING, (unsigned long *)&ti->flags);
 }
-#endif	/* !__ASSEMBLY__ */
 
-#ifndef __ASSEMBLY__
+#ifdef CONFIG_IA32_EMULATION
+#define __HAVE_ARCH_POST_FORK
+
+extern void arch_post_fork(struct task_struct *task);
+
+#endif /* CONFIG_IA32_EMULATION */
+
 extern void arch_task_cache_init(void);
 extern void free_thread_info(struct thread_info *ti);
 extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src);
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 0410557..a200ff3 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -86,6 +86,7 @@ obj-$(CONFIG_DOUBLEFAULT) 	+= doublefault_32.o
 obj-$(CONFIG_KGDB)		+= kgdb.o
 obj-$(CONFIG_VM86)		+= vm86_32.o
 obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o
+obj-$(CONFIG_SYSCTL)		+= syscall_restrict.o
 
 obj-$(CONFIG_HPET_TIMER) 	+= hpet.o
 obj-$(CONFIG_APB_TIMER)		+= apb_timer.o
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index e13329d..b184a45 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -474,6 +474,8 @@ ENTRY(system_call_after_swapgs)
 	movq  %rcx,RIP-ARGOFFSET(%rsp)
 	CFI_REL_OFFSET rip,RIP-ARGOFFSET
 	GET_THREAD_INFO(%rcx)
+	testl $_TIF_SYSCALL64_DENIED,TI_flags(%rcx)
+	jnz denied_sys
 	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx)
 	jnz tracesys
 system_call_fastpath:
@@ -539,8 +541,14 @@ sysret_signal:
 	jmp int_check_syscall_exit_work
 
 badsys:
-	movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
-	jmp ret_from_sys_call
+	SAVE_REST
+	movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
+	FIXUP_TOP_OF_STACK %rdi
+	movq	%rsp,%rdi        /* &pt_regs -> arg1 */
+	call do_denied_syscall
+	LOAD_ARGS ARGOFFSET, 1
+	RESTORE_REST
+	jmp	int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */
 
 #ifdef CONFIG_AUDITSYSCALL
 	/*
diff --git a/arch/x86/kernel/syscall_restrict.c b/arch/x86/kernel/syscall_restrict.c
new file mode 100644
index 0000000..a676f22
--- /dev/null
+++ b/arch/x86/kernel/syscall_restrict.c
@@ -0,0 +1,229 @@
+#include <linux/thread_info.h>
+#include <linux/pid_namespace.h>
+#include <linux/sysctl.h>
+#include <linux/kprobes.h>
+#include <asm/kdebug.h>
+#include <linux/kdebug.h>
+
+#ifdef CONFIG_IA32_EMULATION
+
+void __kprobes
+do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
+	long error_code, siginfo_t *info);
+
+asmlinkage
+void do_ia32_denied_sysenter(struct pt_regs *regs)
+{
+	current->thread.error_code = 0;
+	current->thread.trap_no = 13;
+
+	if (printk_ratelimit()) {
+		pr_err("%s[%d] attempt to use denied 32-bit sysenter, ip:%lx sp:%lx",
+			current->comm, task_pid_nr(current),
+			regs->ip, regs->sp);
+		print_vma_addr(" in ", regs->ip);
+		printk("\n");
+	}
+
+	force_sig(SIGSEGV, current);
+	return;
+
+}
+
+asmlinkage
+void do_ia32_denied_int(struct pt_regs *regs)
+{
+	if (printk_ratelimit()) {
+		pr_err("%s[%d] attempt to use denied 32-bit int80h, ip :%lx sp:%lx",
+			current->comm, task_pid_nr(current),
+			regs->ip, regs->sp);
+		print_vma_addr(" in ", regs->ip);
+		printk("\n");
+	}
+
+	do_trap(11, SIGBUS, "segment not present", regs, 0, NULL);
+}
+
+asmlinkage
+void do_denied_syscall(struct pt_regs *regs)
+{
+	siginfo_t info = {
+		.si_signo = SIGILL,
+		.si_errno = 0,
+		.si_code = ILL_ILLOPN,
+		.si_addr = (void __user *)regs->ip
+	};
+
+	if (printk_ratelimit()) {
+		pr_err("%s[%d] attempt to use denied 64-bit syscall, ip:%lx sp:%lx",
+			current->comm, task_pid_nr(current),
+			regs->ip, regs->sp);
+		print_vma_addr(" in ", regs->ip);
+		printk("\n");
+	}
+
+
+	do_trap(6, SIGILL, "invalid opcode", regs, 0, &info);
+}
+
+static int task_get_bitness(struct task_struct *task)
+{
+	if (test_ti_thread_flag(task_thread_info(task), TIF_IA32))
+		return 32;
+	else
+		return 64;
+}
+
+static bool pidns_locked(struct pid_namespace *pid_ns)
+{
+	struct thread_info *ti = task_thread_info(pid_ns->child_reaper);
+
+	return test_ti_thread_flag(ti, TIF_SYSCALL32_DENIED) ||
+	       test_ti_thread_flag(ti, TIF_SYSCALL64_DENIED);
+}
+
+static int bits_to_flags(int bits)
+{
+	if (bits == 32)
+		return TIF_SYSCALL64_DENIED;
+	else
+		return TIF_SYSCALL32_DENIED;
+}
+
+void arch_post_fork(struct task_struct *task)
+{
+	int clear_bit_nr;
+
+	if (!pidns_locked(current->nsproxy->pid_ns))
+		return;
+
+	clear_bit_nr = bits_to_flags(task_get_bitness(current));
+	set_tsk_thread_flag(task, clear_bit_nr);
+}
+
+/* Called under rcu_read_lock and write_lock_irq(tasklist) */
+static int __pidns_may_lock_bitness(struct pid_namespace *pid_ns, int bits)
+{
+	struct task_struct *task;
+	int old_bits;
+	int nr;
+
+	for (nr = next_pidmap(pid_ns, 0); nr > 0; nr = next_pidmap(pid_ns, nr)) {
+		task = pid_task(find_vpid(nr), PIDTYPE_PID);
+		if (!task)
+			continue;
+
+		old_bits = task_get_bitness(task);
+		if (old_bits != bits) {
+			pr_err("Inconsistent syscall restriction detected! "
+				"Parent ns tries to restrict syscalls to %d "
+				"bits while some task is %d bit.",
+				bits, old_bits);
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+/* Called under rcu_read_lock and write_lock_irq(tasklist) */
+static void __bitness_lock(struct pid_namespace *pid_ns, int bits)
+{
+	u32 clear_bit_nr;
+	struct task_struct *task;
+	int nr;
+
+	clear_bit_nr = bits_to_flags(bits);
+
+	for (nr = next_pidmap(pid_ns, 0); nr > 0; nr = next_pidmap(pid_ns, nr)) {
+		task = pid_task(find_vpid(nr), PIDTYPE_PID);
+		if (task)
+			set_tsk_thread_flag(task, clear_bit_nr);
+	}
+}
+
+static int bitness_lock(struct pid_namespace *pid_ns)
+{
+	int rc, new_bits;
+
+	rcu_read_lock();
+	write_lock_irq(&tasklist_lock);
+
+	new_bits = task_get_bitness(pid_ns->child_reaper);
+	rc = __pidns_may_lock_bitness(pid_ns, new_bits);
+	if (!rc)
+		__bitness_lock(pid_ns, new_bits);
+
+	write_unlock_irq(&tasklist_lock);
+	rcu_read_unlock();
+	return rc;
+}
+
+static int bitness_locked_handler(struct ctl_table *table, int write,
+				void __user *buffer, size_t *lenp,
+				loff_t *ppos)
+{
+	int rc, new_bits, old_bits;
+	struct ctl_table tbl = {
+		.procname	= table->procname,
+		.data		= &new_bits,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+	};
+
+	old_bits = new_bits = pidns_locked(current->nsproxy->pid_ns);
+	rc = proc_dointvec(&tbl, write, buffer, lenp, ppos);
+	if (rc || !write)
+		return rc;
+
+	if (!capable(CAP_SYS_ADMIN) || (new_bits == 0 && old_bits))
+		return -EACCES;
+	if (new_bits && old_bits)
+		return 0;
+	return bitness_lock(current->nsproxy->pid_ns);
+}
+
+static struct ctl_table abi_syscall_restrict[] = {
+	{
+		.procname = "bitness_locked",
+		.mode = 0644,
+		.proc_handler = bitness_locked_handler
+	},
+	{}
+};
+
+#else /* CONFIG_IA32_EMULATION */
+
+static int one = 1;
+
+static struct ctl_table abi_syscall_restrict[] = {
+	{
+		.procname	= "bitness_locked",
+		.data		= &one,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &one,
+		.extra2		= &one,
+	},
+	{}
+};
+
+#endif /* CONFIG_IA32_EMULATION */
+
+
+static struct ctl_table abi_root[] = {
+	{
+		.procname = "abi",
+		.mode = 0555,
+		.child = abi_syscall_restrict
+	},
+	{}
+};
+
+__init int syscall_restrict_init(void)
+{
+	register_sysctl_table(abi_root);
+	return 0;
+}
+device_initcall(syscall_restrict_init);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 9682ec5..a9bf9cf 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -116,7 +116,7 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
 	dec_preempt_count();
 }
 
-static void __kprobes
+void __kprobes
 do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
 	long error_code, siginfo_t *info)
 {
diff --git a/kernel/fork.c b/kernel/fork.c
index e7ceaca..55e4455 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1039,6 +1039,10 @@ static void posix_cpu_timers_init(struct task_struct *tsk)
 	INIT_LIST_HEAD(&tsk->cpu_timers[2]);
 }
 
+#ifndef __HAVE_ARCH_POST_FORK
+#define arch_post_fork(p)
+#endif
+
 /*
  * This creates a new process as a copy of the old one,
  * but does not actually start it yet.
@@ -1374,6 +1378,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	total_forks++;
 	spin_unlock(&current->sighand->siglock);
 	write_unlock_irq(&tasklist_lock);
+	arch_post_fork(p);
 	proc_fork_connector(p);
 	cgroup_post_fork(p);
 	if (clone_flags & CLONE_THREAD)
-- 
Vasiliy

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] x86: restrict pid namespaces to 32 or 64 bit syscalls
  2011-08-12 15:03 ` [kernel-hardening] " Vasiliy Kulikov
@ 2011-08-12 20:08   ` H. Peter Anvin
  -1 siblings, 0 replies; 48+ messages in thread
From: H. Peter Anvin @ 2011-08-12 20:08 UTC (permalink / raw)
  To: Vasiliy Kulikov
  Cc: Thomas Gleixner, Ingo Molnar, James Morris, kernel-hardening,
	x86, linux-kernel, linux-security-module

On 08/12/2011 10:03 AM, Vasiliy Kulikov wrote:
> This patch allows x86-64 systems with 32 bit syscalls support to lock a
> pid namespace to 32 or 64 bitness syscalls/tasks.  By denying rarely
> used compatibility syscalls it reduces an attack surface for 32 bit
> containers.
> 
> A new sysctl, abi.bitness_locked, is introduced.  If set to 1, it locks
> all tasks inside the current pid namespace to the bitness of the init
> task (pid_ns->child_reaper).  After that:
> 
> 1) a task trying to do a syscall of other bitness would get a signal as
> if the corresponding syscall is not enabled (IDT entry/MSR is not
> initialized).
> 
> 2) loading ELF binaries of another bitness is prohibited (as if the
> corresponding CONFIG_BINFMT_*=N).
> 
> If any task in the namespace differs in bitness, the lock attempt fails.
> 
> In this version of the patch the locking is done via sysctl.  In the
> future I plan to do it via prctl() to handle the case of a container
> root compromise.  For now, the lock can be configured by init scripts,
> which parse /etc/sysctl.conf and set the sysctl variable.  But if
> /sbin/init is compromised, the malicious code would be able to make
> arbitrary syscalls.  So it should be possible to lock the container
> before init is executed.
> 
> ( The asm stubs for denied syscalls might be buggy, if so - please
> ignore them :) it is just a PoC. )
> 

NAK on this in its current form, as it breaks the upcoming x32 ABI.
Selection by ABI needs to be more specific.

However, I have to question the value of this... if this is enabled in
the system as a whole (as opposed to compiled out) it seems kind of
pointless... if there are bugs we need to deal with them anyway.

> Questions/thoughts:
> 
> The patch adds a check to the syscall entry code.  Is it a significant
> slowdown for fast syscalls?  If so, would it be better to move the
> check into the scheduler code and enable/disable the corresponding
> interrupt/MSRs on each task switch?
> 

*YOU* are the person who needs to answer that question by providing
measurements.  Quite frankly I suspect checks in the syscall code *or*
task switching MSRs are going to be unacceptable from a performance
point of view.

	-hpa

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [kernel-hardening] Re: [RFC] x86: restrict pid namespaces to 32 or 64 bit syscalls
@ 2011-08-12 20:08   ` H. Peter Anvin
  0 siblings, 0 replies; 48+ messages in thread
From: H. Peter Anvin @ 2011-08-12 20:08 UTC (permalink / raw)
  To: Vasiliy Kulikov
  Cc: Thomas Gleixner, Ingo Molnar, James Morris, kernel-hardening,
	x86, linux-kernel, linux-security-module

On 08/12/2011 10:03 AM, Vasiliy Kulikov wrote:
> This patch allows x86-64 systems with 32 bit syscalls support to lock a
> pid namespace to 32 or 64 bitness syscalls/tasks.  By denying rarely
> used compatibility syscalls it reduces an attack surface for 32 bit
> containers.
> 
> The new sysctl is introduced, abi.bitness_locked.  If set to 1, it locks
> all tasks inside of current pid namespace to the bitness of init task
> (pid_ns->child_reaper).  After that:
> 
> 1) a task trying to do a syscall of other bitness would get a signal as
> if the corresponding syscall is not enabled (IDT entry/MSR is not
> initialized).
> 
> 2) loading ELF binaries of another bitness is prohibited (as if the
> corresponding CONFIG_BINFMT_*=N).
> 
> If there is any task which differs in bitness, the lockup fails.
> 
> In this patch version the lockup is handled by sysctl.  In the future I
> plan to do it via prctl() to handle situations of container root
> compromise.  For now, the lockup can be configured by init scripts,
> which parse /etc/sysctl.conf and set the sysctl variable.  But if
> /sbin/init is compromised, the malicious code would gain the ability
> to do arbitrary syscalls.  So, it should be possible to lock up the
> container before the init execution.
> 
> ( The asm stubs for denied syscalls might be buggy, if so - please
> ignore them :) it is just a PoC. )
> 

NAK on this in its current form, as it breaks the upcoming x32 ABI.
Selection by ABI needs to be more specific.

However, I have to question the value of this... if this is enabled in
the system as a whole (as opposed to compiled out) it seems kind of
pointless... if there are bugs we need to deal with them anyway.

> Questions/thoughts:
> 
> The patch adds a check in syscalls code.  Is it a significant
> slowdown for fast syscalls?  If so, would it be worth moving the check
> into scheduler code and enabling/disabling corresponding interrupt/MSRs
> on each task switch?
> 

*YOU* are the person who needs to answer that question by providing
measurements.  Quite frankly I suspect checks in the syscall code *or*
task switching MSRs are going to be unacceptable from a performance
point of view.

	-hpa

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] x86: restrict pid namespaces to 32 or 64 bit syscalls
  2011-08-12 20:08   ` [kernel-hardening] " H. Peter Anvin
@ 2011-08-13  6:22     ` Vasiliy Kulikov
  -1 siblings, 0 replies; 48+ messages in thread
From: Vasiliy Kulikov @ 2011-08-13  6:22 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Thomas Gleixner, Ingo Molnar, James Morris, kernel-hardening,
	x86, linux-kernel, linux-security-module

On Fri, Aug 12, 2011 at 15:08 -0500, H. Peter Anvin wrote:
> On 08/12/2011 10:03 AM, Vasiliy Kulikov wrote:
> > This patch allows x86-64 systems with 32 bit syscalls support to lock a
> > pid namespace to 32 or 64 bitness syscalls/tasks.  By denying rarely
> > used compatibility syscalls it reduces an attack surface for 32 bit
> > containers.
> > 
> > The new sysctl is introduced, abi.bitness_locked.  If set to 1, it locks
> > all tasks inside of current pid namespace to the bitness of init task
> > (pid_ns->child_reaper).  After that:
> > 
> > 1) a task trying to do a syscall of other bitness would get a signal as
> > if the corresponding syscall is not enabled (IDT entry/MSR is not
> > initialized).
> > 
> > 2) loading ELF binaries of another bitness is prohibited (as if the
> > corresponding CONFIG_BINFMT_*=N).
[...]
> However, I have to question the value of this... if this is enabled in
> the system as a whole (as opposed to compiled out) it seems kind of
> pointless...

No, it is not for the system as a whole, but for containers (however,
it's possible to lock the whole system).  We use OpenVZ kernels with
multiple containers; some of them are 32 bit, some are 64 bit.  64 bit
syscalls are not needed for 32 bit containers and 32 bit syscalls are
not needed for 64 bit containers.  As needless interfaces, they
unreasonably increase the kernel attack surface.  Some compatibility 32
bit syscalls are rarely used, and sometimes they are not tested well.

In IA-64 the IA-32 compatibility support was broken for 2 years:

http://www.spinics.net/lists/linux-ia64/msg07840.html

On amd64, some specific rarely used syscalls might behave in a similar way.
Removing this attack vector is the goal of the patch.

> if there are bugs we need to deal with them anyway.

Definitely.

> > Questions/thoughts:
> > 
> > The patch adds a check in syscalls code.  Is it a significant
> > slowdown for fast syscalls?  If so, probably it worth moving the check
> > into scheduler code and enabling/disabling corresponding interrupt/MSRs
> > on each task switch?
> > 
> 
> *YOU* are the person who needs to answer that question by providing
> measurements.  Quite frankly I suspect checks in the syscall code *or*
> task switching MSRs are going to be unacceptable from a performance
> point of view.

OK, I'll do it.

Thank you,

-- 
Vasiliy Kulikov
http://www.openwall.com - bringing security into open computing environments

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [kernel-hardening] Re: [RFC] x86: restrict pid namespaces to 32 or 64 bit syscalls
@ 2011-08-13  6:22     ` Vasiliy Kulikov
  0 siblings, 0 replies; 48+ messages in thread
From: Vasiliy Kulikov @ 2011-08-13  6:22 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Thomas Gleixner, Ingo Molnar, James Morris, kernel-hardening,
	x86, linux-kernel, linux-security-module

On Fri, Aug 12, 2011 at 15:08 -0500, H. Peter Anvin wrote:
> On 08/12/2011 10:03 AM, Vasiliy Kulikov wrote:
> > This patch allows x86-64 systems with 32 bit syscalls support to lock a
> > pid namespace to 32 or 64 bitness syscalls/tasks.  By denying rarely
> > used compatibility syscalls it reduces an attack surface for 32 bit
> > containers.
> > 
> > The new sysctl is introduced, abi.bitness_locked.  If set to 1, it locks
> > all tasks inside of current pid namespace to the bitness of init task
> > (pid_ns->child_reaper).  After that:
> > 
> > 1) a task trying to do a syscall of other bitness would get a signal as
> > if the corresponding syscall is not enabled (IDT entry/MSR is not
> > initialized).
> > 
> > 2) loading ELF binaries of another bitness is prohibited (as if the
> > corresponding CONFIG_BINFMT_*=N).
[...]
> However, I have to question the value of this... if this is enabled in
> the system as a whole (as opposed to compiled out) it seems kind of
> pointless...

No, it is not for the system as a whole, but for containers (however,
it's possible to lock the whole system).  We use OpenVZ kernels with
multiple containers; some of them are 32 bit, some are 64 bit.  64 bit
syscalls are not needed for 32 bit containers and 32 bit syscalls are
not needed for 64 bit containers.  As needless interfaces, they
unreasonably increase the kernel attack surface.  Some compatibility 32
bit syscalls are rarely used, and sometimes they are not tested well.

In IA-64 the IA-32 compatibility support was broken for 2 years:

http://www.spinics.net/lists/linux-ia64/msg07840.html

On amd64, some specific rarely used syscalls might behave in a similar way.
Removing this attack vector is the goal of the patch.

> if there are bugs we need to deal with them anyway.

Definitely.

> > Questions/thoughts:
> > 
> > The patch adds a check in syscalls code.  Is it a significant
> > slowdown for fast syscalls?  If so, probably it worth moving the check
> > into scheduler code and enabling/disabling corresponding interrupt/MSRs
> > on each task switch?
> > 
> 
> *YOU* are the person who needs to answer that question by providing
> measurements.  Quite frankly I suspect checks in the syscall code *or*
> task switching MSRs are going to be unacceptable from a performance
> point of view.

OK, I'll do it.

Thank you,

-- 
Vasiliy Kulikov
http://www.openwall.com - bringing security into open computing environments

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] x86: restrict pid namespaces to 32 or 64 bit syscalls
  2011-08-13  6:22     ` [kernel-hardening] " Vasiliy Kulikov
@ 2011-08-13 15:41       ` H. Peter Anvin
  -1 siblings, 0 replies; 48+ messages in thread
From: H. Peter Anvin @ 2011-08-13 15:41 UTC (permalink / raw)
  To: Vasiliy Kulikov
  Cc: Thomas Gleixner, Ingo Molnar, James Morris, kernel-hardening,
	x86, linux-kernel, linux-security-module

Vasiliy Kulikov <segoon@openwall.com> wrote:

>On Fri, Aug 12, 2011 at 15:08 -0500, H. Peter Anvin wrote:
>> On 08/12/2011 10:03 AM, Vasiliy Kulikov wrote:
>> > This patch allows x86-64 systems with 32 bit syscalls support to
>lock a
>> > pid namespace to 32 or 64 bitness syscalls/tasks.  By denying
>rarely
>> > used compatibility syscalls it reduces an attack surface for 32 bit
>> > containers.
>> > 
>> > The new sysctl is introduced, abi.bitness_locked.  If set to 1, it
>locks
>> > all tasks inside of current pid namespace to the bitness of init
>task
>> > (pid_ns->child_reaper).  After that:
>> > 
>> > 1) a task trying to do a syscall of other bitness would get a
>signal as
>> > if the corresponding syscall is not enabled (IDT entry/MSR is not
>> > initialized).
>> > 
>> > 2) loading ELF binaries of another bitness is prohibited (as if the
>> > corresponding CONFIG_BINFMT_*=N).
>[...]
>> However, I have to question the value of this... if this is enabled
>in
>> the system as a whole (as opposed to compiled out) it seems kind of
>> pointless...
>
>No, it is not for the system as a whole, but for containers (however,
>it's possible to lock the whole system).  We use OpenVZ kernels with
>multiple containers, some of them are 32 bit, some are 64 bit.  64 bit
>syscalls are not needed for 32 bit containers and 32 bit syscalls are
>not needed for 64 bit containers.  As a needless interfaces they
>unreasonably increase the kernel attack surface.  Some compatibility 32
>bit syscalls are rarely used, sometimes they are not tested well.
>
>In IA-64 the IA-32 compatibility support was broken for 2 years:
>
>http://www.spinics.net/lists/linux-ia64/msg07840.html
>
>In amd64 some specific rarely used syscalls might behave similar way.
>Removing this attack vector is the goal of the patch.
>
>> if there are bugs we need to deal with them anyway.
>
>Definitely.
>
>> > Questions/thoughts:
>> > 
>> > The patch adds a check in syscalls code.  Is it a significant
>> > slowdown for fast syscalls?  If so, probably it worth moving the
>check
>> > into scheduler code and enabling/disabling corresponding
>interrupt/MSRs
>> > on each task switch?
>> > 
>> 
>> *YOU* are the person who needs to answer that question by providing
>> measurements.  Quite frankly I suspect checks in the syscall code
>*or*
>> task switching MSRs are going to be unacceptable from a performance
>> point of view.
>
>OK, I'll do it.
>
>Thank you,
>
>-- 
>Vasiliy Kulikov
>http://www.openwall.com - bringing security into open computing
>environments

IA64 is totally different.  I'm extremely sceptical of this patch; it feels like putting code in a super-hot path to paper over a problem that has to be fixed anyway.
-- 
Sent from my mobile phone. Please excuse my brevity and lack of formatting.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [kernel-hardening] Re: [RFC] x86: restrict pid namespaces to 32 or 64 bit syscalls
@ 2011-08-13 15:41       ` H. Peter Anvin
  0 siblings, 0 replies; 48+ messages in thread
From: H. Peter Anvin @ 2011-08-13 15:41 UTC (permalink / raw)
  To: Vasiliy Kulikov
  Cc: Thomas Gleixner, Ingo Molnar, James Morris, kernel-hardening,
	x86, linux-kernel, linux-security-module

Vasiliy Kulikov <segoon@openwall.com> wrote:

>On Fri, Aug 12, 2011 at 15:08 -0500, H. Peter Anvin wrote:
>> On 08/12/2011 10:03 AM, Vasiliy Kulikov wrote:
>> > This patch allows x86-64 systems with 32 bit syscalls support to
>lock a
>> > pid namespace to 32 or 64 bitness syscalls/tasks.  By denying
>rarely
>> > used compatibility syscalls it reduces an attack surface for 32 bit
>> > containers.
>> > 
>> > The new sysctl is introduced, abi.bitness_locked.  If set to 1, it
>locks
>> > all tasks inside of current pid namespace to the bitness of init
>task
>> > (pid_ns->child_reaper).  After that:
>> > 
>> > 1) a task trying to do a syscall of other bitness would get a
>signal as
>> > if the corresponding syscall is not enabled (IDT entry/MSR is not
>> > initialized).
>> > 
>> > 2) loading ELF binaries of another bitness is prohibited (as if the
>> > corresponding CONFIG_BINFMT_*=N).
>[...]
>> However, I have to question the value of this... if this is enabled
>in
>> the system as a whole (as opposed to compiled out) it seems kind of
>> pointless...
>
>No, it is not for the system as a whole, but for containers (however,
>it's possible to lock the whole system).  We use OpenVZ kernels with
>multiple containers, some of them are 32 bit, some are 64 bit.  64 bit
>syscalls are not needed for 32 bit containers and 32 bit syscalls are
>not needed for 64 bit containers.  As a needless interfaces they
>unreasonably increase the kernel attack surface.  Some compatibility 32
>bit syscalls are rarely used, sometimes they are not tested well.
>
>In IA-64 the IA-32 compatibility support was broken for 2 years:
>
>http://www.spinics.net/lists/linux-ia64/msg07840.html
>
>In amd64 some specific rarely used syscalls might behave similar way.
>Removing this attack vector is the goal of the patch.
>
>> if there are bugs we need to deal with them anyway.
>
>Definitely.
>
>> > Questions/thoughts:
>> > 
>> > The patch adds a check in syscalls code.  Is it a significant
>> > slowdown for fast syscalls?  If so, probably it worth moving the
>check
>> > into scheduler code and enabling/disabling corresponding
>interrupt/MSRs
>> > on each task switch?
>> > 
>> 
>> *YOU* are the person who needs to answer that question by providing
>> measurements.  Quite frankly I suspect checks in the syscall code
>*or*
>> task switching MSRs are going to be unacceptable from a performance
>> point of view.
>
>OK, I'll do it.
>
>Thank you,
>
>-- 
>Vasiliy Kulikov
>http://www.openwall.com - bringing security into open computing
>environments

IA64 is totally different.  I'm extremely sceptical of this patch; it feels like putting code in a super-hot path to paper over a problem that has to be fixed anyway.
-- 
Sent from my mobile phone. Please excuse my brevity and lack of formatting.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [kernel-hardening] Re: [RFC] x86: restrict pid namespaces to 32 or 64 bit syscalls
  2011-08-13 15:41       ` [kernel-hardening] " H. Peter Anvin
  (?)
@ 2011-08-13 16:32       ` Vasiliy Kulikov
  2011-08-14  9:09           ` [kernel-hardening] " Solar Designer
  2011-08-18 14:40           ` [kernel-hardening] " Vasiliy Kulikov
  -1 siblings, 2 replies; 48+ messages in thread
From: Vasiliy Kulikov @ 2011-08-13 16:32 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Thomas Gleixner, Ingo Molnar, James Morris, x86,
	kernel-hardening, linux-kernel, linux-security-module

On Sat, Aug 13, 2011 at 10:41 -0500, H. Peter Anvin wrote:
> IA64 is totally different.

I didn't say the whole IA-32 compatibility layer of x86 is crap, surely not.
But there is some code which is poorly tested precisely because it is
compatibility code.  One relatively recent example:

http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff;h=3e645d6b485446c54c6745c5e2cf5c528fe4deec

>  I'm extremely sceptical to this patch;
> it feels like putting code in a super-hot path to paper over a problem that has to be fixed anyway.

I'll move the check to the tracesys branch, which is not a hot path, in
the next RFC version, so this should not be a problem.

Thanks,

-- 
Vasiliy Kulikov
http://www.openwall.com - bringing security into open computing environments

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] x86: restrict pid namespaces to 32 or 64 bit syscalls
  2011-08-13 15:41       ` [kernel-hardening] " H. Peter Anvin
@ 2011-08-14  2:38         ` Andi Kleen
  -1 siblings, 0 replies; 48+ messages in thread
From: Andi Kleen @ 2011-08-14  2:38 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Vasiliy Kulikov, Thomas Gleixner, Ingo Molnar, James Morris,
	kernel-hardening, x86, linux-kernel, linux-security-module

"H. Peter Anvin" <hpa@zytor.com> writes:
>
> IA64 is totally different.  I'm extremely sceptical to this patch; it feels like putting code in a super-hot path to paper over a problem that has to be fixed anyway.

Sounds to me a better alternative would be more aggressive, pro-active
fuzzing of the compat calls.

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [kernel-hardening] Re: [RFC] x86: restrict pid namespaces to 32 or 64 bit syscalls
@ 2011-08-14  2:38         ` Andi Kleen
  0 siblings, 0 replies; 48+ messages in thread
From: Andi Kleen @ 2011-08-14  2:38 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Vasiliy Kulikov, Thomas Gleixner, Ingo Molnar, James Morris,
	kernel-hardening, x86, linux-kernel, linux-security-module

"H. Peter Anvin" <hpa@zytor.com> writes:
>
> IA64 is totally different.  I'm extremely sceptical to this patch; it feels like putting code in a super-hot path to paper over a problem that has to be fixed anyway.

Sounds to me a better alternative would be more aggressive, pro-active
fuzzing of the compat calls.

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] x86: restrict pid namespaces to 32 or 64 bit syscalls
  2011-08-14  2:38         ` [kernel-hardening] " Andi Kleen
@ 2011-08-14  5:08           ` H. Peter Anvin
  -1 siblings, 0 replies; 48+ messages in thread
From: H. Peter Anvin @ 2011-08-14  5:08 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Vasiliy Kulikov, Thomas Gleixner, Ingo Molnar, James Morris,
	kernel-hardening, x86, linux-kernel, linux-security-module

Andi Kleen <andi@firstfloor.org> wrote:

>"H. Peter Anvin" <hpa@zytor.com> writes:
>>
>> IA64 is totally different.  I'm extremely sceptical to this patch; it
>feels like putting code in a super-hot path to paper over a problem
>that has to be fixed anyway.
>
>Sounds to me a better alternative would be more aggressive, pro-active
>fuzzing of the compat calls.
>
>-Andi
>
>-- 
>ak@linux.intel.com -- Speaking for myself only

Agreed.  Other than that, I can see a fine-grained permission filter, but the compat vs noncompat axis is just spurious.
-- 
Sent from my mobile phone. Please excuse my brevity and lack of formatting.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [kernel-hardening] Re: [RFC] x86: restrict pid namespaces to 32 or 64 bit syscalls
@ 2011-08-14  5:08           ` H. Peter Anvin
  0 siblings, 0 replies; 48+ messages in thread
From: H. Peter Anvin @ 2011-08-14  5:08 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Vasiliy Kulikov, Thomas Gleixner, Ingo Molnar, James Morris,
	kernel-hardening, x86, linux-kernel, linux-security-module

Andi Kleen <andi@firstfloor.org> wrote:

>"H. Peter Anvin" <hpa@zytor.com> writes:
>>
>> IA64 is totally different.  I'm extremely sceptical to this patch; it
>feels like putting code in a super-hot path to paper over a problem
>that has to be fixed anyway.
>
>Sounds to me a better alternative would be more aggressive, pro-active
>fuzzing of the compat calls.
>
>-Andi
>
>-- 
>ak@linux.intel.com -- Speaking for myself only

Agreed.  Other than that, I can see a fine-grained permission filter, but the compat vs noncompat axis is just spurious.
-- 
Sent from my mobile phone. Please excuse my brevity and lack of formatting.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] x86: restrict pid namespaces to 32 or 64 bit syscalls
  2011-08-13 16:32       ` Vasiliy Kulikov
@ 2011-08-14  9:09           ` Solar Designer
  2011-08-18 14:40           ` [kernel-hardening] " Vasiliy Kulikov
  1 sibling, 0 replies; 48+ messages in thread
From: Solar Designer @ 2011-08-14  9:09 UTC (permalink / raw)
  To: Vasiliy Kulikov
  Cc: H. Peter Anvin, Thomas Gleixner, Ingo Molnar, James Morris, x86,
	kernel-hardening, linux-kernel, linux-security-module

On Sat, Aug 13, 2011 at 08:32:52PM +0400, Vasiliy Kulikov wrote:
> I didn't say all IA-32 compatibility layer of x86 is a crap, surely no.
> But there is some code, which is poorly tested exactly because it is
> compatibility code.  One relatively recent example:
> 
> http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff;h=3e645d6b485446c54c6745c5e2cf5c528fe4deec

Here's another one:

http://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2010-3081
https://bugzilla.redhat.com/show_bug.cgi?id=634457
https://access.redhat.com/kb/docs/DOC-40265

"The compat_alloc_user_space functions in include/asm/compat.h files in
the Linux kernel before 2.6.36-rc4-git2 on 64-bit platforms do not
properly allocate the userspace memory required for the 32-bit
compatibility layer, which allows local users to gain privileges by
leveraging the ability of the compat_mc_getsockopt function (aka the
MCAST_MSFILTER getsockopt support) to control a certain length value,
related to a "stack pointer underflow" issue, as exploited in the wild
in September 2010."

It would have been nice if this one were not exploitable from 64-bit
OpenVZ containers at the time, which, if I understand correctly, would
be the case with Vasiliy's patch (and the corresponding change to vzctl
to make use of the feature, which we're planning to make).

Similarly, it would be nice if 32-bit compat issues like this would not
be triggerable from privsep child processes of vsftpd, sshd, etc. -
those programs would need to set a flag via prctl(), which we'll add
support for.

> I'll move the check to the tracesys branch, which is not a hot path, in
> the next RFC version, so this should not be a problem.

Vasiliy is going to reuse a check (of multiple flags at once) that is
already in the code, so the change will have no performance impact for
permitted and non-traced syscalls (the case where we care about
performance).

Alexander

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [kernel-hardening] Re: [RFC] x86: restrict pid namespaces to 32 or 64 bit syscalls
@ 2011-08-14  9:09           ` Solar Designer
  0 siblings, 0 replies; 48+ messages in thread
From: Solar Designer @ 2011-08-14  9:09 UTC (permalink / raw)
  To: Vasiliy Kulikov
  Cc: H. Peter Anvin, Thomas Gleixner, Ingo Molnar, James Morris, x86,
	kernel-hardening, linux-kernel, linux-security-module

On Sat, Aug 13, 2011 at 08:32:52PM +0400, Vasiliy Kulikov wrote:
> I didn't say all IA-32 compatibility layer of x86 is a crap, surely no.
> But there is some code, which is poorly tested exactly because it is
> compatibility code.  One relatively recent example:
> 
> http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff;h=3e645d6b485446c54c6745c5e2cf5c528fe4deec

Here's another one:

http://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2010-3081
https://bugzilla.redhat.com/show_bug.cgi?id=634457
https://access.redhat.com/kb/docs/DOC-40265

"The compat_alloc_user_space functions in include/asm/compat.h files in
the Linux kernel before 2.6.36-rc4-git2 on 64-bit platforms do not
properly allocate the userspace memory required for the 32-bit
compatibility layer, which allows local users to gain privileges by
leveraging the ability of the compat_mc_getsockopt function (aka the
MCAST_MSFILTER getsockopt support) to control a certain length value,
related to a "stack pointer underflow" issue, as exploited in the wild
in September 2010."

It would have been nice if this one were not exploitable from 64-bit
OpenVZ containers at the time, which, if I understand correctly, would
be the case with Vasiliy's patch (and the corresponding change to vzctl
to make use of the feature, which we're planning to make).

Similarly, it would be nice if 32-bit compat issues like this would not
be triggerable from privsep child processes of vsftpd, sshd, etc. -
those programs would need to set a flag via prctl(), which we'll add
support for.

> I'll move the check to the tracesys branch, which is not a hot path, in
> the next RFC version, so this should not be a problem.

Vasiliy is going to reuse a check (of multiple flags at once) that is
already in the code, so the change will have no performance impact for
permitted and non-traced syscalls (the case where we care about
performance).

Alexander

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] x86: restrict pid namespaces to 32 or 64 bit syscalls
  2011-08-14  5:08           ` [kernel-hardening] " H. Peter Anvin
@ 2011-08-14  9:20             ` Solar Designer
  -1 siblings, 0 replies; 48+ messages in thread
From: Solar Designer @ 2011-08-14  9:20 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Andi Kleen, Vasiliy Kulikov, Thomas Gleixner, Ingo Molnar,
	James Morris, kernel-hardening, x86, linux-kernel,
	linux-security-module

On Sat, Aug 13, 2011 at 10:08:57PM -0700, H. Peter Anvin wrote:
> Andi Kleen <andi@firstfloor.org> wrote:
> 
> >Sounds to me a better alternative would be more aggressive, pro-active
> >fuzzing of the compat calls.
[...]
> Agreed.  Other than that, I can see a fine-grained permission filter, but the compat vs noncompat axis is just spurious.

In case anyone cares, I respectfully disagree.  I am with Vasiliy on
this.  I think that proactive fuzzing is great, but it is not an
alternative - we can also do both fuzzing and reduction of attack
surface at once.  With Vasiliy reusing an existing check (in a future
revision of the patch), there's not going to be any performance impact.
Fine-grained restrictions would be great, but the 32- vs. 64-bit
restriction makes sense to me as well.  I expect different systems to
use these different kinds of restrictions in different cases.

We will definitely want to support x32 as well.  We'd appreciate any
suggestions on how to do it best.

Thanks,

Alexander

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [kernel-hardening] Re: [RFC] x86: restrict pid namespaces to 32 or 64 bit syscalls
@ 2011-08-14  9:20             ` Solar Designer
  0 siblings, 0 replies; 48+ messages in thread
From: Solar Designer @ 2011-08-14  9:20 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Andi Kleen, Vasiliy Kulikov, Thomas Gleixner, Ingo Molnar,
	James Morris, kernel-hardening, x86, linux-kernel,
	linux-security-module

On Sat, Aug 13, 2011 at 10:08:57PM -0700, H. Peter Anvin wrote:
> Andi Kleen <andi@firstfloor.org> wrote:
> 
> >Sounds to me a better alternative would be more aggressive, pro-active
> >fuzzing of the compat calls.
[...]
> Agreed.  Other than that, I can see a fine-grained permission filter, but the compat vs noncompat axis is just spurious.

In case anyone cares, I respectfully disagree.  I am with Vasiliy on
this.  I think that proactive fuzzing is great, but it is not an
alternative - we can also do both fuzzing and reduction of attack
surface at once.  With Vasiliy reusing an existing check (in a future
revision of the patch), there's not going to be any performance impact.
Fine-grained restrictions would be great, but the 32- vs. 64-bit
restriction makes sense to me as well.  I expect different systems to
use these different kinds of restrictions in different cases.

We will definitely want to support x32 as well.  We'd appreciate any
suggestions on how to do it best.

Thanks,

Alexander

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] x86: restrict pid namespaces to 32 or 64 bit syscalls
  2011-08-14  9:20             ` [kernel-hardening] " Solar Designer
@ 2011-08-14 14:48               ` H. Peter Anvin
  -1 siblings, 0 replies; 48+ messages in thread
From: H. Peter Anvin @ 2011-08-14 14:48 UTC (permalink / raw)
  To: Solar Designer
  Cc: Andi Kleen, Vasiliy Kulikov, Thomas Gleixner, Ingo Molnar,
	James Morris, kernel-hardening, x86, linux-kernel,
	linux-security-module

Solar Designer <solar@openwall.com> wrote:

>On Sat, Aug 13, 2011 at 10:08:57PM -0700, H. Peter Anvin wrote:
>> Andi Kleen <andi@firstfloor.org> wrote:
>> 
>> >Sounds to me a better alternative would be more aggressive,
>pro-active
>> >fuzzing of the compat calls.
>[...]
>> Agreed.  Other than that, I can see a fine-grained permission filter,

>but the compat vs noncompat axis is just spurious.
>
>In case anyone cares, I respectfully disagree.  I am with Vasiliy on
>this.  I think that proactive fuzzing is great, but it is not an
>alternative - we can also do both fuzzing and reduction of attack
>surface at once.  With Vasiliy reusing an existing check (in a future
>revision of the patch), there's not going to be any performance impact.
>Fine-grained restrictions would be great, but the 32- vs. 64-bit
>restriction makes sense to me as well.  I expect different systems to
>use these different kinds of restrictions in different cases.
>
>We will definitely want to support x32 as well.  We'd appreciate any
>suggestions on how to do it best.
>
>Thanks,
>
>Alexander

i386 vs x86-64 vs x32 is just one of many axes along which syscalls can be restricted (and for that matter, one axis of backward compatibility), and it does not make sense to burden the code with ad hoc filters.  Designing a general filter facility which can be used to restrict any container to the subset of system calls it actually needs would make more sense, no?
-- 
Sent from my mobile phone. Please excuse my brevity and lack of formatting.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [kernel-hardening] Re: [RFC] x86: restrict pid namespaces to 32 or 64 bit syscalls
@ 2011-08-14 14:48               ` H. Peter Anvin
  0 siblings, 0 replies; 48+ messages in thread
From: H. Peter Anvin @ 2011-08-14 14:48 UTC (permalink / raw)
  To: Solar Designer
  Cc: Andi Kleen, Vasiliy Kulikov, Thomas Gleixner, Ingo Molnar,
	James Morris, kernel-hardening, x86, linux-kernel,
	linux-security-module

Solar Designer <solar@openwall.com> wrote:

>On Sat, Aug 13, 2011 at 10:08:57PM -0700, H. Peter Anvin wrote:
>> Andi Kleen <andi@firstfloor.org> wrote:
>> 
>> >Sounds to me a better alternative would be more aggressive,
>pro-active
>> >fuzzing of the compat calls.
>[...]
>> Agreed.  Other than that, I can see a fine-grained permission filter,

>but the compat vs noncompat axis is just spurious.
>
>In case anyone cares, I respectfully disagree.  I am with Vasiliy on
>this.  I think that proactive fuzzing is great, but it is not an
>alternative - we can also do both fuzzing and reduction of attack
>surface at once.  With Vasiliy reusing an existing check (in a future
>revision of the patch), there's not going to be any performance impact.
>Fine-grained restrictions would be great, but the 32- vs. 64-bit
>restriction makes sense to me as well.  I expect different systems to
>use these different kinds of restrictions in different cases.
>
>We will definitely want to support x32 as well.  We'd appreciate any
>suggestions on how to do it best.
>
>Thanks,
>
>Alexander

i386 vs x86-64 vs x32 is just one of many axes along which syscalls can be restricted (and for that matter, one axis of backward compatibility), and it does not make sense to burden the code with ad hoc filters.  Designing a general filter facility which can be used to restrict any container to the subset of system calls it actually needs would make more sense, no?
-- 
Sent from my mobile phone. Please excuse my brevity and lack of formatting.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] x86: restrict pid namespaces to 32 or 64 bit syscalls
  2011-08-14 14:48               ` [kernel-hardening] " H. Peter Anvin
@ 2011-08-14 15:27                 ` Andi Kleen
  -1 siblings, 0 replies; 48+ messages in thread
From: Andi Kleen @ 2011-08-14 15:27 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Solar Designer, Andi Kleen, Vasiliy Kulikov, Thomas Gleixner,
	Ingo Molnar, James Morris, kernel-hardening, x86, linux-kernel,
	linux-security-module

> i386 vs x86-64 vs x32 is just one of many axes along which syscalls can be restricted (and for that matter, one axis of backward compatibility), and it does not make sense to burden the code with ad hoc filters.  Designing a general filter facility which can be used to restrict any container to the subset of system calls it actually needs would make more sense, no?

I believe this is already in the newer versions of seccomp.

-Andi

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [kernel-hardening] Re: [RFC] x86: restrict pid namespaces to 32 or 64 bit syscalls
@ 2011-08-14 15:27                 ` Andi Kleen
  0 siblings, 0 replies; 48+ messages in thread
From: Andi Kleen @ 2011-08-14 15:27 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Solar Designer, Andi Kleen, Vasiliy Kulikov, Thomas Gleixner,
	Ingo Molnar, James Morris, kernel-hardening, x86, linux-kernel,
	linux-security-module

> i386 vs x86-64 vs x32 is just one of many axes along which syscalls can be restricted (and for that matter, one axis of backward compatibility), and it does not make sense to burden the code with ad hoc filters.  Designing a general filter facility which can be used to restrict any container to the subset of system calls it actually needs would make more sense, no?

I believe this is already in the newer versions of seccomp.

-Andi

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] x86: restrict pid namespaces to 32 or 64 bit syscalls
  2011-08-14 15:27                 ` [kernel-hardening] " Andi Kleen
@ 2011-08-14 15:36                   ` H. Peter Anvin
  -1 siblings, 0 replies; 48+ messages in thread
From: H. Peter Anvin @ 2011-08-14 15:36 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Solar Designer, Vasiliy Kulikov, Thomas Gleixner, Ingo Molnar,
	James Morris, kernel-hardening, x86, linux-kernel,
	linux-security-module

On 08/14/2011 08:27 AM, Andi Kleen wrote:
>> i386 vs x86-64 vs x32 is just one of many axes along which syscalls can be restricted (and for that matter, one axis of backward compatibility), and it does not make sense to burden the code with ad hoc filters.  Designing a general filter facility which can be used to restrict any container to the subset of system calls it actually needs would make more sense, no?
> 
> I believe this is already in the newer versions of seccomp.
> 

Last I looked seccomp still had a hardcoded list of system calls, but
perhaps I've been looking in the wrong place.  However, since that's
exactly what seccomp is -- a system call filter -- this can, and should,
be unified that way.

	-hpa



^ permalink raw reply	[flat|nested] 48+ messages in thread

* [kernel-hardening] Re: [RFC] x86: restrict pid namespaces to 32 or 64 bit syscalls
@ 2011-08-14 15:36                   ` H. Peter Anvin
  0 siblings, 0 replies; 48+ messages in thread
From: H. Peter Anvin @ 2011-08-14 15:36 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Solar Designer, Vasiliy Kulikov, Thomas Gleixner, Ingo Molnar,
	James Morris, kernel-hardening, x86, linux-kernel,
	linux-security-module

On 08/14/2011 08:27 AM, Andi Kleen wrote:
>> i386 vs x86-64 vs x32 is just one of many axes along which syscalls can be restricted (and for that matter, one axis of backward compatibility), and it does not make sense to burden the code with ad hoc filters.  Designing a general filter facility which can be used to restrict any container to the subset of system calls it actually needs would make more sense, no?
> 
> I believe this is already in the newer versions of seccomp.
> 

Last I looked seccomp still had a hardcoded list of system calls, but
perhaps I've been looking in the wrong place.  However, since that's
exactly what seccomp is -- a system call filter -- this can, and should,
be unified that way.

	-hpa

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] x86: restrict pid namespaces to 32 or 64 bit syscalls
  2011-08-14 15:27                 ` [kernel-hardening] " Andi Kleen
  (?)
@ 2011-08-14 16:08                     ` Vasiliy Kulikov
  -1 siblings, 0 replies; 48+ messages in thread
From: Vasiliy Kulikov @ 2011-08-14 16:08 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Will Drewry, kernel-hardening-ZwoEplunGu1jrUoiu81ncdBPR1lH4CV8,
	containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	x86-DgEjT+Ai2ygdnm+yROfE0A, linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-security-module-u79uwXL29TY76Z2rM5mHXA, Ingo Molnar,
	Solar Designer, H. Peter Anvin, Thomas Gleixner

(CC'ed Will Drewry, the author of new seccomp version, and 
containers list)

On Sun, Aug 14, 2011 at 17:27 +0200, Andi Kleen wrote:
> > i386 vs x86-64 vs x32 is just one of many axes along which syscalls can be restricted (and for that matter, one axis of backward compatibility), and it does not make sense to burden the code with ad hoc filters.  Designing a general filter facility which can be used to restrict any container to the subset of system calls it actually needs would make more sense, no?
> 
> I believe this is already in the newer versions of seccomp.

The "newer versions of seccomp" are NAK'ed by Ingo.  AFAIU, Ingo wants
more generic filters to filter much more than syscalls.  But it
contradicts the security by simplicity, which we're trying to achieve
with this patch.

Compatibility syscalls are much more error prone than common syscalls
as they lack good testing or sometimes lack it at all, unfortunately.
The link I've posted is about a crazy bug - a completely uninitialized
structure was used in copy_from_user() function.  The function was not
tested _at all_.  I doubt any non-compatibility syscall (ioctl()
handler, etc.) can be completely untested.

Also we already have CONFIG_IA32_EMULATION, this patch only moves the
configuration mechanism from the compilation stage to the runtime stage,
it doesn't draw the new line.  It grants the permissions to use the
feature to some containers, but denies to other containers, which is a
rather expected property of containers separation.

Thanks,

-- 
Vasiliy Kulikov
http://www.openwall.com - bringing security into open computing environments

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] x86: restrict pid namespaces to 32 or 64 bit syscalls
@ 2011-08-14 16:08                     ` Vasiliy Kulikov
  0 siblings, 0 replies; 48+ messages in thread
From: Vasiliy Kulikov @ 2011-08-14 16:08 UTC (permalink / raw)
  To: Andi Kleen
  Cc: H. Peter Anvin, Solar Designer, Thomas Gleixner, Ingo Molnar,
	James Morris, kernel-hardening, x86, linux-kernel,
	linux-security-module, Will Drewry, containers

(CC'ed Will Drewry, the author of new seccomp version, and 
containers list)

On Sun, Aug 14, 2011 at 17:27 +0200, Andi Kleen wrote:
> > i386 vs x86-64 vs x32 is just one of many axes along which syscalls can be restricted (and for that matter, one axis of backward compatibility), and it does not make sense to burden the code with ad hoc filters.  Designing a general filter facility which can be used to restrict any container to the subset of system calls it actually needs would make more sense, no?
> 
> I believe this is already in the newer versions of seccomp.

The "newer versions of seccomp" are NAK'ed by Ingo.  AFAIU, Ingo wants
more generic filters to filter much more than syscalls.  But it
contradicts the security by simplicity, which we're trying to achieve
with this patch.

Compatibility syscalls are much more error prone than common syscalls
as they lack good testing or sometimes lack it at all, unfortunately.
The link I've posted is about a crazy bug - a completely uninitialized
structure was used in copy_from_user() function.  The function was not
tested _at all_.  I doubt any non-compatibility syscall (ioctl()
handler, etc.) can be completely untested.

Also we already have CONFIG_IA32_EMULATION, this patch only moves the
configuration mechanism from the compilation stage to the runtime stage,
it doesn't draw the new line.  It grants the permissions to use the
feature to some containers, but denies to other containers, which is a
rather expected property of containers separation.

Thanks,

-- 
Vasiliy Kulikov
http://www.openwall.com - bringing security into open computing environments

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [kernel-hardening] Re: [RFC] x86: restrict pid namespaces to 32 or 64 bit syscalls
@ 2011-08-14 16:08                     ` Vasiliy Kulikov
  0 siblings, 0 replies; 48+ messages in thread
From: Vasiliy Kulikov @ 2011-08-14 16:08 UTC (permalink / raw)
  To: Andi Kleen
  Cc: H. Peter Anvin, Solar Designer, Thomas Gleixner, Ingo Molnar,
	James Morris, kernel-hardening, x86, linux-kernel,
	linux-security-module, Will Drewry, containers

(CC'ed Will Drewry, the author of new seccomp version, and 
containers list)

On Sun, Aug 14, 2011 at 17:27 +0200, Andi Kleen wrote:
> > i386 vs x86-64 vs x32 is just one of many axes along which syscalls can be restricted (and for that matter, one axis of backward compatibility), and it does not make sense to burden the code with ad hoc filters.  Designing a general filter facility which can be used to restrict any container to the subset of system calls it actually needs would make more sense, no?
> 
> I believe this is already in the newer versions of seccomp.

The "newer versions of seccomp" are NAK'ed by Ingo.  AFAIU, Ingo wants
more generic filters to filter much more than syscalls.  But it
contradicts the security by simplicity, which we're trying to achieve
with this patch.

Compatibility syscalls are much more error prone than common syscalls
as they lack good testing or sometimes lack it at all, unfortunately.
The link I've posted is about a crazy bug - a completely uninitialized
structure was used in copy_from_user() function.  The function was not
tested _at all_.  I doubt any non-compatibility syscall (ioctl()
handler, etc.) can be completely untested.

Also we already have CONFIG_IA32_EMULATION, this patch only moves the
configuration mechanism from the compilation stage to the runtime stage,
it doesn't draw the new line.  It grants the permissions to use the
feature to some containers, but denies to other containers, which is a
rather expected property of containers separation.

Thanks,

-- 
Vasiliy Kulikov
http://www.openwall.com - bringing security into open computing environments

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] x86: restrict pid namespaces to 32 or 64 bit syscalls
  2011-08-14 15:36                   ` [kernel-hardening] " H. Peter Anvin
@ 2011-08-14 23:29                     ` James Morris
  -1 siblings, 0 replies; 48+ messages in thread
From: James Morris @ 2011-08-14 23:29 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Andi Kleen, Solar Designer, Vasiliy Kulikov, Thomas Gleixner,
	Ingo Molnar, kernel-hardening, x86, linux-kernel,
	linux-security-module, Will Drewry

On Sun, 14 Aug 2011, H. Peter Anvin wrote:

> Last I looked seccomp still had a hardcoded list of system calls, but
> perhaps I've been looking in the wrong place.  However, since that's
> exactly what seccomp is -- a system call filter -- this can, and should,
> be unified that way.

Yes, we should definitely look at incorporating this into seccomp v2, 
which is still under discussion (and a topic at KS).


- James
-- 
James Morris
<jmorris@namei.org>

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [kernel-hardening] Re: [RFC] x86: restrict pid namespaces to 32 or 64 bit syscalls
@ 2011-08-14 23:29                     ` James Morris
  0 siblings, 0 replies; 48+ messages in thread
From: James Morris @ 2011-08-14 23:29 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Andi Kleen, Solar Designer, Vasiliy Kulikov, Thomas Gleixner,
	Ingo Molnar, kernel-hardening, x86, linux-kernel,
	linux-security-module, Will Drewry

On Sun, 14 Aug 2011, H. Peter Anvin wrote:

> Last I looked seccomp still had a hardcoded list of system calls, but
> perhaps I've been looking in the wrong place.  However, since that's
> exactly what seccomp is -- a system call filter -- this can, and should,
> be unified that way.

Yes, we should definitely look at incorporating this into seccomp v2, 
which is still under discussion (and a topic at KS).


- James
-- 
James Morris
<jmorris@namei.org>

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] x86: restrict pid namespaces to 32 or 64 bit syscalls
  2011-08-14 15:36                   ` [kernel-hardening] " H. Peter Anvin
@ 2011-08-15  0:18                     ` Andi Kleen
  -1 siblings, 0 replies; 48+ messages in thread
From: Andi Kleen @ 2011-08-15  0:18 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Andi Kleen, Solar Designer, Vasiliy Kulikov, Thomas Gleixner,
	Ingo Molnar, James Morris, kernel-hardening, x86, linux-kernel,
	linux-security-module

On Sun, Aug 14, 2011 at 08:36:57AM -0700, H. Peter Anvin wrote:
> On 08/14/2011 08:27 AM, Andi Kleen wrote:
> >> i386 vs x86-64 vs x32 is just one of many axes along which syscalls can be restricted (and for that matter, one axis of backward compatibility), and it does not make sense to burden the code with ad hoc filters.  Designing a general filter facility which can be used to restrict any container to the subset of system calls it actually needs would make more sense, no?
> > 
> > I believe this is already in the newer versions of seccomp.
> > 
> 
> Last I looked seccomp still had a hardcoded list of system calls, but
> perhaps I've been looking in the wrong place.  However, since that's
> exactly what seccomp is -- a system call filter -- this can, and should,
> be unified that way.

True. I guess I confused the endless l-k threads with actual code.

I guess the code was too expensive for the talk back then @)

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [kernel-hardening] Re: [RFC] x86: restrict pid namespaces to 32 or 64 bit syscalls
@ 2011-08-15  0:18                     ` Andi Kleen
  0 siblings, 0 replies; 48+ messages in thread
From: Andi Kleen @ 2011-08-15  0:18 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Andi Kleen, Solar Designer, Vasiliy Kulikov, Thomas Gleixner,
	Ingo Molnar, James Morris, kernel-hardening, x86, linux-kernel,
	linux-security-module

On Sun, Aug 14, 2011 at 08:36:57AM -0700, H. Peter Anvin wrote:
> On 08/14/2011 08:27 AM, Andi Kleen wrote:
> >> i386 vs x86-64 vs x32 is just one of many axes along which syscalls can be restricted (and for that matter, one axis of backward compatibility), and it does not make sense to burden the code with ad hoc filters.  Designing a general filter facility which can be used to restrict any container to the subset of system calls it actually needs would make more sense, no?
> > 
> > I believe this is already in the newer versions of seccomp.
> > 
> 
> Last I looked seccomp still had a hardcoded list of system calls, but
> perhaps I've been looking in the wrong place.  However, since that's
> exactly what seccomp is -- a system call filter -- this can, and should,
> be unified that way.

True. I guess I confused the endless l-k threads with actual code.

I guess the code was too expensive for the talk back then @)

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [kernel-hardening] Re: [RFC] x86: restrict pid namespaces to 32 or 64 bit syscalls
  2011-08-15  0:18                     ` [kernel-hardening] " Andi Kleen
  (?)
@ 2011-08-15  0:32                     ` Will Drewry
  2011-08-15  0:58                       ` Andi Kleen
  -1 siblings, 1 reply; 48+ messages in thread
From: Will Drewry @ 2011-08-15  0:32 UTC (permalink / raw)
  To: kernel-hardening
  Cc: H. Peter Anvin, Andi Kleen, Solar Designer, Vasiliy Kulikov,
	Thomas Gleixner, Ingo Molnar, James Morris, x86, linux-kernel,
	linux-security-module

On Sun, Aug 14, 2011 at 7:18 PM, Andi Kleen <andi@firstfloor.org> wrote:
> On Sun, Aug 14, 2011 at 08:36:57AM -0700, H. Peter Anvin wrote:
>> On 08/14/2011 08:27 AM, Andi Kleen wrote:
>> >> i386 vs x86-64 vs x32 is just one of many axes along which syscalls can be restricted (and for that matter, one axis of backward compatibility), and it does not make sense to burden the code with ad hoc filters.  Designing a general filter facility which can be used to restrict any container to the subset of system calls it actually needs would make more sense, no?
>> >
>> > I believe this is already in the newer versions of seccomp.
>> >
>>
>> Last I looked seccomp still had a hardcoded list of system calls, but
>> perhaps I've been looking in the wrong place.  However, since that's
>> exactly what seccomp is -- a system call filter -- this can, and should,
>> be unified that way.
>
> True. I guess I confused the endless l-k threads with actual code.
>
> I guess the code was too expensive for the talk back then @)

Perhaps :) I wish it had landed after 9 revisions and at least two
variant patches. Despite that, I think it's great to pull in
additional requirements, like COMPAT locking, to make sure that the
solution is really a good one.  It may also be that my entire original
approach was wrong and should be revisited too.  Everyone's comments
here and the proposed patch itself certainly have me thinking.

cheers!
will

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [kernel-hardening] Re: [RFC] x86: restrict pid namespaces to 32 or 64 bit syscalls
  2011-08-15  0:32                     ` Will Drewry
@ 2011-08-15  0:58                       ` Andi Kleen
  0 siblings, 0 replies; 48+ messages in thread
From: Andi Kleen @ 2011-08-15  0:58 UTC (permalink / raw)
  To: Will Drewry
  Cc: kernel-hardening, H. Peter Anvin, Andi Kleen, Solar Designer,
	Vasiliy Kulikov, Thomas Gleixner, Ingo Molnar, James Morris, x86,
	linux-kernel, linux-security-module

> Perhaps :) I wish it had landed after 9 revisions and at least two
> variant patches. Despite that, I think it's great to pull in
> additional requirements, like COMPAT locking, to make sure that the
> solution is really a good one.  It may also be that my entire original
> approach was wrong and should be revisited too.  Everyone's comments
> here and the proposed patch itself certainly have me thinking.

I didn't see anything wrong with it. Also the first try doesn't 
need to be perfect anyways, it can be always changed later.
 
How about you just repost it?

-Andi

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] x86: restrict pid namespaces to 32 or 64 bit syscalls
  2011-08-14 14:48               ` [kernel-hardening] " H. Peter Anvin
@ 2011-08-15 18:51                 ` Solar Designer
  -1 siblings, 0 replies; 48+ messages in thread
From: Solar Designer @ 2011-08-15 18:51 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Andi Kleen, Vasiliy Kulikov, Thomas Gleixner, Ingo Molnar,
	James Morris, kernel-hardening, x86, linux-kernel,
	linux-security-module, Will Drewry

On Sun, Aug 14, 2011 at 07:48:51AM -0700, H. Peter Anvin wrote:
> i386 vs x86-64 vs x32 is just one of many axes along which syscalls can be restricted (and for that matter, one axis of backward compatibility), and it does not make sense to burden the code with ad hoc filters.  Designing a general filter facility which can be used to restrict any container to the subset of system calls it actually needs would make more sense, no?

I agree with you that i386 vs x86-64 vs x32 is one axis and syscall
number is another axis.  I'd like to be able to set up restrictions on
both.  So I support both Vasiliy's patch (a future revision of it; his
RFC posting was just to get the discussion started) and Will's seccomp
patch (maybe with further changes for inheritance on fork and execve).

On specific systems I (co-)administer, I have immediate need for the 32-
vs. 64-bit restrictions.  These are easy to put to use, with changes
only to the kernel (Vasiliy's patch) and to the vzctl program (read a
setting from a per-container config file, make the right prctl() call).

Per-syscall restrictions are also useful, but primarily at a different
level - I'd expect them to be used in specific programs, such as Chrome
and vsftpd.  Those programs may also want to limit themselves to a
certain type of syscalls (that is, on the i386 vs x86-64 vs x32 axis),
thereby making use of both features at once.  Or they might even have to
do that, depending on how we implement the syscall restrictions.

Per your suggestion, if I understand correctly, any task that wants to
restrict itself on the i386 vs x86-64 vs x32 axis will have TIF_SECCOMP
set and will incur calls into __secure_computing().  This is unnecessary
overhead for the case when we have a restriction over this axis only,
without per-syscall restrictions.  Vasiliy's patch avoids such overhead.

Alexander

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [kernel-hardening] Re: [RFC] x86: restrict pid namespaces to 32 or 64 bit syscalls
@ 2011-08-15 18:51                 ` Solar Designer
  0 siblings, 0 replies; 48+ messages in thread
From: Solar Designer @ 2011-08-15 18:51 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Andi Kleen, Vasiliy Kulikov, Thomas Gleixner, Ingo Molnar,
	James Morris, kernel-hardening, x86, linux-kernel,
	linux-security-module, Will Drewry

On Sun, Aug 14, 2011 at 07:48:51AM -0700, H. Peter Anvin wrote:
> i386 vs x86-64 vs x32 is just one of many axes along which syscalls can be restricted (and for that matter, one axis of backward compatibility), and it does not make sense to burden the code with ad hoc filters.  Designing a general filter facility which can be used to restrict any container to the subset of system calls it actually needs would make more sense, no?

I agree with you that i386 vs x86-64 vs x32 is one axis and syscall
number is another axis.  I'd like to be able to set up restrictions on
both.  So I support both Vasiliy's patch (a future revision of it; his
RFC posting was just to get the discussion started) and Will's seccomp
patch (maybe with further changes for inheritance on fork and execve).

On specific systems I (co-)administer, I have immediate need for the 32-
vs. 64-bit restrictions.  These are easy to put to use, with changes
only to the kernel (Vasiliy's patch) and to the vzctl program (read a
setting from a per-container config file, make the right prctl() call).

Per-syscall restrictions are also useful, but primarily at a different
level - I'd expect them to be used in specific programs, such as Chrome
and vsftpd.  Those programs may also want to limit themselves to a
certain type of syscalls (that is, on the i386 vs x86-64 vs x32 axis),
thereby making use of both features at once.  Or they might even have to
do that, depending on how we implement the syscall restrictions.

Per your suggestion, if I understand correctly, any task that wants to
restrict itself on the i386 vs x86-64 vs x32 axis will have TIF_SECCOMP
set and will incur calls into __secure_computing().  This is unnecessary
overhead for the case when we have a restriction over this axis only,
without per-syscall restrictions.  Vasiliy's patch avoids such overhead.

Alexander

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] x86: restrict pid namespaces to 32 or 64 bit syscalls
  2011-08-15 18:51                 ` [kernel-hardening] " Solar Designer
@ 2011-08-15 18:59                   ` H. Peter Anvin
  -1 siblings, 0 replies; 48+ messages in thread
From: H. Peter Anvin @ 2011-08-15 18:59 UTC (permalink / raw)
  To: Solar Designer
  Cc: Andi Kleen, Vasiliy Kulikov, Thomas Gleixner, Ingo Molnar,
	James Morris, kernel-hardening, x86, linux-kernel,
	linux-security-module, Will Drewry

On 08/15/2011 11:51 AM, Solar Designer wrote:
> I agree with you that i386 vs x86-64 vs x32 is one axis and syscall
> number is another axis.

They are not.  ABI is ONE SUBSET OF SYSCALL NUMBERS.

> Per-syscall restrictions are also useful, but primarily at a different
> level - I'd expect them to be used in specific programs, such as Chrome
> and vsftpd.  Those programs may also want to limit themselves to a
> certain type of syscalls (that is, on the i386 vs x86-64 vs x32 axis),
> thereby making use of both features at once.  Or they might even have to
> do that, depending on how we implement the syscall restrictions.
> 
> Per your suggestion, if I understand correctly, any task that wants to
> restrict itself on the i386 vs x86-64 vs x32 axis will have TIF_SECCOMP
> set and will incur calls into __secure_computing().  This is unnecessary
> overhead for the case when we have a restriction over this axis only,
> without per-syscall restrictions.  Vasiliy's patch avoids such overhead.

There is really no bloody difference between i386 vs x86-64 and, say,
sys_oldstat versus sys_stat, or anything else along those lines.
Putting in a bunch of ad hoc facilities because of semi-plausible
performance wins rather than building a sane filtering facility which
can be optimized as a single path is ridiculous.

	-hpa

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [kernel-hardening] Re: [RFC] x86: restrict pid namespaces to 32 or 64 bit syscalls
@ 2011-08-15 18:59                   ` H. Peter Anvin
  0 siblings, 0 replies; 48+ messages in thread
From: H. Peter Anvin @ 2011-08-15 18:59 UTC (permalink / raw)
  To: Solar Designer
  Cc: Andi Kleen, Vasiliy Kulikov, Thomas Gleixner, Ingo Molnar,
	James Morris, kernel-hardening, x86, linux-kernel,
	linux-security-module, Will Drewry

On 08/15/2011 11:51 AM, Solar Designer wrote:
> I agree with you that i386 vs x86-64 vs x32 is one axis and syscall
> number is another axis.

They are not.  ABI is ONE SUBSET OF SYSCALL NUMBERS.

> Per-syscall restrictions are also useful, but primarily at a different
> level - I'd expect them to be used in specific programs, such as Chrome
> and vsftpd.  Those programs may also want to limit themselves to a
> certain type of syscalls (that is, on the i386 vs x86-64 vs x32 axis),
> thereby making use of both features at once.  Or they might even have to
> do that, depending on how we implement the syscall restrictions.
> 
> Per your suggestion, if I understand correctly, any task that wants to
> restrict itself on the i386 vs x86-64 vs x32 axis will have TIF_SECCOMP
> set and will incur calls into __secure_computing().  This is unnecessary
> overhead for the case when we have a restriction over this axis only,
> without per-syscall restrictions.  Vasiliy's patch avoids such overhead.

There is really no bloody difference between i386 vs x86-64 and, say,
sys_oldstat versus sys_stat, or anything else along those lines.
Putting in a bunch of ad hoc facilities because of semi-plausible
performance wins rather than building a sane filtering facility which
can be optimized as a single path is ridiculous.

	-hpa

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] x86: restrict pid namespaces to 32 or 64 bit syscalls
  2011-08-15 18:59                   ` [kernel-hardening] " H. Peter Anvin
@ 2011-08-15 20:14                     ` Solar Designer
  -1 siblings, 0 replies; 48+ messages in thread
From: Solar Designer @ 2011-08-15 20:14 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Andi Kleen, Vasiliy Kulikov, Thomas Gleixner, Ingo Molnar,
	James Morris, kernel-hardening, x86, linux-kernel,
	linux-security-module, Will Drewry

On Mon, Aug 15, 2011 at 11:59:56AM -0700, H. Peter Anvin wrote:
> There is really no bloody difference between i386 vs x86-64 and, say,
> sys_oldstat versus sys_stat, or anything else along those lines.

There is a difference from a sysadmin standpoint: a sysadmin knows that
certain containers have Linux distro userlands for i386 and certain
others for x86-64, so he/she can configure things accordingly.  Even if
a customer using one of those containers installs extra software
packages, this extra software will work just fine as long as it's for
the same ABI.  The same doesn't hold true for sys_oldstat versus
sys_stat, etc.

> Putting in a bunch of ad hoc facilities because of semi-plausible
> performance wins rather than building a sane filtering facility which
> can be optimized as a single path is ridiculous.

I don't mind having a general filtering facility if it gets accepted
into the kernel (somehow Will's patch is not applied yet), and I don't
mind optimizing it to the point where it's not any slower for the "all
syscalls permitted but not all ABIs are" case.  I suspect that the
result of such optimizations will be similar to having these things
implemented separately, though - but I could be wrong.

So how do we proceed from here?  Start by getting Will's patch applied?

Alexander

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [kernel-hardening] Re: [RFC] x86: restrict pid namespaces to 32 or 64 bit syscalls
@ 2011-08-15 20:14                     ` Solar Designer
  0 siblings, 0 replies; 48+ messages in thread
From: Solar Designer @ 2011-08-15 20:14 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Andi Kleen, Vasiliy Kulikov, Thomas Gleixner, Ingo Molnar,
	James Morris, kernel-hardening, x86, linux-kernel,
	linux-security-module, Will Drewry

On Mon, Aug 15, 2011 at 11:59:56AM -0700, H. Peter Anvin wrote:
> There is really no bloody difference between i386 vs x86-64 and, say,
> sys_oldstat versus sys_stat, or anything else along those lines.

There is a difference from a sysadmin standpoint: a sysadmin knows that
certain containers have Linux distro userlands for i386 and certain
others for x86-64, so he/she can configure things accordingly.  Even if
a customer using one of those containers installs extra software
packages, this extra software will work just fine as long as it's for
the same ABI.  The same doesn't hold true for sys_oldstat versus
sys_stat, etc.

> Putting in a bunch of ad hoc facilities because of semi-plausible
> performance wins rather than building a sane filtering facility which
> can be optimized as a single path is ridiculous.

I don't mind having a general filtering facility if it gets accepted
into the kernel (somehow Will's patch is not applied yet), and I don't
mind optimizing it to the point where it's not any slower for the "all
syscalls permitted but not all ABIs are" case.  I suspect that the
result of such optimizations will be similar to having these things
implemented separately, though - but I could be wrong.

So how do we proceed from here?  Start by getting Will's patch applied?

Alexander

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] x86: restrict pid namespaces to 32 or 64 bit syscalls
  2011-08-15 20:14                     ` [kernel-hardening] " Solar Designer
@ 2011-08-15 20:27                       ` Andi Kleen
  -1 siblings, 0 replies; 48+ messages in thread
From: Andi Kleen @ 2011-08-15 20:27 UTC (permalink / raw)
  To: Solar Designer
  Cc: H. Peter Anvin, Andi Kleen, Vasiliy Kulikov, Thomas Gleixner,
	Ingo Molnar, James Morris, kernel-hardening, x86, linux-kernel,
	linux-security-module, Will Drewry

> Start by getting Will's patch applied?

That's it.

-Andi

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [kernel-hardening] Re: [RFC] x86: restrict pid namespaces to 32 or 64 bit syscalls
@ 2011-08-15 20:27                       ` Andi Kleen
  0 siblings, 0 replies; 48+ messages in thread
From: Andi Kleen @ 2011-08-15 20:27 UTC (permalink / raw)
  To: Solar Designer
  Cc: H. Peter Anvin, Andi Kleen, Vasiliy Kulikov, Thomas Gleixner,
	Ingo Molnar, James Morris, kernel-hardening, x86, linux-kernel,
	linux-security-module, Will Drewry

> Start by getting Will's patch applied?

That's it.

-Andi

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] x86: restrict pid namespaces to 32 or 64 bit syscalls
  2011-08-15 20:14                     ` [kernel-hardening] " Solar Designer
@ 2011-08-15 20:48                       ` H. Peter Anvin
  -1 siblings, 0 replies; 48+ messages in thread
From: H. Peter Anvin @ 2011-08-15 20:48 UTC (permalink / raw)
  To: Solar Designer
  Cc: Andi Kleen, Vasiliy Kulikov, Thomas Gleixner, Ingo Molnar,
	James Morris, kernel-hardening, x86, linux-kernel,
	linux-security-module, Will Drewry

On 08/15/2011 01:14 PM, Solar Designer wrote:
> 
> There is a difference from a sysadmin standpoint
>

Sysadmin differences don't belong in the kernel interfaces or internals;
they are packaging.

	-hpa


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [kernel-hardening] Re: [RFC] x86: restrict pid namespaces to 32 or 64 bit syscalls
@ 2011-08-15 20:48                       ` H. Peter Anvin
  0 siblings, 0 replies; 48+ messages in thread
From: H. Peter Anvin @ 2011-08-15 20:48 UTC (permalink / raw)
  To: Solar Designer
  Cc: Andi Kleen, Vasiliy Kulikov, Thomas Gleixner, Ingo Molnar,
	James Morris, kernel-hardening, x86, linux-kernel,
	linux-security-module, Will Drewry

On 08/15/2011 01:14 PM, Solar Designer wrote:
> 
> There is a difference from a sysadmin standpoint
>

Sysadmin differences don't belong in the kernel interfaces or internals;
they are packaging.

	-hpa

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] x86: restrict pid namespaces to 32 or 64 bit syscalls
  2011-08-15 20:14                     ` [kernel-hardening] " Solar Designer
@ 2011-08-15 22:13                       ` Eric Paris
  -1 siblings, 0 replies; 48+ messages in thread
From: Eric Paris @ 2011-08-15 22:13 UTC (permalink / raw)
  To: Solar Designer
  Cc: H. Peter Anvin, Andi Kleen, Vasiliy Kulikov, Thomas Gleixner,
	Ingo Molnar, James Morris, kernel-hardening, x86, linux-kernel,
	linux-security-module, Will Drewry

On Mon, Aug 15, 2011 at 4:14 PM, Solar Designer <solar@openwall.com> wrote:

> So how do we proceed from here?  Start by getting Will's patch applied?

How to move forward with Will's patch is supposed to be discussed at
the invite only kernel summit October 23 - 25, 2011.  As one of the
many people who have written a replacement for seccomp I wished I had
been invited to participate, but was not.  Will is in a hard place
because Ingo insists he take his patch in one direction while tglx and
Steven Rostedt have both explicitly NAKd such a direction.  Hopefully
when those 3 sit down in person a solution can be found and we can get
some traction on a useful seccomp interface, but it's still a while
out....

-Eric

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [kernel-hardening] Re: [RFC] x86: restrict pid namespaces to 32 or 64 bit syscalls
@ 2011-08-15 22:13                       ` Eric Paris
  0 siblings, 0 replies; 48+ messages in thread
From: Eric Paris @ 2011-08-15 22:13 UTC (permalink / raw)
  To: Solar Designer
  Cc: H. Peter Anvin, Andi Kleen, Vasiliy Kulikov, Thomas Gleixner,
	Ingo Molnar, James Morris, kernel-hardening, x86, linux-kernel,
	linux-security-module, Will Drewry

On Mon, Aug 15, 2011 at 4:14 PM, Solar Designer <solar@openwall.com> wrote:

> So how do we proceed from here?  Start by getting Will's patch applied?

How to move forward with Will's patch is supposed to be discussed at
the invite only kernel summit October 23 - 25, 2011.  As one of the
many people who have written a replacement for seccomp I wished I had
been invited to participate, but was not.  Will is in a hard place
because Ingo insists he take his patch in one direction while tglx and
Steven Rostedt have both explicitly NAKd such a direction.  Hopefully
when those 3 sit down in person a solution can be found and we can get
some traction on a useful seccomp interface, but it's still a while
out....

-Eric

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] x86: restrict pid namespaces to 32 or 64 bit syscalls
  2011-08-15 22:13                       ` [kernel-hardening] " Eric Paris
@ 2011-08-16  1:18                         ` Andi Kleen
  -1 siblings, 0 replies; 48+ messages in thread
From: Andi Kleen @ 2011-08-16  1:18 UTC (permalink / raw)
  To: Eric Paris
  Cc: Solar Designer, H. Peter Anvin, Andi Kleen, Vasiliy Kulikov,
	Thomas Gleixner, Ingo Molnar, James Morris, kernel-hardening,
	x86, linux-kernel, linux-security-module, Will Drewry

On Mon, Aug 15, 2011 at 06:13:51PM -0400, Eric Paris wrote:
> On Mon, Aug 15, 2011 at 4:14 PM, Solar Designer <solar@openwall.com> wrote:
> 
> > So how do we proceed from here?  Start by getting Will's patch applied?
> 
> How to move forward with Will's patch is supposed to be discussed at
> the invite only kernel summit October 23 - 25, 2011.  As one of the

linux-kernel is really the primary forum. If you want to discuss
something just discuss it here (ideally with patches)

May the best patch win.

-Andi

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [kernel-hardening] Re: [RFC] x86: restrict pid namespaces to 32 or 64 bit syscalls
@ 2011-08-16  1:18                         ` Andi Kleen
  0 siblings, 0 replies; 48+ messages in thread
From: Andi Kleen @ 2011-08-16  1:18 UTC (permalink / raw)
  To: Eric Paris
  Cc: Solar Designer, H. Peter Anvin, Andi Kleen, Vasiliy Kulikov,
	Thomas Gleixner, Ingo Molnar, James Morris, kernel-hardening,
	x86, linux-kernel, linux-security-module, Will Drewry

On Mon, Aug 15, 2011 at 06:13:51PM -0400, Eric Paris wrote:
> On Mon, Aug 15, 2011 at 4:14 PM, Solar Designer <solar@openwall.com> wrote:
> 
> > So how do we proceed from here?  Start by getting Will's patch applied?
> 
> How to move forward with Will's patch is supposed to be discussed at
> the invite only kernel summit October 23 - 25, 2011.  As one of the

linux-kernel is really the primary forum. If you want to discuss
something just discuss it here (ideally with patches)

May the best patch win.

-Andi

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [RFC v2] x86: restrict pid namespaces to 32 or 64 bit syscalls
  2011-08-13 16:32       ` Vasiliy Kulikov
@ 2011-08-18 14:40           ` Vasiliy Kulikov
  2011-08-18 14:40           ` [kernel-hardening] " Vasiliy Kulikov
  1 sibling, 0 replies; 48+ messages in thread
From: Vasiliy Kulikov @ 2011-08-18 14:40 UTC (permalink / raw)
  To: kernel-hardening
  Cc: Thomas Gleixner, Ingo Molnar, James Morris, x86, H. Peter Anvin,
	linux-kernel, linux-security-module, Will Drewry

Hi,

In case someone is still interested in the patch - here it is.
The slowdown issue is solved in this version.  Given that it
significantly correlates with the seccomp patch, I don't expect it to be
applied.  However, if anybody wants to discuss it - I don't mind :)

================================
Subject: [RFC v2] x86: restrict pid namespaces to 32 or 64 bit syscalls
From: Vasiliy Kulikov <segoon@openwall.com>

This patch allows x86-64 systems with 32 bit syscalls support to lock a
task to 32 or 64 bitness syscalls/tasks.  By denying rarely used
compatibility syscalls it reduces an attack surface for untrusted 
containers.

Two new prctl() commands are introduced: PR_BITNESS_LOCK_ON_EXEC and
PR_BITNESS_LOCK.  PR_BITNESS_LOCK immediately locks the current task to
the current bitness.  The restriction is inherited via fork() and cannot
be removed.  PR_BITNESS_LOCK_ON_EXEC locks the task on the next
execve() call.  It's possible to limit the next execve() to the bitness
of the executed binary or to the specific bitness.  If the specified
bitness differs from the binary bitness, execve() fails.  The flag is
cleared if execve() fails.

After the task is locked to some bitness, (1) any syscall of the other
bitness causes SIGKILL to be sent to the task and (2) loading ELF binaries
of the other bitness (or non-ELF binfmts, except scripts) is prohibited
(as if the corresponding CONFIG_BINFMT_*=N).

v2 - Changed interface from sysctl locking a pid namespace to prctl()
     locking current task.
   - Used _TIF_WORK_SYSCALL_ENTRY macros to remove a slowdown of not
     locked tasks.

--
 arch/x86/ia32/ia32entry.S          |   10 +-
 arch/x86/include/asm/elf.h         |    6 +
 arch/x86/include/asm/ptrace.h      |    6 +
 arch/x86/include/asm/thread_info.h |   36 ++++++-
 arch/x86/kernel/Makefile           |    1 +
 arch/x86/kernel/entry_64.S         |    4 +-
 arch/x86/kernel/ptrace.c           |   11 ++-
 arch/x86/kernel/syscall_restrict.c |  200 ++++++++++++++++++++++++++++++++++++
 fs/binfmt_elf.c                    |    7 +-
 fs/binfmt_script.c                 |    2 +-
 fs/exec.c                          |   13 +++
 include/linux/prctl.h              |    3 +
 include/linux/sched.h              |    5 +
 kernel/sys.c                       |    6 +-
 14 files changed, 295 insertions(+), 15 deletions(-)

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index a0e866d..c7960f3 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -152,7 +152,7 @@ ENTRY(ia32_sysenter_target)
  	.previous	
 	GET_THREAD_INFO(%r10)
 	orl    $TS_COMPAT,TI_status(%r10)
-	testl  $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
+	testl  $_TIF_WORK_SYSCALL_ENTRY_32,TI_flags(%r10)
 	CFI_REMEMBER_STATE
 	jnz  sysenter_tracesys
 	cmpq	$(IA32_NR_syscalls-1),%rax
@@ -238,7 +238,7 @@ sysexit_audit:
 
 sysenter_tracesys:
 #ifdef CONFIG_AUDITSYSCALL
-	testl	$(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
+	testl	$(_TIF_WORK_SYSCALL_ENTRY_32 & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
 	jz	sysenter_auditsys
 #endif
 	SAVE_REST
@@ -311,7 +311,7 @@ ENTRY(ia32_cstar_target)
 	.previous	
 	GET_THREAD_INFO(%r10)
 	orl   $TS_COMPAT,TI_status(%r10)
-	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
+	testl $_TIF_WORK_SYSCALL_ENTRY_32,TI_flags(%r10)
 	CFI_REMEMBER_STATE
 	jnz   cstar_tracesys
 	cmpq $IA32_NR_syscalls-1,%rax
@@ -355,7 +355,7 @@ sysretl_audit:
 
 cstar_tracesys:
 #ifdef CONFIG_AUDITSYSCALL
-	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
+	testl $(_TIF_WORK_SYSCALL_ENTRY_32 & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
 	jz cstar_auditsys
 #endif
 	xchgl %r9d,%ebp
@@ -422,7 +422,7 @@ ENTRY(ia32_syscall)
 	SAVE_ARGS 0,1,0
 	GET_THREAD_INFO(%r10)
 	orl   $TS_COMPAT,TI_status(%r10)
-	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
+	testl $_TIF_WORK_SYSCALL_ENTRY_32,TI_flags(%r10)
 	jnz ia32_tracesys
 	cmpq $(IA32_NR_syscalls-1),%rax
 	ja ia32_badsys
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index f2ad216..b292580 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -320,4 +320,10 @@ extern int syscall32_setup_pages(struct linux_binprm *, int exstack);
 extern unsigned long arch_randomize_brk(struct mm_struct *mm);
 #define arch_randomize_brk arch_randomize_brk
 
+extern void arch_post_exec_elf(int retval, int elf_class);
+#define arch_post_exec_elf arch_post_exec_elf
+
+extern void arch_post_execve(void);
+#define arch_post_execve arch_post_execve
+
 #endif /* _ASM_X86_ELF_H */
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 94e7618..e95986e 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -281,6 +281,12 @@ extern int do_get_thread_area(struct task_struct *p, int idx,
 extern int do_set_thread_area(struct task_struct *p, int idx,
 			      struct user_desc __user *info, int can_allocate);
 
+extern int syscall_bitness_check(void);
+
+extern long arch_prctl(int option, unsigned long arg2, unsigned long arg3,
+		unsigned long arg4, unsigned long arg5);
+#define arch_prctl arch_prctl
+
 #endif /* __KERNEL__ */
 
 #endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index a1fe5c1..69e8f68 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -95,6 +95,8 @@ struct thread_info {
 #define TIF_BLOCKSTEP		25	/* set when we want DEBUGCTLMSR_BTF */
 #define TIF_LAZY_MMU_UPDATES	27	/* task is updating the mmu lazily */
 #define TIF_SYSCALL_TRACEPOINT	28	/* syscall tracepoint instrumentation */
+#define TIF_SYSCALL32_DENIED	29	/* 32 bit syscalls are denied */
+#define TIF_SYSCALL64_DENIED	30	/* 64 bit syscalls are denied */
 
 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
@@ -117,11 +119,20 @@ struct thread_info {
 #define _TIF_BLOCKSTEP		(1 << TIF_BLOCKSTEP)
 #define _TIF_LAZY_MMU_UPDATES	(1 << TIF_LAZY_MMU_UPDATES)
 #define _TIF_SYSCALL_TRACEPOINT	(1 << TIF_SYSCALL_TRACEPOINT)
+#define _TIF_SYSCALL32_DENIED	(1 << TIF_SYSCALL32_DENIED)
+#define _TIF_SYSCALL64_DENIED	(1 << TIF_SYSCALL64_DENIED)
 
 /* work to do in syscall_trace_enter() */
-#define _TIF_WORK_SYSCALL_ENTRY	\
+#define _TIF_WORK_SYSCALL_ENTRY_64	\
 	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_AUDIT |	\
-	 _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT)
+	 _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT |	\
+	 _TIF_SYSCALL64_DENIED)
+
+/* work to do in syscall_trace_enter() */
+#define _TIF_WORK_SYSCALL_ENTRY_32	\
+	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_AUDIT |	\
+	 _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT |	\
+	 _TIF_SYSCALL32_DENIED)
 
 /* work to do in syscall_trace_leave() */
 #define _TIF_WORK_SYSCALL_EXIT	\
@@ -259,12 +270,29 @@ static inline void set_restore_sigmask(void)
 	ti->status |= TS_RESTORE_SIGMASK;
 	set_bit(TIF_SIGPENDING, (unsigned long *)&ti->flags);
 }
-#endif	/* !__ASSEMBLY__ */
 
-#ifndef __ASSEMBLY__
+#ifdef CONFIG_IA32_EMULATION
+
+extern void arch_post_fork(struct task_struct *task);
+#define arch_post_fork arch_post_fork
+
+#define BITNESS_LOCK_32 1
+#define BITNESS_LOCK_64 2
+
+struct bitness_lock_on_exec {
+	int lock;
+};
+#define bitness_lock_on_exec bitness_lock_on_exec
+
+#endif /* CONFIG_IA32_EMULATION */
+
 extern void arch_task_cache_init(void);
 extern void free_thread_info(struct thread_info *ti);
 extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src);
 #define arch_task_cache_init arch_task_cache_init
+
+struct linux_binfmt;
+extern bool arch_check_interpreter(struct linux_binfmt *fmt);
+#define arch_check_interpreter arch_check_interpreter
 #endif
 #endif /* _ASM_X86_THREAD_INFO_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 0410557..a200ff3 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -86,6 +86,7 @@ obj-$(CONFIG_DOUBLEFAULT) 	+= doublefault_32.o
 obj-$(CONFIG_KGDB)		+= kgdb.o
 obj-$(CONFIG_VM86)		+= vm86_32.o
 obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o
+obj-$(CONFIG_SYSCTL)		+= syscall_restrict.o
 
 obj-$(CONFIG_HPET_TIMER) 	+= hpet.o
 obj-$(CONFIG_APB_TIMER)		+= apb_timer.o
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index e13329d..77534b7 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -474,7 +474,7 @@ ENTRY(system_call_after_swapgs)
 	movq  %rcx,RIP-ARGOFFSET(%rsp)
 	CFI_REL_OFFSET rip,RIP-ARGOFFSET
 	GET_THREAD_INFO(%rcx)
-	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx)
+	testl $_TIF_WORK_SYSCALL_ENTRY_64,TI_flags(%rcx)
 	jnz tracesys
 system_call_fastpath:
 	cmpq $__NR_syscall_max,%rax
@@ -578,7 +578,7 @@ sysret_audit:
 	/* Do syscall tracing */
 tracesys:
 #ifdef CONFIG_AUDITSYSCALL
-	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
+	testl $(_TIF_WORK_SYSCALL_ENTRY_64 & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
 	jz auditsys
 #endif
 	SAVE_REST
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 8252879..39d0a85 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1378,7 +1378,16 @@ long syscall_trace_enter(struct pt_regs *regs)
 	if (test_thread_flag(TIF_SINGLESTEP))
 		regs->flags |= X86_EFLAGS_TF;
 
-	/* do the secure computing check first */
+	/*
+	 * Do the syscall bitness check first.
+	 *
+	 * If the bitness is denied, exit immediately to reduce
+	 * the size of executed code.
+	 */
+	if (syscall_bitness_check())
+		return -1L;
+
+	/* Then check the syscall number. */
 	secure_computing(regs->orig_ax);
 
 	if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
diff --git a/arch/x86/kernel/syscall_restrict.c b/arch/x86/kernel/syscall_restrict.c
new file mode 100644
index 0000000..6001b3a
--- /dev/null
+++ b/arch/x86/kernel/syscall_restrict.c
@@ -0,0 +1,200 @@
+#include <linux/thread_info.h>
+#include <linux/pid_namespace.h>
+#include <linux/sysctl.h>
+#include <linux/kprobes.h>
+#include <linux/ratelimit.h>
+#include <linux/printk.h>
+#include <linux/kdebug.h>
+#include <linux/elf.h>
+#include <linux/prctl.h>
+#include <linux/binfmts.h>
+#include <asm/kdebug.h>
+#include <asm/compat.h>
+
+#ifdef CONFIG_IA32_EMULATION
+
+int syscall_bitness_check(void)
+{
+	int flag;
+
+	if (is_compat_task())
+		flag = TIF_SYSCALL32_DENIED;
+	else
+		flag = TIF_SYSCALL64_DENIED;
+
+	if (test_thread_flag(flag)) {
+		pr_err_ratelimited("%s[%d]: attempt to do a syscall of denied "
+			"bitness\n", current->comm, task_pid_nr(current));
+		force_sig(SIGKILL, current);
+		return -1;
+	}
+
+	return 0;
+}
+
+static int bits_to_flags(int bits)
+{
+	switch (bits) {
+	case 32:
+		return TIF_SYSCALL64_DENIED;
+	case 64:
+		return TIF_SYSCALL32_DENIED;
+	default:
+		return -EINVAL;
+	}
+}
+
+static int __bitness_lock(int bits)
+{
+	int clear_bit_nr = bits_to_flags(bits);
+
+	if (clear_bit_nr < 0)
+		return clear_bit_nr;
+
+	set_tsk_thread_flag(current, clear_bit_nr);
+	return 0;
+}
+
+static int bitness_set_lock_on_exec(int bits, int val)
+{
+	int mask;
+
+	switch (bits) {
+	case 32:
+		mask = BITNESS_LOCK_32;
+		break;
+	case 64:
+		mask = BITNESS_LOCK_64;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	current->bitness_lock_on_exec.lock &= ~mask;
+	if (val)
+		current->bitness_lock_on_exec.lock |= mask;
+
+	return 0;
+}
+
+static bool elf_may_exec(void)
+{
+	if (test_thread_flag(TIF_SYSCALL64_DENIED))
+		return false;
+
+	/* Either we don't want to be locked or
+	 * we're ok to lock to the ELF's bitness */
+	if (current->bitness_lock_on_exec.lock == 0 ||
+	    current->bitness_lock_on_exec.lock & BITNESS_LOCK_64)
+		return true;
+
+	return false;
+}
+
+bool compat_elf_may_exec(void)
+{
+	if (test_thread_flag(TIF_SYSCALL32_DENIED))
+		return false;
+
+	/* Either we don't want to be locked or
+	 * we're ok to lock to the ELF's bitness */
+	if (current->bitness_lock_on_exec.lock == 0 ||
+	    current->bitness_lock_on_exec.lock & BITNESS_LOCK_32)
+		return true;
+
+	return false;
+}
+
+extern struct linux_binfmt elf_format, compat_elf_format, script_format;
+
+bool arch_check_interpreter(struct linux_binfmt *fmt)
+{
+	if (fmt == &compat_elf_format)
+		return compat_elf_may_exec();
+
+	if (fmt == &elf_format)
+		return elf_may_exec();
+
+	/* We're ok with loading script, which interpreter is legitimate ELF */
+	if (fmt == &script_format)
+		return true;
+
+	if (current->bitness_lock_on_exec.lock == 0)
+		return true;
+
+	return false;
+}
+
+/*
+ * We cannot do it in arch_post_exec_elf() as it can be called from
+ * binfmt_script's handler, which may fail. If the call sequence is
+ *
+ *  binfmt_script -> binfmt_elf => FAIL
+ *  binfmt_elf => OK
+ *
+ * We should be locked.  To keep code simple, we just clear
+ * ->bitness_lock_on_exec.lock on each execve() regardless of return code.
+ */
+void arch_post_execve(void)
+{
+	current->bitness_lock_on_exec.lock = 0;
+}
+
+void arch_post_exec_elf(int retval, int elf_class)
+{
+	if (retval == 0 && current->bitness_lock_on_exec.lock) {
+		int bits = (elf_class == ELFCLASS32) ? 32 : 64;
+		__bitness_lock(bits);
+	}
+}
+
+#else /* CONFIG_IA32_EMULATION */
+
+bool arch_check_interpreter(struct linux_binfmt *fmt) { return true; }
+void arch_post_execve(void) {}
+void arch_post_exec_elf(int retval, int elf_class) {}
+
+static int bitness_set_lock_on_exec(int bits, int val)
+{
+	if (bits == 64)
+		return 0;
+	else
+		return -EINVAL;
+}
+
+static int __bitness_lock(int bits)
+{
+	if (bits == 64)
+		return 0;
+	else
+		return -EINVAL;
+}
+
+#endif /* CONFIG_IA32_EMULATION */
+
+int current_bitness(void)
+{
+#ifdef CONFIG_IA32_EMULATION
+	if (test_thread_flag(TIF_IA32))
+		return 32;
+	else
+#endif
+		return 64;
+}
+
+long arch_prctl(int option, unsigned long arg2, unsigned long arg3,
+		unsigned long arg4, unsigned long arg5)
+{
+	switch (option) {
+	case PR_BITNESS_LOCK_ON_EXEC:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EACCES;
+		return bitness_set_lock_on_exec(arg3, !!arg2);
+	case PR_BITNESS_LOCK:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EACCES;
+		return __bitness_lock(current_bitness());
+	default:
+		return -EINVAL;
+	}
+}
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index dd0fdfc..41a86fb 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -65,7 +65,7 @@ static int elf_core_dump(struct coredump_params *cprm);
 #define ELF_PAGEOFFSET(_v) ((_v) & (ELF_MIN_ALIGN-1))
 #define ELF_PAGEALIGN(_v) (((_v) + ELF_MIN_ALIGN - 1) & ~(ELF_MIN_ALIGN - 1))
 
-static struct linux_binfmt elf_format = {
+struct linux_binfmt elf_format = {
 	.module		= THIS_MODULE,
 	.load_binary	= load_elf_binary,
 	.load_shlib	= load_elf_library,
@@ -556,6 +556,10 @@ static unsigned long randomize_stack_top(unsigned long stack_top)
 #endif
 }
 
+#ifndef arch_post_exec_elf
+#define arch_post_exec_elf(rc, class)
+#endif
+
 static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 {
 	struct file *interpreter = NULL; /* to shut gcc up */
@@ -979,6 +983,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 out:
 	kfree(loc);
 out_ret:
+	arch_post_exec_elf(retval, ELF_CLASS);
 	return retval;
 
 	/* error cleanup */
diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index 396a988..be0e4c5 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -98,7 +98,7 @@ static int load_script(struct linux_binprm *bprm,struct pt_regs *regs)
 	return search_binary_handler(bprm,regs);
 }
 
-static struct linux_binfmt script_format = {
+struct linux_binfmt script_format = {
 	.module		= THIS_MODULE,
 	.load_binary	= load_script,
 };
diff --git a/fs/exec.c b/fs/exec.c
index da80612..784c48a 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1390,6 +1390,8 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
 			int (*fn)(struct linux_binprm *, struct pt_regs *) = fmt->load_binary;
 			if (!fn)
 				continue;
+			if (!arch_check_interpreter(fmt))
+				continue;
 			if (!try_module_get(fmt->module))
 				continue;
 			read_unlock(&binfmt_lock);
@@ -1441,11 +1443,20 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
 		break;
 #endif
 	}
+
 	return retval;
 }
 
 EXPORT_SYMBOL(search_binary_handler);
 
+#ifndef arch_post_execve
+#define arch_post_execve()
+#endif
+
+#ifndef arch_check_interpreter
+#define arch_check_interpreter(x) true
+#endif
+
 /*
  * sys_execve() executes a new program.
  */
@@ -1527,6 +1538,7 @@ static int do_execve_common(const char *filename,
 	current->fs->in_exec = 0;
 	current->in_execve = 0;
 	acct_update_integrals(current);
+	arch_post_execve();
 	free_bprm(bprm);
 	if (displaced)
 		put_files_struct(displaced);
@@ -1556,6 +1568,7 @@ out_files:
 	if (displaced)
 		reset_files_struct(displaced);
 out_ret:
+	arch_post_execve();
 	return retval;
 }
 
diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index a3baeb2..91edb9c 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -102,4 +102,7 @@
 
 #define PR_MCE_KILL_GET 34
 
+#define PR_BITNESS_LOCK_ON_EXEC	35
+#define PR_BITNESS_LOCK		36
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 20b03bf..ee5ba82 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1046,6 +1046,10 @@ partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
 }
 #endif	/* !CONFIG_SMP */
 
+#ifndef bitness_lock_on_exec
+struct bitness_lock_on_exec {};
+#define bitness_lock_on_exec bitness_lock_on_exec
+#endif /* bitness_lock_on_exec */
 
 struct io_context;			/* See blkdev.h */
 
@@ -1405,6 +1409,7 @@ struct task_struct {
 	unsigned int sessionid;
 #endif
 	seccomp_t seccomp;
+	struct bitness_lock_on_exec bitness_lock_on_exec;
 
 /* Thread group tracking */
    	u32 parent_exec_id;
diff --git a/kernel/sys.c b/kernel/sys.c
index a101ba3..e7faa8b 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1644,6 +1644,10 @@ SYSCALL_DEFINE1(umask, int, mask)
 	return mask;
 }
 
+#ifndef arch_prctl
+#define arch_prctl(o, a2, a3, a4, a5) (-EINVAL)
+#endif
+
 SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 		unsigned long, arg4, unsigned long, arg5)
 {
@@ -1793,7 +1797,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 				error = PR_MCE_KILL_DEFAULT;
 			break;
 		default:
-			error = -EINVAL;
+			error = arch_prctl(option, arg2, arg3, arg4, arg5);
 			break;
 	}
 	return error;

-- 
Vasiliy Kulikov
http://www.openwall.com - bringing security into open computing environments

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [kernel-hardening] [RFC v2] x86: restrict pid namespaces to 32 or 64 bit syscalls
@ 2011-08-18 14:40           ` Vasiliy Kulikov
  0 siblings, 0 replies; 48+ messages in thread
From: Vasiliy Kulikov @ 2011-08-18 14:40 UTC (permalink / raw)
  To: kernel-hardening
  Cc: Thomas Gleixner, Ingo Molnar, James Morris, x86, H. Peter Anvin,
	linux-kernel, linux-security-module, Will Drewry

Hi,

In case someone is still interested in the patch - here it is.
The slowdown issue is solved in this version.  Given that it
significantly correlates with the seccomp patch, I don't expect it to be
applied.  However, if anybody wants to discuss it - I don't mind :)

================================
Subject: [RFC v2] x86: restrict pid namespaces to 32 or 64 bit syscalls
From: Vasiliy Kulikov <segoon@openwall.com>

This patch allows x86-64 systems with 32 bit syscalls support to lock a
task to 32 or 64 bitness syscalls/tasks.  By denying rarely used
compatibility syscalls it reduces an attack surface for untrusted 
containers.

Two new prctl() commands are introduced: PR_BITNESS_LOCK_ON_EXEC and
PR_BITNESS_LOCK.  PR_BITNESS_LOCK immediately locks the current task to
the current bitness.  The restriction is inherited via fork() and cannot
be removed.  PR_BITNESS_LOCK_ON_EXEC locks the task on the next
execve() call.  It's possible to limit the next execve() to the bitness
of the executed binary or to the specific bitness.  If the specified
bitness differs from the binary bitness, execve() fails.  The flag is
cleared if execve() fails.

After the task is locked to some bitness, (1) any syscall of the other
bitness causes SIGKILL to be sent to the task and (2) loading ELF binaries
of the other bitness (or non-ELF binfmts, except scripts) is prohibited
(as if the corresponding CONFIG_BINFMT_*=N).

v2 - Changed interface from sysctl locking a pid namespace to prctl()
     locking current task.
   - Used _TIF_WORK_SYSCALL_ENTRY macros to remove a slowdown of not
     locked tasks.

--
 arch/x86/ia32/ia32entry.S          |   10 +-
 arch/x86/include/asm/elf.h         |    6 +
 arch/x86/include/asm/ptrace.h      |    6 +
 arch/x86/include/asm/thread_info.h |   36 ++++++-
 arch/x86/kernel/Makefile           |    1 +
 arch/x86/kernel/entry_64.S         |    4 +-
 arch/x86/kernel/ptrace.c           |   11 ++-
 arch/x86/kernel/syscall_restrict.c |  200 ++++++++++++++++++++++++++++++++++++
 fs/binfmt_elf.c                    |    7 +-
 fs/binfmt_script.c                 |    2 +-
 fs/exec.c                          |   13 +++
 include/linux/prctl.h              |    3 +
 include/linux/sched.h              |    5 +
 kernel/sys.c                       |    6 +-
 14 files changed, 295 insertions(+), 15 deletions(-)

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index a0e866d..c7960f3 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -152,7 +152,7 @@ ENTRY(ia32_sysenter_target)
  	.previous	
 	GET_THREAD_INFO(%r10)
 	orl    $TS_COMPAT,TI_status(%r10)
-	testl  $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
+	testl  $_TIF_WORK_SYSCALL_ENTRY_32,TI_flags(%r10)
 	CFI_REMEMBER_STATE
 	jnz  sysenter_tracesys
 	cmpq	$(IA32_NR_syscalls-1),%rax
@@ -238,7 +238,7 @@ sysexit_audit:
 
 sysenter_tracesys:
 #ifdef CONFIG_AUDITSYSCALL
-	testl	$(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
+	testl	$(_TIF_WORK_SYSCALL_ENTRY_32 & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
 	jz	sysenter_auditsys
 #endif
 	SAVE_REST
@@ -311,7 +311,7 @@ ENTRY(ia32_cstar_target)
 	.previous	
 	GET_THREAD_INFO(%r10)
 	orl   $TS_COMPAT,TI_status(%r10)
-	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
+	testl $_TIF_WORK_SYSCALL_ENTRY_32,TI_flags(%r10)
 	CFI_REMEMBER_STATE
 	jnz   cstar_tracesys
 	cmpq $IA32_NR_syscalls-1,%rax
@@ -355,7 +355,7 @@ sysretl_audit:
 
 cstar_tracesys:
 #ifdef CONFIG_AUDITSYSCALL
-	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
+	testl $(_TIF_WORK_SYSCALL_ENTRY_32 & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
 	jz cstar_auditsys
 #endif
 	xchgl %r9d,%ebp
@@ -422,7 +422,7 @@ ENTRY(ia32_syscall)
 	SAVE_ARGS 0,1,0
 	GET_THREAD_INFO(%r10)
 	orl   $TS_COMPAT,TI_status(%r10)
-	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
+	testl $_TIF_WORK_SYSCALL_ENTRY_32,TI_flags(%r10)
 	jnz ia32_tracesys
 	cmpq $(IA32_NR_syscalls-1),%rax
 	ja ia32_badsys
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index f2ad216..b292580 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -320,4 +320,10 @@ extern int syscall32_setup_pages(struct linux_binprm *, int exstack);
 extern unsigned long arch_randomize_brk(struct mm_struct *mm);
 #define arch_randomize_brk arch_randomize_brk
 
+extern void arch_post_exec_elf(int retval, int elf_class);
+#define arch_post_exec_elf arch_post_exec_elf
+
+extern void arch_post_execve(void);
+#define arch_post_execve arch_post_execve
+
 #endif /* _ASM_X86_ELF_H */
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 94e7618..e95986e 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -281,6 +281,12 @@ extern int do_get_thread_area(struct task_struct *p, int idx,
 extern int do_set_thread_area(struct task_struct *p, int idx,
 			      struct user_desc __user *info, int can_allocate);
 
+extern int syscall_bitness_check(void);
+
+extern long arch_prctl(int option, unsigned long arg2, unsigned long arg3,
+		unsigned long arg4, unsigned long arg5);
+#define arch_prctl arch_prctl
+
 #endif /* __KERNEL__ */
 
 #endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index a1fe5c1..69e8f68 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -95,6 +95,8 @@ struct thread_info {
 #define TIF_BLOCKSTEP		25	/* set when we want DEBUGCTLMSR_BTF */
 #define TIF_LAZY_MMU_UPDATES	27	/* task is updating the mmu lazily */
 #define TIF_SYSCALL_TRACEPOINT	28	/* syscall tracepoint instrumentation */
+#define TIF_SYSCALL32_DENIED	29	/* 32 bit syscalls are denied */
+#define TIF_SYSCALL64_DENIED	30	/* 64 bit syscalls are denied */
 
 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
@@ -117,11 +119,20 @@ struct thread_info {
 #define _TIF_BLOCKSTEP		(1 << TIF_BLOCKSTEP)
 #define _TIF_LAZY_MMU_UPDATES	(1 << TIF_LAZY_MMU_UPDATES)
 #define _TIF_SYSCALL_TRACEPOINT	(1 << TIF_SYSCALL_TRACEPOINT)
+#define _TIF_SYSCALL32_DENIED	(1 << TIF_SYSCALL32_DENIED)
+#define _TIF_SYSCALL64_DENIED	(1 << TIF_SYSCALL64_DENIED)
 
 /* work to do in syscall_trace_enter() */
-#define _TIF_WORK_SYSCALL_ENTRY	\
+#define _TIF_WORK_SYSCALL_ENTRY_64	\
 	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_AUDIT |	\
-	 _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT)
+	 _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT |	\
+	 _TIF_SYSCALL64_DENIED)
+
+/* work to do in syscall_trace_enter() for 32 bit syscalls */
+#define _TIF_WORK_SYSCALL_ENTRY_32	\
+	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_AUDIT |	\
+	 _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT |	\
+	 _TIF_SYSCALL32_DENIED)
 
 /* work to do in syscall_trace_leave() */
 #define _TIF_WORK_SYSCALL_EXIT	\
@@ -259,12 +270,29 @@ static inline void set_restore_sigmask(void)
 	ti->status |= TS_RESTORE_SIGMASK;
 	set_bit(TIF_SIGPENDING, (unsigned long *)&ti->flags);
 }
-#endif	/* !__ASSEMBLY__ */
 
-#ifndef __ASSEMBLY__
+#ifdef CONFIG_IA32_EMULATION
+
+extern void arch_post_fork(struct task_struct *task);
+#define arch_post_fork arch_post_fork
+
+#define BITNESS_LOCK_32 1
+#define BITNESS_LOCK_64 2
+
+struct bitness_lock_on_exec {
+	int lock;
+};
+#define bitness_lock_on_exec bitness_lock_on_exec
+
+#endif /* CONFIG_IA32_EMULATION */
+
 extern void arch_task_cache_init(void);
 extern void free_thread_info(struct thread_info *ti);
 extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src);
 #define arch_task_cache_init arch_task_cache_init
+
+struct linux_binfmt;
+extern bool arch_check_interpreter(struct linux_binfmt *fmt);
+#define arch_check_interpreter arch_check_interpreter
 #endif
 #endif /* _ASM_X86_THREAD_INFO_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 0410557..a200ff3 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -86,6 +86,7 @@ obj-$(CONFIG_DOUBLEFAULT) 	+= doublefault_32.o
 obj-$(CONFIG_KGDB)		+= kgdb.o
 obj-$(CONFIG_VM86)		+= vm86_32.o
 obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o
+obj-$(CONFIG_SYSCTL)		+= syscall_restrict.o
 
 obj-$(CONFIG_HPET_TIMER) 	+= hpet.o
 obj-$(CONFIG_APB_TIMER)		+= apb_timer.o
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index e13329d..77534b7 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -474,7 +474,7 @@ ENTRY(system_call_after_swapgs)
 	movq  %rcx,RIP-ARGOFFSET(%rsp)
 	CFI_REL_OFFSET rip,RIP-ARGOFFSET
 	GET_THREAD_INFO(%rcx)
-	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx)
+	testl $_TIF_WORK_SYSCALL_ENTRY_64,TI_flags(%rcx)
 	jnz tracesys
 system_call_fastpath:
 	cmpq $__NR_syscall_max,%rax
@@ -578,7 +578,7 @@ sysret_audit:
 	/* Do syscall tracing */
 tracesys:
 #ifdef CONFIG_AUDITSYSCALL
-	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
+	testl $(_TIF_WORK_SYSCALL_ENTRY_64 & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
 	jz auditsys
 #endif
 	SAVE_REST
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 8252879..39d0a85 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1378,7 +1378,16 @@ long syscall_trace_enter(struct pt_regs *regs)
 	if (test_thread_flag(TIF_SINGLESTEP))
 		regs->flags |= X86_EFLAGS_TF;
 
-	/* do the secure computing check first */
+	/*
+	 * Do the syscall bitness check first.
+	 *
+	 * If the bitness is denied, exit immediately to reduce
+	 * the size of executed code.
+	 */
+	if (syscall_bitness_check())
+		return -1L;
+
+	/* Then check the syscall number. */
 	secure_computing(regs->orig_ax);
 
 	if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
diff --git a/arch/x86/kernel/syscall_restrict.c b/arch/x86/kernel/syscall_restrict.c
new file mode 100644
index 0000000..6001b3a
--- /dev/null
+++ b/arch/x86/kernel/syscall_restrict.c
@@ -0,0 +1,200 @@
+#include <linux/thread_info.h>
+#include <linux/pid_namespace.h>
+#include <linux/sysctl.h>
+#include <linux/kprobes.h>
+#include <linux/ratelimit.h>
+#include <linux/printk.h>
+#include <linux/kdebug.h>
+#include <linux/elf.h>
+#include <linux/prctl.h>
+#include <linux/binfmts.h>
+#include <asm/kdebug.h>
+#include <asm/compat.h>
+
+#ifdef CONFIG_IA32_EMULATION
+
+int syscall_bitness_check(void)
+{
+	int flag;
+
+	if (is_compat_task())
+		flag = TIF_SYSCALL32_DENIED;
+	else
+		flag = TIF_SYSCALL64_DENIED;
+
+	if (test_thread_flag(flag)) {
+		pr_err_ratelimited("%s[%d]: attempt to do a syscall of denied "
+			"bitness\n", current->comm, task_pid_nr(current));
+		force_sig(SIGKILL, current);
+		return -1;
+	}
+
+	return 0;
+}
+
+static int bits_to_flags(int bits)
+{
+	switch (bits) {
+	case 32:
+		return TIF_SYSCALL64_DENIED;
+	case 64:
+		return TIF_SYSCALL32_DENIED;
+	default:
+		return -EINVAL;
+	}
+}
+
+static int __bitness_lock(int bits)
+{
+	int clear_bit_nr = bits_to_flags(bits);
+
+	if (clear_bit_nr < 0)
+		return clear_bit_nr;
+
+	set_tsk_thread_flag(current, clear_bit_nr);
+	return 0;
+}
+
+static int bitness_set_lock_on_exec(int bits, int val)
+{
+	int mask;
+
+	switch (bits) {
+	case 32:
+		mask = BITNESS_LOCK_32;
+		break;
+	case 64:
+		mask = BITNESS_LOCK_64;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	current->bitness_lock_on_exec.lock &= ~mask;
+	if (val)
+		current->bitness_lock_on_exec.lock |= mask;
+
+	return 0;
+}
+
+static bool elf_may_exec(void)
+{
+	if (test_thread_flag(TIF_SYSCALL64_DENIED))
+		return false;
+
+	/* Either we don't want to be locked or
+	 * we're ok to lock to the ELF's bitness */
+	if (current->bitness_lock_on_exec.lock == 0 ||
+	    current->bitness_lock_on_exec.lock & BITNESS_LOCK_64)
+		return true;
+
+	return false;
+}
+
+bool compat_elf_may_exec(void)
+{
+	if (test_thread_flag(TIF_SYSCALL32_DENIED))
+		return false;
+
+	/* Either we don't want to be locked or
+	 * we're ok to lock to the ELF's bitness */
+	if (current->bitness_lock_on_exec.lock == 0 ||
+	    current->bitness_lock_on_exec.lock & BITNESS_LOCK_32)
+		return true;
+
+	return false;
+}
+
+extern struct linux_binfmt elf_format, compat_elf_format, script_format;
+
+bool arch_check_interpreter(struct linux_binfmt *fmt)
+{
+	if (fmt == &compat_elf_format)
+		return compat_elf_may_exec();
+
+	if (fmt == &elf_format)
+		return elf_may_exec();
+
+	/* Scripts are ok as long as their interpreter is a legitimate ELF */
+	if (fmt == &script_format)
+		return true;
+
+	if (current->bitness_lock_on_exec.lock == 0)
+		return true;
+
+	return false;
+}
+
+/*
+ * We cannot do it in arch_post_exec_elf() as it can be called from
+ * binfmt_script's handler, which may fail. If the call sequence is
+ *
+ *  binfmt_script -> binfmt_elf => FAIL
+ *  binfmt_elf => OK
+ *
+ * the task should still be locked.  To keep the code simple, we just clear
+ * ->bitness_lock_on_exec.lock on each execve() regardless of the return code.
+ */
+void arch_post_execve(void)
+{
+	current->bitness_lock_on_exec.lock = 0;
+}
+
+void arch_post_exec_elf(int retval, int elf_class)
+{
+	if (retval == 0 && current->bitness_lock_on_exec.lock) {
+		int bits = (elf_class == ELFCLASS32) ? 32 : 64;
+		__bitness_lock(bits);
+	}
+}
+
+#else /* CONFIG_IA32_EMULATION */
+
+bool arch_check_interpreter(struct linux_binfmt *fmt) { return true; }
+void arch_post_execve(void) {}
+void arch_post_exec_elf(int retval, int elf_class) {}
+
+static int bitness_set_lock_on_exec(int bits, int val)
+{
+	if (bits == 64)
+		return 0;
+	else
+		return -EINVAL;
+}
+
+static int __bitness_lock(int bits)
+{
+	if (bits == 64)
+		return 0;
+	else
+		return -EINVAL;
+}
+
+#endif /* CONFIG_IA32_EMULATION */
+
+int current_bitness(void)
+{
+#ifdef CONFIG_IA32_EMULATION
+	if (test_thread_flag(TIF_IA32))
+		return 32;
+	else
+#endif
+		return 64;
+}
+
+long arch_prctl(int option, unsigned long arg2, unsigned long arg3,
+		unsigned long arg4, unsigned long arg5)
+{
+	switch (option) {
+	case PR_BITNESS_LOCK_ON_EXEC:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EACCES;
+		return bitness_set_lock_on_exec(arg3, !!arg2);
+	case PR_BITNESS_LOCK:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EACCES;
+		return __bitness_lock(current_bitness());
+	default:
+		return -EINVAL;
+	}
+}
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index dd0fdfc..41a86fb 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -65,7 +65,7 @@ static int elf_core_dump(struct coredump_params *cprm);
 #define ELF_PAGEOFFSET(_v) ((_v) & (ELF_MIN_ALIGN-1))
 #define ELF_PAGEALIGN(_v) (((_v) + ELF_MIN_ALIGN - 1) & ~(ELF_MIN_ALIGN - 1))
 
-static struct linux_binfmt elf_format = {
+struct linux_binfmt elf_format = {
 	.module		= THIS_MODULE,
 	.load_binary	= load_elf_binary,
 	.load_shlib	= load_elf_library,
@@ -556,6 +556,10 @@ static unsigned long randomize_stack_top(unsigned long stack_top)
 #endif
 }
 
+#ifndef arch_post_exec_elf
+#define arch_post_exec_elf(rc, class)
+#endif
+
 static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 {
 	struct file *interpreter = NULL; /* to shut gcc up */
@@ -979,6 +983,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 out:
 	kfree(loc);
 out_ret:
+	arch_post_exec_elf(retval, ELF_CLASS);
 	return retval;
 
 	/* error cleanup */
diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index 396a988..be0e4c5 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -98,7 +98,7 @@ static int load_script(struct linux_binprm *bprm,struct pt_regs *regs)
 	return search_binary_handler(bprm,regs);
 }
 
-static struct linux_binfmt script_format = {
+struct linux_binfmt script_format = {
 	.module		= THIS_MODULE,
 	.load_binary	= load_script,
 };
diff --git a/fs/exec.c b/fs/exec.c
index da80612..784c48a 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1390,6 +1390,8 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
 			int (*fn)(struct linux_binprm *, struct pt_regs *) = fmt->load_binary;
 			if (!fn)
 				continue;
+			if (!arch_check_interpreter(fmt))
+				continue;
 			if (!try_module_get(fmt->module))
 				continue;
 			read_unlock(&binfmt_lock);
@@ -1441,11 +1443,20 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
 		break;
 #endif
 	}
+
 	return retval;
 }
 
 EXPORT_SYMBOL(search_binary_handler);
 
+#ifndef arch_post_execve
+#define arch_post_execve()
+#endif
+
+#ifndef arch_check_interpreter
+#define arch_check_interpreter(x) true
+#endif
+
 /*
  * sys_execve() executes a new program.
  */
@@ -1527,6 +1538,7 @@ static int do_execve_common(const char *filename,
 	current->fs->in_exec = 0;
 	current->in_execve = 0;
 	acct_update_integrals(current);
+	arch_post_execve();
 	free_bprm(bprm);
 	if (displaced)
 		put_files_struct(displaced);
@@ -1556,6 +1568,7 @@ out_files:
 	if (displaced)
 		reset_files_struct(displaced);
 out_ret:
+	arch_post_execve();
 	return retval;
 }
 
diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index a3baeb2..91edb9c 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -102,4 +102,7 @@
 
 #define PR_MCE_KILL_GET 34
 
+#define PR_BITNESS_LOCK_ON_EXEC	35
+#define PR_BITNESS_LOCK		36
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 20b03bf..ee5ba82 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1046,6 +1046,10 @@ partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
 }
 #endif	/* !CONFIG_SMP */
 
+#ifndef bitness_lock_on_exec
+struct bitness_lock_on_exec {};
+#define bitness_lock_on_exec bitness_lock_on_exec
+#endif /* bitness_lock_on_exec */
 
 struct io_context;			/* See blkdev.h */
 
@@ -1405,6 +1409,7 @@ struct task_struct {
 	unsigned int sessionid;
 #endif
 	seccomp_t seccomp;
+	struct bitness_lock_on_exec bitness_lock_on_exec;
 
 /* Thread group tracking */
    	u32 parent_exec_id;
diff --git a/kernel/sys.c b/kernel/sys.c
index a101ba3..e7faa8b 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1644,6 +1644,10 @@ SYSCALL_DEFINE1(umask, int, mask)
 	return mask;
 }
 
+#ifndef arch_prctl
+#define arch_prctl(o, a2, a3, a4, a5) (-EINVAL)
+#endif
+
 SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 		unsigned long, arg4, unsigned long, arg5)
 {
@@ -1793,7 +1797,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 				error = PR_MCE_KILL_DEFAULT;
 			break;
 		default:
-			error = -EINVAL;
+			error = arch_prctl(option, arg2, arg3, arg4, arg5);
 			break;
 	}
 	return error;

-- 
Vasiliy Kulikov
http://www.openwall.com - bringing security into open computing environments

^ permalink raw reply	[flat|nested] 48+ messages in thread

end of thread, other threads:[~2011-08-18 14:41 UTC | newest]

Thread overview: 48+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-08-12 15:03 [RFC] x86: restrict pid namespaces to 32 or 64 bit syscalls Vasiliy Kulikov
2011-08-12 15:03 ` [kernel-hardening] " Vasiliy Kulikov
2011-08-12 20:08 ` H. Peter Anvin
2011-08-12 20:08   ` [kernel-hardening] " H. Peter Anvin
2011-08-13  6:22   ` Vasiliy Kulikov
2011-08-13  6:22     ` [kernel-hardening] " Vasiliy Kulikov
2011-08-13 15:41     ` H. Peter Anvin
2011-08-13 15:41       ` [kernel-hardening] " H. Peter Anvin
2011-08-13 16:32       ` Vasiliy Kulikov
2011-08-14  9:09         ` Solar Designer
2011-08-14  9:09           ` [kernel-hardening] " Solar Designer
2011-08-18 14:40         ` [RFC v2] " Vasiliy Kulikov
2011-08-18 14:40           ` [kernel-hardening] " Vasiliy Kulikov
2011-08-14  2:38       ` [RFC] " Andi Kleen
2011-08-14  2:38         ` [kernel-hardening] " Andi Kleen
2011-08-14  5:08         ` H. Peter Anvin
2011-08-14  5:08           ` [kernel-hardening] " H. Peter Anvin
2011-08-14  9:20           ` Solar Designer
2011-08-14  9:20             ` [kernel-hardening] " Solar Designer
2011-08-14 14:48             ` H. Peter Anvin
2011-08-14 14:48               ` [kernel-hardening] " H. Peter Anvin
2011-08-14 15:27               ` Andi Kleen
2011-08-14 15:27                 ` [kernel-hardening] " Andi Kleen
2011-08-14 15:36                 ` H. Peter Anvin
2011-08-14 15:36                   ` [kernel-hardening] " H. Peter Anvin
2011-08-14 23:29                   ` James Morris
2011-08-14 23:29                     ` [kernel-hardening] " James Morris
2011-08-15  0:18                   ` Andi Kleen
2011-08-15  0:18                     ` [kernel-hardening] " Andi Kleen
2011-08-15  0:32                     ` Will Drewry
2011-08-15  0:58                       ` Andi Kleen
     [not found]                 ` <20110814152729.GU5782-qrUzlfsMFqo/4alezvVtWx2eb7JE58TQ@public.gmane.org>
2011-08-14 16:08                   ` Vasiliy Kulikov
2011-08-14 16:08                     ` [kernel-hardening] " Vasiliy Kulikov
2011-08-14 16:08                     ` Vasiliy Kulikov
2011-08-15 18:51               ` Solar Designer
2011-08-15 18:51                 ` [kernel-hardening] " Solar Designer
2011-08-15 18:59                 ` H. Peter Anvin
2011-08-15 18:59                   ` [kernel-hardening] " H. Peter Anvin
2011-08-15 20:14                   ` Solar Designer
2011-08-15 20:14                     ` [kernel-hardening] " Solar Designer
2011-08-15 20:27                     ` Andi Kleen
2011-08-15 20:27                       ` [kernel-hardening] " Andi Kleen
2011-08-15 20:48                     ` H. Peter Anvin
2011-08-15 20:48                       ` [kernel-hardening] " H. Peter Anvin
2011-08-15 22:13                     ` Eric Paris
2011-08-15 22:13                       ` [kernel-hardening] " Eric Paris
2011-08-16  1:18                       ` Andi Kleen
2011-08-16  1:18                         ` [kernel-hardening] " Andi Kleen

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.