[kernel-hardening] 32/64 bitness restriction for pid namespace

All of lore.kernel.org
 help / color / mirror / Atom feed

* [kernel-hardening] 32/64 bitness restriction for pid namespace
@ 2011-08-07 11:00 Vasiliy Kulikov
  2011-08-08 17:39 ` [kernel-hardening] " Vasiliy Kulikov
  0 siblings, 1 reply; 32+ messages in thread
From: Vasiliy Kulikov @ 2011-08-07 11:00 UTC (permalink / raw)
  To: kernel-hardening; +Cc: Will Drewry

Solar, Will, all -

The new sysctl is introduced, abi.bitness_locked.  If set to 1, it locks
all tasks inside of current pid namespace to the bitness of init task
(pid_ns->child_reaper).  After that (1) all syscalls of other bitness
return -ENOSYS and (2) loading ELF binaries of another bitness is
prohibited (as if the corresponding CONFIG_BINFMT_*=N).  If there is any
task which differs in bitness, the lockup fails.

TODO:

 * Fix a race of sysctl against fork().
 * Denied syscall should behave as if it doesn't exist.

The patch was tested very roughly.

diff --git a/arch/x86/kernel/syscall_restrict.c b/arch/x86/kernel/syscall_restrict.c
index 1a2bf1c..b2bfd8f 100644
--- a/arch/x86/kernel/syscall_restrict.c
+++ b/arch/x86/kernel/syscall_restrict.c
@@ -31,8 +31,8 @@ static int task_get_bitness(struct task_struct *task)
 static bool pidns_locked(struct pid_namespace *pid_ns)
 {
 	struct task_struct *init = pid_ns->child_reaper;
-	return (test_ti_thread_flag(task_thread_info(task), TIF_SYSCALL32_DENIED) ||
-		test_ti_thread_flag(task_thread_info(task), TIF_SYSCALL64_DENIED));
+	return (test_ti_thread_flag(task_thread_info(init), TIF_SYSCALL32_DENIED) ||
+		test_ti_thread_flag(task_thread_info(init), TIF_SYSCALL64_DENIED));
 }
 
 static int bits_to_flags(int bits)
@@ -69,7 +69,7 @@ static int __pidns_may_lock_bitness(struct pid_namespace *pid_ns, int bits)
 }
 
 /* Called with hold tasklist_lock and rcu */
-static int __change_syscall_restrict(struct pid_namespace *pid_ns, int bits)
+static int __bitness_lock(struct pid_namespace *pid_ns, int bits)
 {
 	u32 clear_bit_nr;
 	struct task_struct *p, *thread;
@@ -90,7 +90,7 @@ static int __change_syscall_restrict(struct pid_namespace *pid_ns, int bits)
 	return 0;
 }
 
-static int syscall_bitness_lock(struct pid_namespace *pid_ns)
+static int bitness_lock(struct pid_namespace *pid_ns)
 {
 	int rc, new_bits;
 
@@ -100,14 +100,14 @@ static int syscall_bitness_lock(struct pid_namespace *pid_ns)
 	new_bits = task_get_bitness(pid_ns->child_reaper);
 	rc = __pidns_may_lock_bitness(pid_ns, new_bits);
 	if (!rc)
-		rc = __change_syscall_restrict(pid_ns, new_bits);
+		rc = __bitness_lock(pid_ns, new_bits);
 
 	write_unlock_irq(&tasklist_lock);
 	rcu_read_unlock();
 	return rc;
 }
 
-static int syscall_bitness_locked_handler(struct ctl_table *table, int write,
+static int bitness_locked_handler(struct ctl_table *table, int write,
 				void __user *buffer, size_t *lenp,
 				loff_t *ppos)
 {
@@ -128,14 +128,14 @@ static int syscall_bitness_locked_handler(struct ctl_table *table, int write,
 		return -EACCES;
 	if (new_bits && old_bits)
 		return 0;
-	return syscall_bitness_lock(current->nsproxy->pid_ns);
+	return bitness_lock(current->nsproxy->pid_ns);
 }
 
 static struct ctl_table abi_syscall_restrict[] = {
 	{
-		.procname = "syscall_bitness_locked",
+		.procname = "bitness_locked",
 		.mode = 0644,
-		.proc_handler = syscall_bitness_locked_handler
+		.proc_handler = bitness_locked_handler
 	},
 	{}
 };
---

^ permalink raw reply related	[flat|nested] 32+ messages in thread

* [kernel-hardening] Re: 32/64 bitness restriction for pid namespace
  2011-08-07 11:00 [kernel-hardening] 32/64 bitness restriction for pid namespace Vasiliy Kulikov
@ 2011-08-08 17:39 ` Vasiliy Kulikov
  2011-08-10  9:52   ` Vasiliy Kulikov
  0 siblings, 1 reply; 32+ messages in thread
From: Vasiliy Kulikov @ 2011-08-08 17:39 UTC (permalink / raw)
  To: kernel-hardening; +Cc: Will Drewry

On Sun, Aug 07, 2011 at 15:00 +0400, Vasiliy Kulikov wrote:
> Solar, Will, all -
> 
> The new sysctl is introduced, abi.bitness_locked.  If set to 1, it locks
> all tasks inside of current pid namespace to the bitness of init task
> (pid_ns->child_reaper).  After that (1) all syscalls of other bitness
> return -ENOSYS and (2) loading ELF binaries of another bitness is
> prohibited (as if the corresponding CONFIG_BINFMT_*=N).  If there is any
> task which differs in bitness, the lockup fails.
> 
> TODO:
> 
>  * Fix a race of sysctl against fork().

Done.

>  * Denied syscall should behave as if it doesn't exist.

I suppose the best way of handling denied 32 bit syscalls is pretending
IA32_EMULATION=n, 64 bit syscalls - as if it is 32-bit kernel on 64-bit
CPU.  Simplified handling copied from interrupts handling with proper
signal delivery will be implemented.


64 bit SYSCALL - #UD => SIGILL.

32-bit SYSCALL - 64-bit kernel without IA32_EMULATION simply returns -ENOSYS here.

32-bit SYSENTER - #GP(0) => SIGSEGV.

32-bit int 80h - #NP => SIGBUS.


Other changes:
 - fixed an unitialized variable usage.
 - moved the check before "orl $TS_COMPAT,TI_status(%r10)".
 - sysctl is persistent with IA32_EMULATION=n.


The new version:

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index a0e866d..39a6544 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -151,6 +151,8 @@ ENTRY(ia32_sysenter_target)
  	.quad 1b,ia32_badarg
  	.previous	
 	GET_THREAD_INFO(%r10)
+	testl  $_TIF_SYSCALL32_DENIED,TI_flags(%r10)
+	jnz ia32_deniedsys
 	orl    $TS_COMPAT,TI_status(%r10)
 	testl  $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
 	CFI_REMEMBER_STATE
@@ -310,6 +312,8 @@ ENTRY(ia32_cstar_target)
 	.quad 1b,ia32_badarg
 	.previous	
 	GET_THREAD_INFO(%r10)
+	testl  $_TIF_SYSCALL32_DENIED,TI_flags(%r10)
+	jnz ia32_deniedsys
 	orl   $TS_COMPAT,TI_status(%r10)
 	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
 	CFI_REMEMBER_STATE
@@ -421,6 +425,8 @@ ENTRY(ia32_syscall)
 	   this could be a problem. */
 	SAVE_ARGS 0,1,0
 	GET_THREAD_INFO(%r10)
+	testl  $_TIF_SYSCALL32_DENIED,TI_flags(%r10)
+	jnz ia32_deniedsys
 	orl   $TS_COMPAT,TI_status(%r10)
 	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
 	jnz ia32_tracesys
@@ -453,6 +459,12 @@ ia32_badsys:
 	movq $-ENOSYS,%rax
 	jmp ia32_sysret
 
+ia32_deniedsys:
+	/* FIXME: need SIGSEGV delivery or similar */
+	movq $0,ORIG_RAX-ARGOFFSET(%rsp)
+	movq $-ENOSYS,%rax
+	jmp ia32_sysret
+
 quiet_ni_syscall:
 	movq $-ENOSYS,%rax
 	ret
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index f2ad216..fb054c7 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -153,9 +153,10 @@ do {						\
  * This is used to ensure we don't load something for the wrong architecture.
  */
 #define elf_check_arch(x)			\
-	((x)->e_machine == EM_X86_64)
+	((x)->e_machine == EM_X86_64 && !test_thread_flag(TIF_SYSCALL64_DENIED))
 
-#define compat_elf_check_arch(x)	elf_check_arch_ia32(x)
+#define compat_elf_check_arch(x)		\
+	(elf_check_arch_ia32(x) && !test_thread_flag(TIF_SYSCALL32_DENIED))
 
 static inline void elf_common_init(struct thread_struct *t,
 				   struct pt_regs *regs, const u16 ds)
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index a1fe5c1..7faebde 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -95,6 +95,8 @@ struct thread_info {
 #define TIF_BLOCKSTEP		25	/* set when we want DEBUGCTLMSR_BTF */
 #define TIF_LAZY_MMU_UPDATES	27	/* task is updating the mmu lazily */
 #define TIF_SYSCALL_TRACEPOINT	28	/* syscall tracepoint instrumentation */
+#define TIF_SYSCALL32_DENIED	29	/* 32 bit syscalls are allowed */
+#define TIF_SYSCALL64_DENIED	30	/* 64 bit syscalls are allowed */
 
 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
@@ -117,6 +119,8 @@ struct thread_info {
 #define _TIF_BLOCKSTEP		(1 << TIF_BLOCKSTEP)
 #define _TIF_LAZY_MMU_UPDATES	(1 << TIF_LAZY_MMU_UPDATES)
 #define _TIF_SYSCALL_TRACEPOINT	(1 << TIF_SYSCALL_TRACEPOINT)
+#define _TIF_SYSCALL32_DENIED	(1 << TIF_SYSCALL32_DENIED)
+#define _TIF_SYSCALL64_DENIED	(1 << TIF_SYSCALL64_DENIED)
 
 /* work to do in syscall_trace_enter() */
 #define _TIF_WORK_SYSCALL_ENTRY	\
@@ -259,9 +263,14 @@ static inline void set_restore_sigmask(void)
 	ti->status |= TS_RESTORE_SIGMASK;
 	set_bit(TIF_SIGPENDING, (unsigned long *)&ti->flags);
 }
-#endif	/* !__ASSEMBLY__ */
 
-#ifndef __ASSEMBLY__
+#ifdef CONFIG_IA32_EMULATION
+#define __HAVE_ARCH_POST_FORK
+
+extern void arch_post_fork(struct task_struct *task);
+
+#endif /* CONFIG_IA32_EMULATION */
+
 extern void arch_task_cache_init(void);
 extern void free_thread_info(struct thread_info *ti);
 extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src);
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 0410557..a200ff3 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -86,6 +86,7 @@ obj-$(CONFIG_DOUBLEFAULT) 	+= doublefault_32.o
 obj-$(CONFIG_KGDB)		+= kgdb.o
 obj-$(CONFIG_VM86)		+= vm86_32.o
 obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o
+obj-$(CONFIG_SYSCTL)		+= syscall_restrict.o
 
 obj-$(CONFIG_HPET_TIMER) 	+= hpet.o
 obj-$(CONFIG_APB_TIMER)		+= apb_timer.o
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index e13329d..1774685 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -474,6 +474,8 @@ ENTRY(system_call_after_swapgs)
 	movq  %rcx,RIP-ARGOFFSET(%rsp)
 	CFI_REL_OFFSET rip,RIP-ARGOFFSET
 	GET_THREAD_INFO(%rcx)
+	testl $_TIF_SYSCALL64_DENIED,TI_flags(%rcx)
+	jnz deniedsys
 	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx)
 	jnz tracesys
 system_call_fastpath:
@@ -541,6 +543,10 @@ sysret_signal:
 badsys:
 	movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
 	jmp ret_from_sys_call
+deniedsys:
+	/* FIXME: need SIGSEGV delivery or similar */
+	movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
+	jmp ret_from_sys_call
 
 #ifdef CONFIG_AUDITSYSCALL
 	/*
diff --git a/arch/x86/kernel/syscall_restrict.c b/arch/x86/kernel/syscall_restrict.c
new file mode 100644
index 0000000..a5c8ffa
--- /dev/null
+++ b/arch/x86/kernel/syscall_restrict.c
@@ -0,0 +1,187 @@
+#include <linux/thread_info.h>
+#include <linux/pid_namespace.h>
+#include <linux/sysctl.h>
+
+#ifdef CONFIG_IA32_EMULATION
+
+static bool pid_ns_contains_task(struct pid_namespace *pid_ns,
+				 struct task_struct *task)
+{
+	struct pid_namespace *ns = NULL;
+
+	if (task->nsproxy)
+		ns = task->nsproxy->pid_ns;
+
+	for (; ns; ns = ns->parent) {
+		if (ns == pid_ns)
+			return true;
+	}
+
+	return false;
+}
+
+static int task_get_bitness(struct task_struct *task)
+{
+	if (test_ti_thread_flag(task_thread_info(task), TIF_IA32))
+		return 32;
+	else
+		return 64;
+}
+
+static bool pidns_locked(struct pid_namespace *pid_ns)
+{
+	struct thread_info *ti = task_thread_info(pid_ns->child_reaper);
+
+	return test_ti_thread_flag(ti, TIF_SYSCALL32_DENIED) ||
+	       test_ti_thread_flag(ti, TIF_SYSCALL64_DENIED);
+}
+
+static int bits_to_flags(int bits)
+{
+	switch (bits) {
+	case 32:
+		return TIF_SYSCALL64_DENIED;
+	case 64:
+		return TIF_SYSCALL32_DENIED;
+	default:
+		return -EINVAL;
+	}
+}
+
+static int __pidns_may_lock_bitness(struct pid_namespace *pid_ns, int bits)
+{
+	struct task_struct *p, *thread;
+	int old_bits;
+
+	do_each_thread(p, thread) {
+		if (!pid_ns_contains_task(pid_ns, thread))
+			continue;
+
+		old_bits = task_get_bitness(thread);
+		if (old_bits != bits) {
+			pr_err("Inconsistent syscall restriction detected! "
+				"Parent ns tries to restrict syscalls to %d "
+				"bits while some task is %d bit.",
+				bits, old_bits);
+			return -EINVAL;
+		}
+	} while_each_thread(p, thread);
+
+	return 0;
+}
+
+void arch_post_fork(struct task_struct *task)
+{
+	int clear_bit_nr;
+
+	if (!pidns_locked(current->nsproxy->pid_ns))
+		return;
+
+	clear_bit_nr = bits_to_flags(task_get_bitness(current));
+	set_tsk_thread_flag(task, clear_bit_nr);
+}
+
+/* Called with hold tasklist_lock and rcu */
+static int __bitness_lock(struct pid_namespace *pid_ns, int bits)
+{
+	u32 clear_bit_nr;
+	struct task_struct *p, *thread;
+
+	clear_bit_nr = bits_to_flags(bits);
+
+	/* Yes, it is awfully slow, but it is called once per ns (if any) */
+	do_each_thread(p, thread) {
+		if (!pid_ns_contains_task(pid_ns, thread))
+			continue;
+
+		set_tsk_thread_flag(thread, clear_bit_nr);
+	} while_each_thread(p, thread);
+
+	return 0;
+}
+
+static int bitness_lock(struct pid_namespace *pid_ns)
+{
+	int rc, new_bits;
+
+	rcu_read_lock();
+	write_lock_irq(&tasklist_lock);
+
+	new_bits = task_get_bitness(pid_ns->child_reaper);
+	rc = __pidns_may_lock_bitness(pid_ns, new_bits);
+	if (!rc)
+		rc = __bitness_lock(pid_ns, new_bits);
+
+	write_unlock_irq(&tasklist_lock);
+	rcu_read_unlock();
+	return rc;
+}
+
+static int bitness_locked_handler(struct ctl_table *table, int write,
+				void __user *buffer, size_t *lenp,
+				loff_t *ppos)
+{
+	int rc, new_bits, old_bits;
+	struct ctl_table tbl = {
+		.procname	= table->procname,
+		.data		= &new_bits,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+	};
+
+	old_bits = new_bits = pidns_locked(current->nsproxy->pid_ns);
+	rc = proc_dointvec(&tbl, write, buffer, lenp, ppos);
+	if (rc || !write)
+		return rc;
+
+	if (!capable(CAP_SYS_ADMIN) || (new_bits == 0 && old_bits))
+		return -EACCES;
+	if (new_bits && old_bits)
+		return 0;
+	return bitness_lock(current->nsproxy->pid_ns);
+}
+
+static struct ctl_table abi_syscall_restrict[] = {
+	{
+		.procname = "bitness_locked",
+		.mode = 0644,
+		.proc_handler = bitness_locked_handler
+	},
+	{}
+};
+
+#else /* CONFIG_IA32_EMULATION */
+
+static int one = 1;
+
+static struct ctl_table abi_syscall_restrict[] = {
+	{
+		.procname	= "bitness_locked",
+		.data		= &one,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &one,
+		.extra2		= &one,
+	},
+	{}
+};
+
+#endif /* CONFIG_IA32_EMULATION */
+
+
+static struct ctl_table abi_root[] = {
+	{
+		.procname = "abi",
+		.mode = 0555,
+		.child = abi_syscall_restrict
+	},
+	{}
+};
+
+__init int syscall_restrict_init(void)
+{
+	register_sysctl_table(abi_root);
+	return 0;
+}
+device_initcall(syscall_restrict_init);
diff --git a/kernel/fork.c b/kernel/fork.c
index e7ceaca..55e4455 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1039,6 +1039,10 @@ static void posix_cpu_timers_init(struct task_struct *tsk)
 	INIT_LIST_HEAD(&tsk->cpu_timers[2]);
 }
 
+#ifndef __HAVE_ARCH_POST_FORK
+#define arch_post_fork(p)
+#endif
+
 /*
  * This creates a new process as a copy of the old one,
  * but does not actually start it yet.
@@ -1374,6 +1378,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	total_forks++;
 	spin_unlock(&current->sighand->siglock);
 	write_unlock_irq(&tasklist_lock);
+	arch_post_fork(p);
 	proc_fork_connector(p);
 	cgroup_post_fork(p);
 	if (clone_flags & CLONE_THREAD)
--

^ permalink raw reply related	[flat|nested] 32+ messages in thread

* [kernel-hardening] Re: 32/64 bitness restriction for pid namespace
  2011-08-08 17:39 ` [kernel-hardening] " Vasiliy Kulikov
@ 2011-08-10  9:52   ` Vasiliy Kulikov
  2011-08-10 13:03     ` [kernel-hardening] " Solar Designer
  0 siblings, 1 reply; 32+ messages in thread
From: Vasiliy Kulikov @ 2011-08-10  9:52 UTC (permalink / raw)
  To: kernel-hardening; +Cc: Will Drewry

Hi,

A simplified task list looping version (based on
zap_pid_ns_processes()).

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index a0e866d..39a6544 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -151,6 +151,8 @@ ENTRY(ia32_sysenter_target)
  	.quad 1b,ia32_badarg
  	.previous	
 	GET_THREAD_INFO(%r10)
+	testl  $_TIF_SYSCALL32_DENIED,TI_flags(%r10)
+	jnz ia32_deniedsys
 	orl    $TS_COMPAT,TI_status(%r10)
 	testl  $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
 	CFI_REMEMBER_STATE
@@ -310,6 +312,8 @@ ENTRY(ia32_cstar_target)
 	.quad 1b,ia32_badarg
 	.previous	
 	GET_THREAD_INFO(%r10)
+	testl  $_TIF_SYSCALL32_DENIED,TI_flags(%r10)
+	jnz ia32_deniedsys
 	orl   $TS_COMPAT,TI_status(%r10)
 	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
 	CFI_REMEMBER_STATE
@@ -421,6 +425,8 @@ ENTRY(ia32_syscall)
 	   this could be a problem. */
 	SAVE_ARGS 0,1,0
 	GET_THREAD_INFO(%r10)
+	testl  $_TIF_SYSCALL32_DENIED,TI_flags(%r10)
+	jnz ia32_deniedsys
 	orl   $TS_COMPAT,TI_status(%r10)
 	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
 	jnz ia32_tracesys
@@ -453,6 +459,12 @@ ia32_badsys:
 	movq $-ENOSYS,%rax
 	jmp ia32_sysret
 
+ia32_deniedsys:
+	/* FIXME: need SIGSEGV delivery or similar */
+	movq $0,ORIG_RAX-ARGOFFSET(%rsp)
+	movq $-ENOSYS,%rax
+	jmp ia32_sysret
+
 quiet_ni_syscall:
 	movq $-ENOSYS,%rax
 	ret
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index f2ad216..fb054c7 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -153,9 +153,10 @@ do {						\
  * This is used to ensure we don't load something for the wrong architecture.
  */
 #define elf_check_arch(x)			\
-	((x)->e_machine == EM_X86_64)
+	((x)->e_machine == EM_X86_64 && !test_thread_flag(TIF_SYSCALL64_DENIED))
 
-#define compat_elf_check_arch(x)	elf_check_arch_ia32(x)
+#define compat_elf_check_arch(x)		\
+	(elf_check_arch_ia32(x) && !test_thread_flag(TIF_SYSCALL32_DENIED))
 
 static inline void elf_common_init(struct thread_struct *t,
 				   struct pt_regs *regs, const u16 ds)
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index a1fe5c1..1e93040 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -95,6 +95,8 @@ struct thread_info {
 #define TIF_BLOCKSTEP		25	/* set when we want DEBUGCTLMSR_BTF */
 #define TIF_LAZY_MMU_UPDATES	27	/* task is updating the mmu lazily */
 #define TIF_SYSCALL_TRACEPOINT	28	/* syscall tracepoint instrumentation */
+#define TIF_SYSCALL32_DENIED	29	/* 32 bit syscalls are denied */
+#define TIF_SYSCALL64_DENIED	30	/* 64 bit syscalls are denied */
 
 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
@@ -117,6 +119,8 @@ struct thread_info {
 #define _TIF_BLOCKSTEP		(1 << TIF_BLOCKSTEP)
 #define _TIF_LAZY_MMU_UPDATES	(1 << TIF_LAZY_MMU_UPDATES)
 #define _TIF_SYSCALL_TRACEPOINT	(1 << TIF_SYSCALL_TRACEPOINT)
+#define _TIF_SYSCALL32_DENIED	(1 << TIF_SYSCALL32_DENIED)
+#define _TIF_SYSCALL64_DENIED	(1 << TIF_SYSCALL64_DENIED)
 
 /* work to do in syscall_trace_enter() */
 #define _TIF_WORK_SYSCALL_ENTRY	\
@@ -259,9 +263,14 @@ static inline void set_restore_sigmask(void)
 	ti->status |= TS_RESTORE_SIGMASK;
 	set_bit(TIF_SIGPENDING, (unsigned long *)&ti->flags);
 }
-#endif	/* !__ASSEMBLY__ */
 
-#ifndef __ASSEMBLY__
+#ifdef CONFIG_IA32_EMULATION
+#define __HAVE_ARCH_POST_FORK
+
+extern void arch_post_fork(struct task_struct *task);
+
+#endif /* CONFIG_IA32_EMULATION */
+
 extern void arch_task_cache_init(void);
 extern void free_thread_info(struct thread_info *ti);
 extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src);
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 0410557..a200ff3 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -86,6 +86,7 @@ obj-$(CONFIG_DOUBLEFAULT) 	+= doublefault_32.o
 obj-$(CONFIG_KGDB)		+= kgdb.o
 obj-$(CONFIG_VM86)		+= vm86_32.o
 obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o
+obj-$(CONFIG_SYSCTL)		+= syscall_restrict.o
 
 obj-$(CONFIG_HPET_TIMER) 	+= hpet.o
 obj-$(CONFIG_APB_TIMER)		+= apb_timer.o
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index e13329d..2725810 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -474,6 +474,8 @@ ENTRY(system_call_after_swapgs)
 	movq  %rcx,RIP-ARGOFFSET(%rsp)
 	CFI_REL_OFFSET rip,RIP-ARGOFFSET
 	GET_THREAD_INFO(%rcx)
+	testl $_TIF_SYSCALL64_DENIED,TI_flags(%rcx)
+	jnz denied_sys
 	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx)
 	jnz tracesys
 system_call_fastpath:
@@ -541,6 +543,10 @@ sysret_signal:
 badsys:
 	movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
 	jmp ret_from_sys_call
+denied_sys:
+	/* FIXME: need SIGSEGV delivery or similar */
+	movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
+	jmp ret_from_sys_call
 
 #ifdef CONFIG_AUDITSYSCALL
 	/*
diff --git a/arch/x86/kernel/syscall_restrict.c b/arch/x86/kernel/syscall_restrict.c
new file mode 100644
index 0000000..7962d23
--- /dev/null
+++ b/arch/x86/kernel/syscall_restrict.c
@@ -0,0 +1,167 @@
+#include <linux/thread_info.h>
+#include <linux/pid_namespace.h>
+#include <linux/sysctl.h>
+
+#ifdef CONFIG_IA32_EMULATION
+
+static int task_get_bitness(struct task_struct *task)
+{
+	if (test_ti_thread_flag(task_thread_info(task), TIF_IA32))
+		return 32;
+	else
+		return 64;
+}
+
+static bool pidns_locked(struct pid_namespace *pid_ns)
+{
+	struct thread_info *ti = task_thread_info(pid_ns->child_reaper);
+
+	return test_ti_thread_flag(ti, TIF_SYSCALL32_DENIED) ||
+	       test_ti_thread_flag(ti, TIF_SYSCALL64_DENIED);
+}
+
+static int bits_to_flags(int bits)
+{
+	if (bits == 32)
+		return TIF_SYSCALL64_DENIED;
+	else
+		return TIF_SYSCALL32_DENIED;
+}
+
+void arch_post_fork(struct task_struct *task)
+{
+	int clear_bit_nr;
+
+	if (!pidns_locked(current->nsproxy->pid_ns))
+		return;
+
+	clear_bit_nr = bits_to_flags(task_get_bitness(current));
+	set_tsk_thread_flag(task, clear_bit_nr);
+}
+
+/* Called under rcu_read_lock and write_lock_irq(tasklist) */
+static int __pidns_may_lock_bitness(struct pid_namespace *pid_ns, int bits)
+{
+	struct task_struct *task;
+	int old_bits;
+	int nr;
+
+	for (nr = next_pidmap(pid_ns, 0); nr > 0; nr = next_pidmap(pid_ns, nr)) {
+		task = pid_task(find_vpid(nr), PIDTYPE_PID);
+		if (!task)
+			continue;
+
+		old_bits = task_get_bitness(task);
+		if (old_bits != bits) {
+			pr_err("Inconsistent syscall restriction detected! "
+				"Parent ns tries to restrict syscalls to %d "
+				"bits while some task is %d bit.",
+				bits, old_bits);
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+/* Called under rcu_read_lock and write_lock_irq(tasklist) */
+static void __bitness_lock(struct pid_namespace *pid_ns, int bits)
+{
+	u32 clear_bit_nr;
+	struct task_struct *task;
+	int nr;
+
+	clear_bit_nr = bits_to_flags(bits);
+
+	for (nr = next_pidmap(pid_ns, 0); nr > 0; nr = next_pidmap(pid_ns, nr)) {
+		task = pid_task(find_vpid(nr), PIDTYPE_PID);
+		if (task)
+			set_tsk_thread_flag(task, clear_bit_nr);
+	}
+}
+
+static int bitness_lock(struct pid_namespace *pid_ns)
+{
+	int rc, new_bits;
+
+	rcu_read_lock();
+	write_lock_irq(&tasklist_lock);
+
+	new_bits = task_get_bitness(pid_ns->child_reaper);
+	rc = __pidns_may_lock_bitness(pid_ns, new_bits);
+	if (!rc)
+		__bitness_lock(pid_ns, new_bits);
+
+	write_unlock_irq(&tasklist_lock);
+	rcu_read_unlock();
+	return rc;
+}
+
+static int bitness_locked_handler(struct ctl_table *table, int write,
+				void __user *buffer, size_t *lenp,
+				loff_t *ppos)
+{
+	int rc, new_bits, old_bits;
+	struct ctl_table tbl = {
+		.procname	= table->procname,
+		.data		= &new_bits,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+	};
+
+	old_bits = new_bits = pidns_locked(current->nsproxy->pid_ns);
+	rc = proc_dointvec(&tbl, write, buffer, lenp, ppos);
+	if (rc || !write)
+		return rc;
+
+	if (!capable(CAP_SYS_ADMIN) || (new_bits == 0 && old_bits))
+		return -EACCES;
+	if (new_bits && old_bits)
+		return 0;
+	return bitness_lock(current->nsproxy->pid_ns);
+}
+
+static struct ctl_table abi_syscall_restrict[] = {
+	{
+		.procname = "bitness_locked",
+		.mode = 0644,
+		.proc_handler = bitness_locked_handler
+	},
+	{}
+};
+
+#else /* CONFIG_IA32_EMULATION */
+
+static int one = 1;
+
+static struct ctl_table abi_syscall_restrict[] = {
+	{
+		.procname	= "bitness_locked",
+		.data		= &one,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &one,
+		.extra2		= &one,
+	},
+	{}
+};
+
+#endif /* CONFIG_IA32_EMULATION */
+
+
+static struct ctl_table abi_root[] = {
+	{
+		.procname = "abi",
+		.mode = 0555,
+		.child = abi_syscall_restrict
+	},
+	{}
+};
+
+__init int syscall_restrict_init(void)
+{
+	register_sysctl_table(abi_root);
+	return 0;
+}
+device_initcall(syscall_restrict_init);
diff --git a/kernel/fork.c b/kernel/fork.c
index e7ceaca..55e4455 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1039,6 +1039,10 @@ static void posix_cpu_timers_init(struct task_struct *tsk)
 	INIT_LIST_HEAD(&tsk->cpu_timers[2]);
 }
 
+#ifndef __HAVE_ARCH_POST_FORK
+#define arch_post_fork(p)
+#endif
+
 /*
  * This creates a new process as a copy of the old one,
  * but does not actually start it yet.
@@ -1374,6 +1378,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	total_forks++;
 	spin_unlock(&current->sighand->siglock);
 	write_unlock_irq(&tasklist_lock);
+	arch_post_fork(p);
 	proc_fork_connector(p);
 	cgroup_post_fork(p);
 	if (clone_flags & CLONE_THREAD)
---

^ permalink raw reply related	[flat|nested] 32+ messages in thread

* Re: [kernel-hardening] 32/64 bitness restriction for pid namespace
  2011-08-10  9:52   ` Vasiliy Kulikov
@ 2011-08-10 13:03     ` Solar Designer
  2011-08-10 13:27       ` Vasiliy Kulikov
  0 siblings, 1 reply; 32+ messages in thread
From: Solar Designer @ 2011-08-10 13:03 UTC (permalink / raw)
  To: kernel-hardening; +Cc: Will Drewry

On Wed, Aug 10, 2011 at 01:52:01PM +0400, Vasiliy Kulikov wrote:
> +++ b/arch/x86/ia32/ia32entry.S
> @@ -151,6 +151,8 @@ ENTRY(ia32_sysenter_target)
>   	.quad 1b,ia32_badarg
>   	.previous	
>  	GET_THREAD_INFO(%r10)
> +	testl  $_TIF_SYSCALL32_DENIED,TI_flags(%r10)
> +	jnz ia32_deniedsys

Things like this work for the initial RFC posting, but something will
need to be done to eliminate the performance impact later.

Perhaps bitness-restricted processes will need to be switched to
directly use different syscall entry code.  Maybe you'll have to do such
switching on context switches, which is hopefully a lower performance
impact than the check on syscall entry.  On the other hand, this is more
complicated and may have extra risks.

Alternatively, you may do the test/jnz thing on some syscall mechanisms
(legacy), but do something more efficient on others (meant to be fast).

You'll need to research this and propose something.  Or maybe folks on
LKML will propose something.

> +ia32_deniedsys:
> +	/* FIXME: need SIGSEGV delivery or similar */

I think the action on error should be exactly the same as if the kernel
is compiled without CONFIG_IA32_EMULATION.

> +static struct ctl_table abi_syscall_restrict[] = {
> +	{
> +		.procname = "bitness_locked",
> +		.mode = 0644,
> +		.proc_handler = bitness_locked_handler
> +	},
> +	{}
> +};

How would we actually configure it, say, for an OpenVZ container before
we let any program in the container run (including /sbin/init, because
we assume that the container's root account may have been compromised
and is now trying to attack the kernel to escape)?  With OpenVZ, this
setting will need to be in /etc/vz/conf/100.conf, etc. - and vzctl will
need to configure it in the kernel.  Will it have to mount the
container's procfs early for this?  Currently, this step is left for
the guest Linux distro's startup scripts.

Also, what are the possible settings?  Is this tri-state - any bitness
allowed, 32-bit only, or 64-bit only?

Thanks,

Alexander

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [kernel-hardening] 32/64 bitness restriction for pid namespace
  2011-08-10 13:03     ` [kernel-hardening] " Solar Designer
@ 2011-08-10 13:27       ` Vasiliy Kulikov
  2011-08-10 14:26         ` Solar Designer
  0 siblings, 1 reply; 32+ messages in thread
From: Vasiliy Kulikov @ 2011-08-10 13:27 UTC (permalink / raw)
  To: kernel-hardening; +Cc: Will Drewry

Solar,

On Wed, Aug 10, 2011 at 17:03 +0400, Solar Designer wrote:
> On Wed, Aug 10, 2011 at 01:52:01PM +0400, Vasiliy Kulikov wrote:
> > +++ b/arch/x86/ia32/ia32entry.S
> > @@ -151,6 +151,8 @@ ENTRY(ia32_sysenter_target)
> >   	.quad 1b,ia32_badarg
> >   	.previous	
> >  	GET_THREAD_INFO(%r10)
> > +	testl  $_TIF_SYSCALL32_DENIED,TI_flags(%r10)
> > +	jnz ia32_deniedsys
> 
> Things like this work for the initial RFC posting, but something will
> need to be done to eliminate the performance impact later.

IMO a single check is awfully cheap.  Look at audit checks - the same one
bit check.  (btw, I'll guard it with #ifdef CONFIG_IA32_EMULATION for
64-bit syscall.)

> Perhaps bitness-restricted processes will need to be switched to
> directly use different syscall entry code.

Then the check is moved from a syscall to a switch plus additional cost
of IDT changing, which is IIRC very expensive.  And I bet it would cause
numerous complains from LKML folks.

> Alternatively, you may do the test/jnz thing on some syscall mechanisms
> (legacy), but do something more efficient on others (meant to be fast).

Sorry, I don't understand what you're trying to say.  What legacy
syscall mechanisms?

> > +ia32_deniedsys:
> > +	/* FIXME: need SIGSEGV delivery or similar */
> 
> I think the action on error should be exactly the same as if the kernel
> is compiled without CONFIG_IA32_EMULATION.

OK, but then it needs the stack preparation in asm code (at least to get
struct pt_regs from C code).

> > +static struct ctl_table abi_syscall_restrict[] = {
> > +	{
> > +		.procname = "bitness_locked",
> > +		.mode = 0644,
> > +		.proc_handler = bitness_locked_handler
> > +	},
> > +	{}
> > +};
> 
> How would we actually configure it, say, for an OpenVZ container before
> we let any program in the container run (including /sbin/init, because
> we assume that the container's root account may have been compromised
> and is now trying to attack the kernel to escape)?  With OpenVZ, this
> setting will need to be in /etc/vz/conf/100.conf, etc. - and vzctl will
> need to configure it in the kernel.  Will it have to mount the
> container's procfs early for this?  Currently, this step is left for
> the guest Linux distro's startup scripts.

Hm, if we assume root is _already_ compromized, then I see one way (a
hack, actually): open sysctl file, create container environment, write 1
to the file and execve() init image.  I don't know whether vzctl already
use procfs for any internal things, I have to investigate it.

> Also, what are the possible settings?  Is this tri-state - any bitness
> allowed, 32-bit only, or 64-bit only?

No, two-state.  I tried to make it as simple as possible.  As there is
at least one process in current pid ns - init - and it already has some
specific bitness, locking procedure locks the whole container to the
init's bitness.  Otherwise, init would die on the next syscall.

And it is one way ticket (like modules_disabled) - once set to 1 it
can never be cleared (and there is no code for it ;).

Thanks,

-- 
Vasiliy

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [kernel-hardening] 32/64 bitness restriction for pid namespace
  2011-08-10 13:27       ` Vasiliy Kulikov
@ 2011-08-10 14:26         ` Solar Designer
  2011-08-10 15:02           ` Vasiliy Kulikov
  0 siblings, 1 reply; 32+ messages in thread
From: Solar Designer @ 2011-08-10 14:26 UTC (permalink / raw)
  To: kernel-hardening

Vasiliy,

I feel that your patch works fine for an RFC posting to LKML, so please
do that.  As to potential further changes, see below:

On Wed, Aug 10, 2011 at 05:27:17PM +0400, Vasiliy Kulikov wrote:
> On Wed, Aug 10, 2011 at 17:03 +0400, Solar Designer wrote:
> > On Wed, Aug 10, 2011 at 01:52:01PM +0400, Vasiliy Kulikov wrote:
> > > +++ b/arch/x86/ia32/ia32entry.S
> > > @@ -151,6 +151,8 @@ ENTRY(ia32_sysenter_target)
> > >   	.quad 1b,ia32_badarg
> > >   	.previous	
> > >  	GET_THREAD_INFO(%r10)
> > > +	testl  $_TIF_SYSCALL32_DENIED,TI_flags(%r10)
> > > +	jnz ia32_deniedsys
> > 
> > Things like this work for the initial RFC posting, but something will
> > need to be done to eliminate the performance impact later.
> 
> IMO a single check is awfully cheap.

I agree, but not everyone will.

> Look at audit checks - the same one
> bit check.  (btw, I'll guard it with #ifdef CONFIG_IA32_EMULATION for
> 64-bit syscall.)

Yes, "#ifdef CONFIG_IA32_EMULATION" will be good, although almost all
distro kernels will include CONFIG_IA32_EMULATION.

> > Perhaps bitness-restricted processes will need to be switched to
> > directly use different syscall entry code.
> 
> Then the check is moved from a syscall to a switch plus additional cost
> of IDT changing, which is IIRC very expensive.  And I bet it would cause
> numerous complains from LKML folks.

Right.  Maybe you could have separate IDTs for each processor, and then
you'd be patching them rather than invoking the costly lidt instruction.

Also, this switching on context switch would only occur when the bitness
restriction of the two processes differs.  So systems that don't make
use of this hardening feature would see almost no performance impact
(just one check on context switch).

And syscalls are expected to be far more frequent than context switches.

Anyway, I agree with you that a simple check may be better.

> > Alternatively, you may do the test/jnz thing on some syscall mechanisms
> > (legacy), but do something more efficient on others (meant to be fast).
> 
> Sorry, I don't understand what you're trying to say.  What legacy
> syscall mechanisms?

int 0x80 (IDT) vs. syscall/sysenter (MSRs).

> > How would we actually configure it, say, for an OpenVZ container before
> > we let any program in the container run (including /sbin/init, because
> > we assume that the container's root account may have been compromised
> > and is now trying to attack the kernel to escape)?  With OpenVZ, this
> > setting will need to be in /etc/vz/conf/100.conf, etc. - and vzctl will
> > need to configure it in the kernel.  Will it have to mount the
> > container's procfs early for this?  Currently, this step is left for
> > the guest Linux distro's startup scripts.
> 
> Hm, if we assume root is _already_ compromized, then I see one way (a
> hack, actually): open sysctl file, create container environment, write 1
> to the file and execve() init image.  I don't know whether vzctl already
> use procfs for any internal things, I have to investigate it.

Isn't this what I wrote above - having vzctl mount the container's
procfs early?  I think it's better to avoid this.  Maybe we need a
prctl() that will tell a subsequent execve() to lock bitness (and fail
if the /sbin/init binary being loaded is of the wrong bitness).

> > Also, what are the possible settings?  Is this tri-state - any bitness
> > allowed, 32-bit only, or 64-bit only?
> 
> No, two-state.  I tried to make it as simple as possible.  As there is
> at least one process in current pid ns - init - and it already has some
> specific bitness, locking procedure locks the whole container to the
> init's bitness.  Otherwise, init would die on the next syscall.

This is a desired mode as well, yes.  I suspect OpenVZ may even make
this their default.  However, we also need a way to control this
pre-init, in case /sbin/init is already replaced by the attacker.
I think we need a prctl() that will let us configure things in one of
four ways for the very next execve() call:

0. Don't lock bitness.

1. Lock bitness to that of the next binary invoked.

2. Lock bitness to 32-bit, fail the next execve() if not 32-bit.

3. Lock bitness to 64-bit, fail the next execve() if not 64-bit.

> And it is one way ticket (like modules_disabled) - once set to 1 it
> can never be cleared (and there is no code for it ;).

Right.

Thanks,

Alexander

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [kernel-hardening] 32/64 bitness restriction for pid namespace
  2011-08-10 14:26         ` Solar Designer
@ 2011-08-10 15:02           ` Vasiliy Kulikov
  2011-08-10 15:40             ` Solar Designer
  0 siblings, 1 reply; 32+ messages in thread
From: Vasiliy Kulikov @ 2011-08-10 15:02 UTC (permalink / raw)
  To: kernel-hardening

On Wed, Aug 10, 2011 at 18:26 +0400, Solar Designer wrote:
> I feel that your patch works fine for an RFC posting to LKML, so please
> do that.

OK!

> > > Alternatively, you may do the test/jnz thing on some syscall mechanisms
> > > (legacy), but do something more efficient on others (meant to be fast).
> > 
> > Sorry, I don't understand what you're trying to say.  What legacy
> > syscall mechanisms?
> 
> int 0x80 (IDT) vs. syscall/sysenter (MSRs).

All three mechanisms are already guarded.

> > > How would we actually configure it, say, for an OpenVZ container before
> > > we let any program in the container run (including /sbin/init, because
> > > we assume that the container's root account may have been compromised
> > > and is now trying to attack the kernel to escape)?  With OpenVZ, this
> > > setting will need to be in /etc/vz/conf/100.conf, etc. - and vzctl will
> > > need to configure it in the kernel.  Will it have to mount the
> > > container's procfs early for this?  Currently, this step is left for
> > > the guest Linux distro's startup scripts.
> > 
> > Hm, if we assume root is _already_ compromized, then I see one way (a
> > hack, actually): open sysctl file, create container environment, write 1
> > to the file and execve() init image.  I don't know whether vzctl already
> > use procfs for any internal things, I have to investigate it.
> 
> Isn't this what I wrote above - having vzctl mount the container's
> procfs early?  I think it's better to avoid this.

No, no early procfs mounting.  I mean keeping fd from the HW' procfs
mount point.  Writing to bitness_locked is equivalent regardless a used
mount point.  However, prctl() is much cleaner.

> > No, two-state.  I tried to make it as simple as possible.  As there is
> > at least one process in current pid ns - init - and it already has some
> > specific bitness, locking procedure locks the whole container to the
> > init's bitness.  Otherwise, init would die on the next syscall.
> 
> This is a desired mode as well, yes.  I suspect OpenVZ may even make
> this their default.  However, we also need a way to control this
> pre-init, in case /sbin/init is already replaced by the attacker.
> I think we need a prctl() that will let us configure things in one of
> four ways for the very next execve() call:
> 
> 0. Don't lock bitness.
> 
> 1. Lock bitness to that of the next binary invoked.
> 
> 2. Lock bitness to 32-bit, fail the next execve() if not 32-bit.
> 
> 3. Lock bitness to 64-bit, fail the next execve() if not 64-bit.

Is there any need for 2 and 3?  I feel 0 and 1 are fine.  KISS :)


I don't know whether it is OK to have 2 mechanisms for a rather limited
thing.  For OpenVZ prctl() should be OK as there are 2 ways to enter the
container:

1) vzctl start - a process creates an environment, does prctl() and
execve's init.

2) vzctl enter - a process does some ioctl() magic to enter already
created namespaces and vz environment.

For (1) prctl() is just what is needed.  For (2) IMO it's better to lock
the process in this ioctl() (keep it ovz-specific for now) as I don't
see how upstream can handle this kind of namespace shift.

(In 3.1 there is a possibility to change net/ipc/uts namespace, but it
is far from pid ns changing AFAICS.)


The clean mechanism for (2) can be simply added in the future.

-- 
Vasiliy

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [kernel-hardening] 32/64 bitness restriction for pid namespace
  2011-08-10 15:02           ` Vasiliy Kulikov
@ 2011-08-10 15:40             ` Solar Designer
  2011-08-10 16:21               ` Vasiliy Kulikov
  0 siblings, 1 reply; 32+ messages in thread
From: Solar Designer @ 2011-08-10 15:40 UTC (permalink / raw)
  To: kernel-hardening

On Wed, Aug 10, 2011 at 07:02:57PM +0400, Vasiliy Kulikov wrote:
> On Wed, Aug 10, 2011 at 18:26 +0400, Solar Designer wrote:
> > > > Alternatively, you may do the test/jnz thing on some syscall mechanisms
> > > > (legacy), but do something more efficient on others (meant to be fast).
> > > 
> > > Sorry, I don't understand what you're trying to say.  What legacy
> > > syscall mechanisms?
> > 
> > int 0x80 (IDT) vs. syscall/sysenter (MSRs).
> 
> All three mechanisms are already guarded.

Yes.  I meant that when adding optimizations, it might make sense to do
so for some of these only.  Or it might not.

> > > > How would we actually configure it, say, for an OpenVZ container before
> > > > we let any program in the container run (including /sbin/init, because
> > > > we assume that the container's root account may have been compromised
> > > > and is now trying to attack the kernel to escape)?  With OpenVZ, this
> > > > setting will need to be in /etc/vz/conf/100.conf, etc. - and vzctl will
> > > > need to configure it in the kernel.  Will it have to mount the
> > > > container's procfs early for this?  Currently, this step is left for
> > > > the guest Linux distro's startup scripts.
> > > 
> > > Hm, if we assume root is _already_ compromized, then I see one way (a
> > > hack, actually): open sysctl file, create container environment, write 1
> > > to the file and execve() init image.  I don't know whether vzctl already
> > > use procfs for any internal things, I have to investigate it.
> > 
> > Isn't this what I wrote above - having vzctl mount the container's
> > procfs early?  I think it's better to avoid this.
> 
> No, no early procfs mounting.  I mean keeping fd from the HW' procfs
> mount point.  Writing to bitness_locked is equivalent regardless a used
> mount point.

Oh, I did not realize this.

> However, prctl() is much cleaner.

> > > No, two-state.  I tried to make it as simple as possible.  As there is
> > > at least one process in current pid ns - init - and it already has some
> > > specific bitness, locking procedure locks the whole container to the
> > > init's bitness.  Otherwise, init would die on the next syscall.
> > 
> > This is a desired mode as well, yes.  I suspect OpenVZ may even make
> > this their default.  However, we also need a way to control this
> > pre-init, in case /sbin/init is already replaced by the attacker.
> > I think we need a prctl() that will let us configure things in one of
> > four ways for the very next execve() call:
> > 
> > 0. Don't lock bitness.
> > 
> > 1. Lock bitness to that of the next binary invoked.
> > 
> > 2. Lock bitness to 32-bit, fail the next execve() if not 32-bit.
> > 
> > 3. Lock bitness to 64-bit, fail the next execve() if not 64-bit.
> 
> Is there any need for 2 and 3?  I feel 0 and 1 are fine.  KISS :)

Yes, we also need 2 and 3, for the reason I mentioned: /sbin/init might
be already replaced by the attacker during the guest system's previous
uptime, specifically to bypass our restriction and attack the other
bitness' syscalls.

> I don't know whether it is OK to have 2 mechanisms for a rather limited
> thing.  For OpenVZ prctl() should be OK as there are 2 ways to enter the
> container:
> 
> 1) vzctl start - a process creates an environment, does prctl() and
> execve's init.
> 
> 2) vzctl enter - a process does some ioctl() magic to enter already
> created namespaces and vz environment.
> 
> For (1) prctl() is just what is needed.  For (2) IMO it's better to lock
> the process in this ioctl() (keep it ovz-specific for now) as I don't
> see how upstream can handle this kind of namespace shift.

Why not use the same prctl() for both?  (There's also vzctl exec, but
it's similar to vzctl enter for the purpose of this discussion.)

There's not much of a difference between execve() of /sbin/init and of
the shell.

I agree that your proposed procfs/sysctl interface seems excessive if we
add the prctl().

Alexander

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [kernel-hardening] 32/64 bitness restriction for pid namespace
  2011-08-10 15:40             ` Solar Designer
@ 2011-08-10 16:21               ` Vasiliy Kulikov
  2011-08-10 16:42                 ` Solar Designer
  2011-08-12  9:09                 ` Vasiliy Kulikov
  0 siblings, 2 replies; 32+ messages in thread
From: Vasiliy Kulikov @ 2011-08-10 16:21 UTC (permalink / raw)
  To: kernel-hardening

On Wed, Aug 10, 2011 at 19:40 +0400, Solar Designer wrote:
> > > > No, two-state.  I tried to make it as simple as possible.  As there is
> > > > at least one process in current pid ns - init - and it already has some
> > > > specific bitness, locking procedure locks the whole container to the
> > > > init's bitness.  Otherwise, init would die on the next syscall.
> > > 
> > > This is a desired mode as well, yes.  I suspect OpenVZ may even make
> > > this their default.  However, we also need a way to control this
> > > pre-init, in case /sbin/init is already replaced by the attacker.
> > > I think we need a prctl() that will let us configure things in one of
> > > four ways for the very next execve() call:
> > > 
> > > 0. Don't lock bitness.
> > > 
> > > 1. Lock bitness to that of the next binary invoked.
> > > 
> > > 2. Lock bitness to 32-bit, fail the next execve() if not 32-bit.
> > > 
> > > 3. Lock bitness to 64-bit, fail the next execve() if not 64-bit.
> > 
> > Is there any need for 2 and 3?  I feel 0 and 1 are fine.  KISS :)
> 
> Yes, we also need 2 and 3, for the reason I mentioned: /sbin/init might
> be already replaced by the attacker during the guest system's previous
> uptime, specifically to bypass our restriction and attack the other
> bitness' syscalls.

Ah, sure :)


> > I don't know whether it is OK to have 2 mechanisms for a rather limited
> > thing.  For OpenVZ prctl() should be OK as there are 2 ways to enter the
> > container:
> > 
> > 1) vzctl start - a process creates an environment, does prctl() and
> > execve's init.
> > 
> > 2) vzctl enter - a process does some ioctl() magic to enter already
> > created namespaces and vz environment.
> > 
> > For (1) prctl() is just what is needed.  For (2) IMO it's better to lock
> > the process in this ioctl() (keep it ovz-specific for now) as I don't
> > see how upstream can handle this kind of namespace shift.
> 
> Why not use the same prctl() for both?  (There's also vzctl exec, but
> it's similar to vzctl enter for the purpose of this discussion.)
> 
> There's not much of a difference between execve() of /sbin/init and of
> the shell.

There is - if we exec init, there is no process in the namespace yet.
If exec the shell, an already existing root process may ptrace vzctl
process, which hasn't exec'ed and hasn't locked itself yet.  I don't
know how vzctl is protected against such races.


> I agree that your proposed procfs/sysctl interface seems excessive if we
> add the prctl().

Actually, it makes sense if we want to lock a live CT, but it is a minor
thing.


Anyway, I'll send an RFC with a comment that the specific locking
interface is not defined yet.

Thanks,

-- 
Vasiliy

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [kernel-hardening] 32/64 bitness restriction for pid namespace
  2011-08-10 16:21               ` Vasiliy Kulikov
@ 2011-08-10 16:42                 ` Solar Designer
  2011-08-12 12:07                   ` Vasiliy Kulikov
  2011-08-12  9:09                 ` Vasiliy Kulikov
  1 sibling, 1 reply; 32+ messages in thread
From: Solar Designer @ 2011-08-10 16:42 UTC (permalink / raw)
  To: kernel-hardening

On Wed, Aug 10, 2011 at 08:21:01PM +0400, Vasiliy Kulikov wrote:
> On Wed, Aug 10, 2011 at 19:40 +0400, Solar Designer wrote:
> > > 1) vzctl start - a process creates an environment, does prctl() and
> > > execve's init.
> > > 
> > > 2) vzctl enter - a process does some ioctl() magic to enter already
> > > created namespaces and vz environment.
> > > 
> > > For (1) prctl() is just what is needed.  For (2) IMO it's better to lock
> > > the process in this ioctl() (keep it ovz-specific for now) as I don't
> > > see how upstream can handle this kind of namespace shift.
> > 
> > Why not use the same prctl() for both?  (There's also vzctl exec, but
> > it's similar to vzctl enter for the purpose of this discussion.)
> > 
> > There's not much of a difference between execve() of /sbin/init and of
> > the shell.
> 
> There is - if we exec init, there is no process in the namespace yet.
> If exec the shell, an already existing root process may ptrace vzctl
> process, which hasn't exec'ed and hasn't locked itself yet.  I don't
> know how vzctl is protected against such races.

Good point, but I think it is protected against ptrace by the guest, or
at least it was.  My OpenVZ audit report from late 2005 includes this:

2.2. Testing and review of "strace" logs revealed that only the first 16
fd's were being closed on VPS entry.  This needs to be corrected.  Also,
the fd's are being closed _after_ the ioctl call, which is not great,
although the risk is now mitigated by having the VPS-entering process
protected from ptrace(2).

Of course, mainline code / LXC might differ from OpenVZ in this respect,
and the OpenVZ code might have changed.  So this is something to revisit
when we add the bitness restriction.

Alexander

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [kernel-hardening] 32/64 bitness restriction for pid namespace
  2011-08-10 16:21               ` Vasiliy Kulikov
  2011-08-10 16:42                 ` Solar Designer
@ 2011-08-12  9:09                 ` Vasiliy Kulikov
  1 sibling, 0 replies; 32+ messages in thread
From: Vasiliy Kulikov @ 2011-08-12  9:09 UTC (permalink / raw)
  To: kernel-hardening

Hi,

Minor thing.  ldd (at least from Ubuntu) tries to blindly apply all
existing loaders to the program:

$ LANG=C ldd /bin/bash
/usr/bin/ldd: line 161: /lib/ld-linux.so.2: cannot execute binary file
    linux-vdso.so.1 =>  (0x00007fffbc1ff000)
    libncurses.so.5 => /lib/libncurses.so.5 (0x00007f158eb27000)
    libdl.so.2 => /lib/libdl.so.2 (0x00007f158e923000)
    libc.so.6 => /lib/libc.so.6 (0x00007f158e59f000)
    /lib64/ld-linux-x86-64.so.2 (0x00007f158ed98000)

$ file /bin/bash
/bin/bash: ELF 64-bit LSB executable, x86-64, version 1 (SYSV),
dynamically linked (uses shared libs), for GNU/Linux 2.6.15, stripped
$ readelf -l /bin/bash | grep -A2 INTER
  INTERP         0x0000000000000238 0x0000000000400238 0x0000000000400238
                 0x000000000000001c 0x000000000000001c  R      1
      [Requesting program interpreter: /lib64/ld-linux-x86-64.so.2]

ld-linux.so loading fails because of 64 bit ELFs enforcement.

I consider it as a bug of ldd :)

-- 
Vasiliy Kulikov
http://www.openwall.com - bringing security into open computing environments

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [kernel-hardening] 32/64 bitness restriction for pid namespace
  2011-08-10 16:42                 ` Solar Designer
@ 2011-08-12 12:07                   ` Vasiliy Kulikov
  2011-08-12 12:23                     ` Solar Designer
  0 siblings, 1 reply; 32+ messages in thread
From: Vasiliy Kulikov @ 2011-08-12 12:07 UTC (permalink / raw)
  To: kernel-hardening

Hi,

This is the updated version.  It tries to handle denied syscalls as if
they are disabled (MSR/IDT entry is not initialized).  I've copied
handlers' from interrupt handlers and removed kprobes code, which is
looks like dead in this specific case.

---
 arch/x86/ia32/ia32entry.S          |   33 +++++
 arch/x86/include/asm/elf.h         |    5 +-
 arch/x86/include/asm/thread_info.h |   13 ++-
 arch/x86/kernel/Makefile           |    1 +
 arch/x86/kernel/entry_64.S         |   12 ++-
 arch/x86/kernel/syscall_restrict.c |  229 ++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/traps.c            |    2 +-
 kernel/fork.c                      |    5 +
 8 files changed, 293 insertions(+), 7 deletions(-)

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index a0e866d..5bc1882 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -151,6 +151,8 @@ ENTRY(ia32_sysenter_target)
  	.quad 1b,ia32_badarg
  	.previous	
 	GET_THREAD_INFO(%r10)
+	testl  $_TIF_SYSCALL32_DENIED,TI_flags(%r10)
+	jnz ia32_denied_sysenter
 	orl    $TS_COMPAT,TI_status(%r10)
 	testl  $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
 	CFI_REMEMBER_STATE
@@ -310,6 +312,8 @@ ENTRY(ia32_cstar_target)
 	.quad 1b,ia32_badarg
 	.previous	
 	GET_THREAD_INFO(%r10)
+	testl  $_TIF_SYSCALL32_DENIED,TI_flags(%r10)
+	jnz ia32_denied_syscall
 	orl   $TS_COMPAT,TI_status(%r10)
 	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
 	CFI_REMEMBER_STATE
@@ -421,6 +425,8 @@ ENTRY(ia32_syscall)
 	   this could be a problem. */
 	SAVE_ARGS 0,1,0
 	GET_THREAD_INFO(%r10)
+	testl  $_TIF_SYSCALL32_DENIED,TI_flags(%r10)
+	jnz ia32_denied_int
 	orl   $TS_COMPAT,TI_status(%r10)
 	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
 	jnz ia32_tracesys
@@ -453,6 +459,33 @@ ia32_badsys:
 	movq $-ENOSYS,%rax
 	jmp ia32_sysret
 
+ia32_denied_sysenter:
+	SAVE_REST
+	CLEAR_RREGS
+	movq	%rsp,%rdi        /* &pt_regs -> arg1 */
+	call	do_ia32_denied_sysenter
+	LOAD_ARGS32 ARGOFFSET  /* reload args from stack in case ptrace changed it */
+	RESTORE_REST
+	jmp	int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */
+
+ia32_denied_syscall:
+	SAVE_REST
+	CLEAR_RREGS
+	movq	%rsp,%rdi        /* &pt_regs -> arg1 */
+	movq $-ENOSYS,%rax
+	LOAD_ARGS32 ARGOFFSET  /* reload args from stack in case ptrace changed it */
+	RESTORE_REST
+	jmp	int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */
+
+ia32_denied_int:
+	SAVE_REST
+	CLEAR_RREGS
+	movq	%rsp,%rdi        /* &pt_regs -> arg1 */
+	call	do_ia32_denied_int
+	LOAD_ARGS32 ARGOFFSET  /* reload args from stack in case ptrace changed it */
+	RESTORE_REST
+	jmp	int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */
+
 quiet_ni_syscall:
 	movq $-ENOSYS,%rax
 	ret
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index f2ad216..fb054c7 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -153,9 +153,10 @@ do {						\
  * This is used to ensure we don't load something for the wrong architecture.
  */
 #define elf_check_arch(x)			\
-	((x)->e_machine == EM_X86_64)
+	((x)->e_machine == EM_X86_64 && !test_thread_flag(TIF_SYSCALL64_DENIED))
 
-#define compat_elf_check_arch(x)	elf_check_arch_ia32(x)
+#define compat_elf_check_arch(x)		\
+	(elf_check_arch_ia32(x) && !test_thread_flag(TIF_SYSCALL32_DENIED))
 
 static inline void elf_common_init(struct thread_struct *t,
 				   struct pt_regs *regs, const u16 ds)
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index a1fe5c1..1e93040 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -95,6 +95,8 @@ struct thread_info {
 #define TIF_BLOCKSTEP		25	/* set when we want DEBUGCTLMSR_BTF */
 #define TIF_LAZY_MMU_UPDATES	27	/* task is updating the mmu lazily */
 #define TIF_SYSCALL_TRACEPOINT	28	/* syscall tracepoint instrumentation */
+#define TIF_SYSCALL32_DENIED	29	/* 32 bit syscalls are denied */
+#define TIF_SYSCALL64_DENIED	30	/* 64 bit syscalls are denied */
 
 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
@@ -117,6 +119,8 @@ struct thread_info {
 #define _TIF_BLOCKSTEP		(1 << TIF_BLOCKSTEP)
 #define _TIF_LAZY_MMU_UPDATES	(1 << TIF_LAZY_MMU_UPDATES)
 #define _TIF_SYSCALL_TRACEPOINT	(1 << TIF_SYSCALL_TRACEPOINT)
+#define _TIF_SYSCALL32_DENIED	(1 << TIF_SYSCALL32_DENIED)
+#define _TIF_SYSCALL64_DENIED	(1 << TIF_SYSCALL64_DENIED)
 
 /* work to do in syscall_trace_enter() */
 #define _TIF_WORK_SYSCALL_ENTRY	\
@@ -259,9 +263,14 @@ static inline void set_restore_sigmask(void)
 	ti->status |= TS_RESTORE_SIGMASK;
 	set_bit(TIF_SIGPENDING, (unsigned long *)&ti->flags);
 }
-#endif	/* !__ASSEMBLY__ */
 
-#ifndef __ASSEMBLY__
+#ifdef CONFIG_IA32_EMULATION
+#define __HAVE_ARCH_POST_FORK
+
+extern void arch_post_fork(struct task_struct *task);
+
+#endif /* CONFIG_IA32_EMULATION */
+
 extern void arch_task_cache_init(void);
 extern void free_thread_info(struct thread_info *ti);
 extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src);
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 0410557..a200ff3 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -86,6 +86,7 @@ obj-$(CONFIG_DOUBLEFAULT) 	+= doublefault_32.o
 obj-$(CONFIG_KGDB)		+= kgdb.o
 obj-$(CONFIG_VM86)		+= vm86_32.o
 obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o
+obj-$(CONFIG_SYSCTL)		+= syscall_restrict.o
 
 obj-$(CONFIG_HPET_TIMER) 	+= hpet.o
 obj-$(CONFIG_APB_TIMER)		+= apb_timer.o
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index e13329d..b184a45 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -474,6 +474,8 @@ ENTRY(system_call_after_swapgs)
 	movq  %rcx,RIP-ARGOFFSET(%rsp)
 	CFI_REL_OFFSET rip,RIP-ARGOFFSET
 	GET_THREAD_INFO(%rcx)
+	testl $_TIF_SYSCALL64_DENIED,TI_flags(%rcx)
+	jnz denied_sys
 	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx)
 	jnz tracesys
 system_call_fastpath:
@@ -539,8 +541,14 @@ sysret_signal:
 	jmp int_check_syscall_exit_work
 
 badsys:
-	movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
-	jmp ret_from_sys_call
+	SAVE_REST
+	movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
+	FIXUP_TOP_OF_STACK %rdi
+	movq	%rsp,%rdi        /* &pt_regs -> arg1 */
+	call do_denied_syscall
+	LOAD_ARGS ARGOFFSET, 1
+	RESTORE_REST
+	jmp	int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */
 
 #ifdef CONFIG_AUDITSYSCALL
 	/*
diff --git a/arch/x86/kernel/syscall_restrict.c b/arch/x86/kernel/syscall_restrict.c
new file mode 100644
index 0000000..a676f22
--- /dev/null
+++ b/arch/x86/kernel/syscall_restrict.c
@@ -0,0 +1,229 @@
+#include <linux/thread_info.h>
+#include <linux/pid_namespace.h>
+#include <linux/sysctl.h>
+#include <linux/kprobes.h>
+#include <asm/kdebug.h>
+#include <linux/kdebug.h>
+
+#ifdef CONFIG_IA32_EMULATION
+
+void __kprobes
+do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
+	long error_code, siginfo_t *info);
+
+asmlinkage
+void do_ia32_denied_sysenter(struct pt_regs *regs)
+{
+	current->thread.error_code = 0;
+	current->thread.trap_no = 13;
+
+	if (printk_ratelimit()) {
+		pr_err("%s[%d] attempt to use denied 32-bit sysenter, ip:%lx sp:%lx",
+			current->comm, task_pid_nr(current),
+			regs->ip, regs->sp);
+		print_vma_addr(" in ", regs->ip);
+		printk("\n");
+	}
+
+	force_sig(SIGSEGV, current);
+	return;
+
+}
+
+asmlinkage
+void do_ia32_denied_int(struct pt_regs *regs)
+{
+	if (printk_ratelimit()) {
+		pr_err("%s[%d] attempt to use denied 32-bit int80h, ip :%lx sp:%lx",
+			current->comm, task_pid_nr(current),
+			regs->ip, regs->sp);
+		print_vma_addr(" in ", regs->ip);
+		printk("\n");
+	}
+
+	do_trap(11, SIGBUS, "segment not present", regs, 0, NULL);
+}
+
+asmlinkage
+void do_denied_syscall(struct pt_regs *regs)
+{
+	siginfo_t info = {
+		.si_signo = SIGILL,
+		.si_errno = 0,
+		.si_code = ILL_ILLOPN,
+		.si_addr = (void __user *)regs->ip
+	};
+
+	if (printk_ratelimit()) {
+		pr_err("%s[%d] attempt to use denied 64-bit syscall, ip:%lx sp:%lx",
+			current->comm, task_pid_nr(current),
+			regs->ip, regs->sp);
+		print_vma_addr(" in ", regs->ip);
+		printk("\n");
+	}
+
+
+	do_trap(6, SIGILL, "invalid opcode", regs, 0, &info);
+}
+
+static int task_get_bitness(struct task_struct *task)
+{
+	if (test_ti_thread_flag(task_thread_info(task), TIF_IA32))
+		return 32;
+	else
+		return 64;
+}
+
+static bool pidns_locked(struct pid_namespace *pid_ns)
+{
+	struct thread_info *ti = task_thread_info(pid_ns->child_reaper);
+
+	return test_ti_thread_flag(ti, TIF_SYSCALL32_DENIED) ||
+	       test_ti_thread_flag(ti, TIF_SYSCALL64_DENIED);
+}
+
+static int bits_to_flags(int bits)
+{
+	if (bits == 32)
+		return TIF_SYSCALL64_DENIED;
+	else
+		return TIF_SYSCALL32_DENIED;
+}
+
+void arch_post_fork(struct task_struct *task)
+{
+	int clear_bit_nr;
+
+	if (!pidns_locked(current->nsproxy->pid_ns))
+		return;
+
+	clear_bit_nr = bits_to_flags(task_get_bitness(current));
+	set_tsk_thread_flag(task, clear_bit_nr);
+}
+
+/* Called under rcu_read_lock and write_lock_irq(tasklist) */
+static int __pidns_may_lock_bitness(struct pid_namespace *pid_ns, int bits)
+{
+	struct task_struct *task;
+	int old_bits;
+	int nr;
+
+	for (nr = next_pidmap(pid_ns, 0); nr > 0; nr = next_pidmap(pid_ns, nr)) {
+		task = pid_task(find_vpid(nr), PIDTYPE_PID);
+		if (!task)
+			continue;
+
+		old_bits = task_get_bitness(task);
+		if (old_bits != bits) {
+			pr_err("Inconsistent syscall restriction detected! "
+				"Parent ns tries to restrict syscalls to %d "
+				"bits while some task is %d bit.",
+				bits, old_bits);
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+/* Called under rcu_read_lock and write_lock_irq(tasklist) */
+static void __bitness_lock(struct pid_namespace *pid_ns, int bits)
+{
+	u32 clear_bit_nr;
+	struct task_struct *task;
+	int nr;
+
+	clear_bit_nr = bits_to_flags(bits);
+
+	for (nr = next_pidmap(pid_ns, 0); nr > 0; nr = next_pidmap(pid_ns, nr)) {
+		task = pid_task(find_vpid(nr), PIDTYPE_PID);
+		if (task)
+			set_tsk_thread_flag(task, clear_bit_nr);
+	}
+}
+
+static int bitness_lock(struct pid_namespace *pid_ns)
+{
+	int rc, new_bits;
+
+	rcu_read_lock();
+	write_lock_irq(&tasklist_lock);
+
+	new_bits = task_get_bitness(pid_ns->child_reaper);
+	rc = __pidns_may_lock_bitness(pid_ns, new_bits);
+	if (!rc)
+		__bitness_lock(pid_ns, new_bits);
+
+	write_unlock_irq(&tasklist_lock);
+	rcu_read_unlock();
+	return rc;
+}
+
+static int bitness_locked_handler(struct ctl_table *table, int write,
+				void __user *buffer, size_t *lenp,
+				loff_t *ppos)
+{
+	int rc, new_bits, old_bits;
+	struct ctl_table tbl = {
+		.procname	= table->procname,
+		.data		= &new_bits,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+	};
+
+	old_bits = new_bits = pidns_locked(current->nsproxy->pid_ns);
+	rc = proc_dointvec(&tbl, write, buffer, lenp, ppos);
+	if (rc || !write)
+		return rc;
+
+	if (!capable(CAP_SYS_ADMIN) || (new_bits == 0 && old_bits))
+		return -EACCES;
+	if (new_bits && old_bits)
+		return 0;
+	return bitness_lock(current->nsproxy->pid_ns);
+}
+
+static struct ctl_table abi_syscall_restrict[] = {
+	{
+		.procname = "bitness_locked",
+		.mode = 0644,
+		.proc_handler = bitness_locked_handler
+	},
+	{}
+};
+
+#else /* CONFIG_IA32_EMULATION */
+
+static int one = 1;
+
+static struct ctl_table abi_syscall_restrict[] = {
+	{
+		.procname	= "bitness_locked",
+		.data		= &one,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &one,
+		.extra2		= &one,
+	},
+	{}
+};
+
+#endif /* CONFIG_IA32_EMULATION */
+
+
+static struct ctl_table abi_root[] = {
+	{
+		.procname = "abi",
+		.mode = 0555,
+		.child = abi_syscall_restrict
+	},
+	{}
+};
+
+__init int syscall_restrict_init(void)
+{
+	register_sysctl_table(abi_root);
+	return 0;
+}
+device_initcall(syscall_restrict_init);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 9682ec5..a9bf9cf 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -116,7 +116,7 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
 	dec_preempt_count();
 }
 
-static void __kprobes
+void __kprobes
 do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
 	long error_code, siginfo_t *info)
 {
diff --git a/kernel/fork.c b/kernel/fork.c
index e7ceaca..55e4455 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1039,6 +1039,10 @@ static void posix_cpu_timers_init(struct task_struct *tsk)
 	INIT_LIST_HEAD(&tsk->cpu_timers[2]);
 }
 
+#ifndef __HAVE_ARCH_POST_FORK
+#define arch_post_fork(p)
+#endif
+
 /*
  * This creates a new process as a copy of the old one,
  * but does not actually start it yet.
@@ -1374,6 +1378,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	total_forks++;
 	spin_unlock(&current->sighand->siglock);
 	write_unlock_irq(&tasklist_lock);
+	arch_post_fork(p);
 	proc_fork_connector(p);
 	cgroup_post_fork(p);
 	if (clone_flags & CLONE_THREAD)
-- 

^ permalink raw reply related	[flat|nested] 32+ messages in thread

* Re: [kernel-hardening] 32/64 bitness restriction for pid namespace
  2011-08-12 12:07                   ` Vasiliy Kulikov
@ 2011-08-12 12:23                     ` Solar Designer
  2011-08-13 15:12                       ` Vasiliy Kulikov
  0 siblings, 1 reply; 32+ messages in thread
From: Solar Designer @ 2011-08-12 12:23 UTC (permalink / raw)
  To: kernel-hardening

On Fri, Aug 12, 2011 at 04:07:48PM +0400, Vasiliy Kulikov wrote:
> This is the updated version.  It tries to handle denied syscalls as if
> they are disabled (MSR/IDT entry is not initialized).  I've copied
> handlers' from interrupt handlers and removed kprobes code, which is
> looks like dead in this specific case.

I did suggest this behavior, but I think it's more important to run the
overall idea of this patch by LKML.  So please do that without spending
any further time on the code yet.  Just post this revision as an RFC to
LKML.  Thanks!

Alexander

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [kernel-hardening] 32/64 bitness restriction for pid namespace
  2011-08-12 12:23                     ` Solar Designer
@ 2011-08-13 15:12                       ` Vasiliy Kulikov
  2011-08-13 15:19                         ` Solar Designer
  0 siblings, 1 reply; 32+ messages in thread
From: Vasiliy Kulikov @ 2011-08-13 15:12 UTC (permalink / raw)
  To: kernel-hardening

Solar,

Re: slowdown - my assumptions are:

1) we don't want any slowdown for legitimate tasks - 64 bit tasks for 64
bit containers and 32 bit tasks for 32 bit containers.

2) slowdown of malicious (or broken) tasks is not important.

Looking into asm code:

ENTRY(ia32_sysenter_target)
    ...
	GET_THREAD_INFO(%r10)
    ...
	testl  $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
	CFI_REMEMBER_STATE
	jnz  sysenter_tracesys
    ...

sysenter_tracesys:
#ifdef CONFIG_AUDITSYSCALL
	testl	$(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
	jz	sysenter_auditsys
#endif
    ...
	call	syscall_trace_enter
    ...

/* work to do in syscall_trace_enter() */
#define _TIF_WORK_SYSCALL_ENTRY	\
	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_AUDIT |	\
	 _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT)

So, there is a mask, which is used to identify whether a syscall needs
additional pre/post processing.  If divide syscall_trace_enter() into 3
functions, we'll get what we want.  This will result in zero impact on
the legitimate code (relavite to current behaviour).

One drawback - *tracesys clobbers EAX/RAX, so I still have to patch asm.

Thanks,

-- 
Vasiliy

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [kernel-hardening] 32/64 bitness restriction for pid namespace
  2011-08-13 15:12                       ` Vasiliy Kulikov
@ 2011-08-13 15:19                         ` Solar Designer
  2011-08-13 16:55                           ` Vasiliy Kulikov
  0 siblings, 1 reply; 32+ messages in thread
From: Solar Designer @ 2011-08-13 15:19 UTC (permalink / raw)
  To: kernel-hardening

Vasiliy,

On Sat, Aug 13, 2011 at 07:12:20PM +0400, Vasiliy Kulikov wrote:
> Re: slowdown - my assumptions are:
> 
> 1) we don't want any slowdown for legitimate tasks - 64 bit tasks for 64
> bit containers and 32 bit tasks for 32 bit containers.
> 
> 2) slowdown of malicious (or broken) tasks is not important.

Right.

> /* work to do in syscall_trace_enter() */
> #define _TIF_WORK_SYSCALL_ENTRY	\
> 	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_AUDIT |	\
> 	 _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT)
> 
> 
> So, there is a mask, which is used to identify whether a syscall needs
> additional pre/post processing.  If divide syscall_trace_enter() into 3
> functions, we'll get what we want.  This will result in zero impact on
> the legitimate code (relavite to current behaviour).
> 
> One drawback - *tracesys clobbers EAX/RAX, so I still have to patch asm.

I haven't looked into the detail of this, but in general I like the
approach of reusing a check that is already in the code.  Please proceed
with this.

Thanks,

Alexander

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [kernel-hardening] 32/64 bitness restriction for pid namespace
  2011-08-13 15:19                         ` Solar Designer
@ 2011-08-13 16:55                           ` Vasiliy Kulikov
  2011-08-13 17:31                             ` Vasiliy Kulikov
                                               ` (2 more replies)
  0 siblings, 3 replies; 32+ messages in thread
From: Vasiliy Kulikov @ 2011-08-13 16:55 UTC (permalink / raw)
  To: kernel-hardening

Solar,

Some thoughts about prctl() approach.

I've decided to go with 2 flags of prctl() - whether 32 bit executable
is allowed on the next execve(), whether 64 bit exec is allowed.  If set
both, any bitness is allowed, and the bitness lock depends on the binary
bitness.  If none set, don't lock at all.

Questions here:

1) If execve() fails, e.g. because of missing binary, drop the flag or
keep it?  I think dropping is safer.

2) If the binary is non-ELF, e.g. a misc binary?  I think execve()
should fail as we expect to run 64/32 bit ELF.

Thanks,

-- 
Vasiliy

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [kernel-hardening] 32/64 bitness restriction for pid namespace
  2011-08-13 16:55                           ` Vasiliy Kulikov
@ 2011-08-13 17:31                             ` Vasiliy Kulikov
  2011-08-13 19:25                               ` Solar Designer
  2011-08-13 19:22                             ` Solar Designer
  2011-08-14  9:50                             ` Solar Designer
  2 siblings, 1 reply; 32+ messages in thread
From: Vasiliy Kulikov @ 2011-08-13 17:31 UTC (permalink / raw)
  To: kernel-hardening

Solar,

On Sat, Aug 13, 2011 at 20:55 +0400, Vasiliy Kulikov wrote:
> Some thoughts about prctl() approach.

Also, to keep the code clean and small, I think sysctl interface should
be dropped.  The only use case is locking a live container, which is
probably a very limited thing.  Also it will fail if any unprivileged task
already runs a binary of other bitness anyway.

Do you agree?

Thanks,

-- 
Vasiliy Kulikov
http://www.openwall.com - bringing security into open computing environments

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [kernel-hardening] 32/64 bitness restriction for pid namespace
  2011-08-13 16:55                           ` Vasiliy Kulikov
  2011-08-13 17:31                             ` Vasiliy Kulikov
@ 2011-08-13 19:22                             ` Solar Designer
  2011-08-14  9:50                             ` Solar Designer
  2 siblings, 0 replies; 32+ messages in thread
From: Solar Designer @ 2011-08-13 19:22 UTC (permalink / raw)
  To: kernel-hardening

Vasiliy,

On Sat, Aug 13, 2011 at 08:55:02PM +0400, Vasiliy Kulikov wrote:
> I've decided to go with 2 flags of prctl() - whether 32 bit executable
> is allowed on the next execve(), whether 64 bit exec is allowed.  If set
> both, any bitness is allowed, and the bitness lock depends on the binary
> bitness.  If none set, don't lock at all.

Sounds good.

> 1) If execve() fails, e.g. because of missing binary, drop the flag or
> keep it?  I think dropping is safer.

I wouldn't call this safer, but it does feel more logical.

> 2) If the binary is non-ELF, e.g. a misc binary?  I think execve()
> should fail as we expect to run 64/32 bit ELF.

This makes sense to me.

Alexander

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [kernel-hardening] 32/64 bitness restriction for pid namespace
  2011-08-13 17:31                             ` Vasiliy Kulikov
@ 2011-08-13 19:25                               ` Solar Designer
  0 siblings, 0 replies; 32+ messages in thread
From: Solar Designer @ 2011-08-13 19:25 UTC (permalink / raw)
  To: kernel-hardening

On Sat, Aug 13, 2011 at 09:31:18PM +0400, Vasiliy Kulikov wrote:
> Also, to keep the code clean and small, I think sysctl interface should
> be dropped.  The only use case is locking a live container, which is
> probably a very limited thing.  Also it will fail if any unprivileged task
> already runs a binary of other bitness anyway.
> 
> Do you agree?

Yes.

BTW, another user of this feature may be vsftpd - but I guess it will
simply be enhanced to use our prctl().

Alexander

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [kernel-hardening] 32/64 bitness restriction for pid namespace
  2011-08-13 16:55                           ` Vasiliy Kulikov
  2011-08-13 17:31                             ` Vasiliy Kulikov
  2011-08-13 19:22                             ` Solar Designer
@ 2011-08-14  9:50                             ` Solar Designer
  2011-08-14 10:16                               ` Vasiliy Kulikov
  2 siblings, 1 reply; 32+ messages in thread
From: Solar Designer @ 2011-08-14  9:50 UTC (permalink / raw)
  To: kernel-hardening; +Cc: Chris Evans, djm

Vasiliy,

On Sat, Aug 13, 2011 at 08:55:02PM +0400, Vasiliy Kulikov wrote:
> Some thoughts about prctl() approach.

Here's a further thought:

The feature makes sense not only for pid namespaces, but also for some
daemons, and especially for their privsep child processes - regardless
of whether they make use of namespaces (vsftpd) or not (openssh).

So maybe the restriction should apply to a process tree rather than to a
namespace?  Sure, in some cases it could be bypassed via ptrace(), but
the intended uses would be either in containers (separate pid namespace)
or in daemons' privsep child processes, which are hopefully running as a
pseudo-user dedicated to the daemon - so can't ptrace() anything else
because of uid mismatch, and can't ptrace() other instances because of
their dumpable flag having been reset on setuid().

Specifically, the prctl() could set your two proposed flag bits
affecting the following execve(), but it could also set a third bit,
which, when set, would immediately lock the current process to its
current bitness (or to current ABI in general) and would also lock
execve()'d binaries in the same way (although in practice such
processes will tend to be chrooted to /var/empty).

So we have three bits:

bit 0 - lock next execve() to 32-bit
bit 1 - lock next execve() to 64-bit
bit 2 - lock current process and execve() to current bitness/ABI

Specific bit combinations are interpreted as follows:

0 - no locking (current behavior)
1 - lock next execve() to 32-bit, fail it if not 32-bit ELF
2 - lock next execve() to 64-bit, fail it if not 64-bit ELF
3 - lock the next execve()'d process to its actual bitness
4 - lock current process and execve() to current bitness
5 - lock current process to current bitness, but execve() to 32-bit ELF
6 - lock current process to current bitness, but execve() to 64-bit ELF
7 - lock current process to current bitness, but next execve() to its actual bitness

5 to 7 are probably not very useful, but are defined for consistency.

Maybe when bit 2 is set, it should also disable ptrace() for the current
process (so that it can't ptrace() other processes even of matching uid).

How does this sound to you?

Concern/to-do: need to figure out how x32 and other ABIs fit into this.

Alexander

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [kernel-hardening] 32/64 bitness restriction for pid namespace
  2011-08-14  9:50                             ` Solar Designer
@ 2011-08-14 10:16                               ` Vasiliy Kulikov
  2011-08-14 11:29                                 ` Solar Designer
  0 siblings, 1 reply; 32+ messages in thread
From: Vasiliy Kulikov @ 2011-08-14 10:16 UTC (permalink / raw)
  To: kernel-hardening; +Cc: Chris Evans, djm

Solar,

On Sun, Aug 14, 2011 at 13:50 +0400, Solar Designer wrote:
> On Sat, Aug 13, 2011 at 08:55:02PM +0400, Vasiliy Kulikov wrote:
> > Some thoughts about prctl() approach.
> 
> Here's a further thought:
> 
> The feature makes sense not only for pid namespaces, but also for some
> daemons, and especially for their privsep child processes - regardless
> of whether they make use of namespaces (vsftpd) or not (openssh).

My current non-posted-yet patch handles the current process rather than
a namespace.  It sets a task-related flag(s), which is checked/dropped on
execve().  On execve() an actual locking is handled.  A locked process
may not drop the lock.

long arch_prctl(int option, unsigned long arg2, unsigned long arg3,
		unsigned long arg4, unsigned long arg5)
{
	switch (option) {
		case PR_BITNESS_LOCK_ON_EXEC:
			if (!capable(CAP_SYS_ADMIN))
				return -EACCES;
			return bitness_set_lock_on_exec(arg3, !!arg2);
            ...

int bitness_set_lock_on_exec(int bits, int val)
{
	switch (bits) {
	case 32:
		current->bitness.lock_32 = val;
		break;
	case 64:
		current->bitness.lock_64 = val;
		break;
	default:
		return -EINVAL;
	}

	return 0;
}

void arch_post_exec_elf(int retval, int elf_class)
{
	if (retval == 0 && current.bitness.lock) {
		int bits = (elf_class == ELFCLASS32) ? 32 : 64;
		__task_bitness_lock(current, bits);
	}
}

The only thing the patch lacks is an ability to lock the task immediately.

> So maybe the restriction should apply to a process tree rather than to a
> namespace?

A process tree is a horrible term.  What if some elements of fork chain
died?

> 5 - lock current process to current bitness, but execve() to 32-bit ELF
> 6 - lock current process to current bitness, but execve() to 64-bit ELF
> 7 - lock current process to current bitness, but next execve() to its actual bitness
> 
> 5 to 7 are probably not very useful, but are defined for consistency.

These are confusing and IMO meaningless modes.

I think a simple way to go is an addition of PR_BITNESS_LOCK prctl() option,
which calls __task_bitness_lock(current, current_bitness()).

Also, I try to handle syscalls as if they are not setup, but there are
trivial ways to do something more than just 32 bit task of 32 bit kernel
or 64 bit task of 64 bit kernel with IA32_EMULATION=n.  The obvious way
is using CS segment of another bitness.  I bet procfs has something
similar.  64 bit locking is rather simple as grep CONFIG_IA32_EMULATION
shows only tens of lines (so, it can be fixed), but emulated 32 bit task
might significantly differ from usual 32 task on 32 bit kernel.

Thanks,

-- 
Vasiliy

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [kernel-hardening] 32/64 bitness restriction for pid namespace
  2011-08-14 10:16                               ` Vasiliy Kulikov
@ 2011-08-14 11:29                                 ` Solar Designer
  2011-08-14 11:55                                   ` Vasiliy Kulikov
  0 siblings, 1 reply; 32+ messages in thread
From: Solar Designer @ 2011-08-14 11:29 UTC (permalink / raw)
  To: kernel-hardening; +Cc: Chris Evans, djm

Vasiliy,

On Sun, Aug 14, 2011 at 02:16:58PM +0400, Vasiliy Kulikov wrote:
> My current non-posted-yet patch handles the current process rather than
> a namespace.  It sets a task-related flag(s), which is checked/dropped on
> execve().  On execve() an actual locking is handled.  A locked process
> may not drop the lock.

Sounds good.

> On Sun, Aug 14, 2011 at 13:50 +0400, Solar Designer wrote:
> > So maybe the restriction should apply to a process tree rather than to a
> > namespace?
> 
> A process tree is a horrible term.  What if some elements of fork chain
> died?

Then it's a torn tree. ;-)  OK, let's not use this term.

> > 5 - lock current process to current bitness, but execve() to 32-bit ELF
> > 6 - lock current process to current bitness, but execve() to 64-bit ELF
> > 7 - lock current process to current bitness, but next execve() to its actual bitness
> > 
> > 5 to 7 are probably not very useful, but are defined for consistency.
> 
> These are confusing and IMO meaningless modes.

I agree.

> I think a simple way to go is an addition of PR_BITNESS_LOCK prctl() option,
> which calls __task_bitness_lock(current, current_bitness()).

I am fine with this.  I don't know which approach the LKML folks will
like best.

> Also, I try to handle syscalls as if they are not setup, but there are
> trivial ways to do something more than just 32 bit task of 32 bit kernel
> or 64 bit task of 64 bit kernel with IA32_EMULATION=n.  The obvious way
> is using CS segment of another bitness.  I bet procfs has something
> similar.  64 bit locking is rather simple as grep CONFIG_IA32_EMULATION
> shows only tens of lines (so, it can be fixed), but emulated 32 bit task
> might significantly differ from usual 32 task on 32 bit kernel.

OK, you don't have to emulate the exact same behavior.  Maybe ENOSYS
like you implemented initially would be fine.

Alexander

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [kernel-hardening] 32/64 bitness restriction for pid namespace
  2011-08-14 11:29                                 ` Solar Designer
@ 2011-08-14 11:55                                   ` Vasiliy Kulikov
  2011-08-14 12:04                                     ` Solar Designer
  0 siblings, 1 reply; 32+ messages in thread
From: Vasiliy Kulikov @ 2011-08-14 11:55 UTC (permalink / raw)
  To: kernel-hardening; +Cc: Chris Evans, djm

Solar,

On Sun, Aug 14, 2011 at 15:29 +0400, Solar Designer wrote:
> > I think a simple way to go is an addition of PR_BITNESS_LOCK prctl() option,
> > which calls __task_bitness_lock(current, current_bitness()).
> 
> I am fine with this.  I don't know which approach the LKML folks will
> like best.

Btw, it can be even simplier.  If we use only one flag - lock to the
current bitness - then the code is greatly simplified.  The same
behaviour as with 3 flags can be achieved with binary helpers:

1) vzctl wants to create CT 101 with specific bitness.  If it is 64, it
simply calls prctl(LOCK_BITNESS) and execve's init.  If it is 32, it
exec's small 32 bit helper binary that does the same job, but as 32
bits.  It is compiled from the same source files, so the helper creation
process is trivial.

2) vzctl wants to create CT 101 with the bitness its /sbin/init is.
Then it just looks at /sbin/init and does (1) steps.  There is a race
compared to 3-flags prctl(), but it is not important here.  If init
binary is changed during the startup, something already goes wrong.

3) vsftpd/sshd wants to lock bitness of current task.  It just uses
prctl(LOCK_BITNESS).

We loose a unlock behaviour if execve fails, but it is IMO a very minor
issue.

> > Also, I try to handle syscalls as if they are not setup, but there are
> > trivial ways to do something more than just 32 bit task of 32 bit kernel
> > or 64 bit task of 64 bit kernel with IA32_EMULATION=n.  The obvious way
> > is using CS segment of another bitness.  I bet procfs has something
> > similar.  64 bit locking is rather simple as grep CONFIG_IA32_EMULATION
> > shows only tens of lines (so, it can be fixed), but emulated 32 bit task
> > might significantly differ from usual 32 task on 32 bit kernel.
> 
> OK, you don't have to emulate the exact same behavior.  Maybe ENOSYS
> like you implemented initially would be fine.

Hmm, so you say such emulation is not needed?  Then I can remove 3 C
functions emulating interrupts handlers, 4 asm code chunks to call C
functions, and just patch ptrace_syscall_enter() with 2 line patch to
return -ENOSYS.  It will reduce the patch size by >50% ;)

The only thing is that some programs may do syscalls of other bitness
and get -ENOSYS when the syscall cannot fail and always present (e.g.
getpid()).  But IMO programs willing to do such syscalls are broken.

ENOSYS/kill() differ in kernel policy of handling tasks doing something
wrong.  If we assume syscalls of other bitness are denied and programs
doing it are broken, SIGKILL/SIGSEGV is just OK.  If we assume a task
may "probe" and fallback to something else (which is I'm VERY sceptic),
emulation of absent syscall can be applied (either full emulation or
sending a process'able signal).  If we are cruel and may not tolerate
exploitation attempts, we may kill a process and lock the user as
grsecurity does for similar things.   I think a simple SIGKILL is enough
- it's simple, unambiguous, and is consistent with existing seccomp
behaviour.

-- 
Vasiliy

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [kernel-hardening] 32/64 bitness restriction for pid namespace
  2011-08-14 11:55                                   ` Vasiliy Kulikov
@ 2011-08-14 12:04                                     ` Solar Designer
  2011-08-14 12:16                                       ` Vasiliy Kulikov
  2011-08-15 15:38                                       ` Vasiliy Kulikov
  0 siblings, 2 replies; 32+ messages in thread
From: Solar Designer @ 2011-08-14 12:04 UTC (permalink / raw)
  To: kernel-hardening; +Cc: Chris Evans, djm

Vasiliy,

On Sun, Aug 14, 2011 at 03:55:50PM +0400, Vasiliy Kulikov wrote:
> Btw, it can be even simplier.  If we use only one flag - lock to the
> current bitness - then the code is greatly simplified.  The same
> behaviour as with 3 flags can be achieved with binary helpers:

I dislike this.  Please implement extra flags instead.

> 1) vzctl wants to create CT 101 with specific bitness.  If it is 64, it
> simply calls prctl(LOCK_BITNESS) and execve's init.  If it is 32, it
> exec's small 32 bit helper binary that does the same job, but as 32
> bits.  It is compiled from the same source files, so the helper creation
> process is trivial.

I'd rather have a few extra lines of code in the kernel.

> 2) vzctl wants to create CT 101 with the bitness its /sbin/init is.
> Then it just looks at /sbin/init and does (1) steps.

"Looking at" /sbin/init (checking the ELF header?) sounds risky to me.
This depends on when vzctl does that, though (what privileges it still
has at that point).

> > OK, you don't have to emulate the exact same behavior.  Maybe ENOSYS
> > like you implemented initially would be fine.
> 
> Hmm, so you say such emulation is not needed?

I was hoping it'd be simpler, but if it turns out to be non-trivial,
then you can drop it.

And I agree that SIGKILL would be slightly better than -ENOSYS.

Alexander

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [kernel-hardening] 32/64 bitness restriction for pid namespace
  2011-08-14 12:04                                     ` Solar Designer
@ 2011-08-14 12:16                                       ` Vasiliy Kulikov
  2011-08-15 15:38                                       ` Vasiliy Kulikov
  1 sibling, 0 replies; 32+ messages in thread
From: Vasiliy Kulikov @ 2011-08-14 12:16 UTC (permalink / raw)
  To: kernel-hardening; +Cc: Chris Evans, djm

Solar,

On Sun, Aug 14, 2011 at 16:04 +0400, Solar Designer wrote:
> On Sun, Aug 14, 2011 at 03:55:50PM +0400, Vasiliy Kulikov wrote:
> > Btw, it can be even simplier.  If we use only one flag - lock to the
> > current bitness - then the code is greatly simplified.  The same
> > behaviour as with 3 flags can be achieved with binary helpers:
> 
> I dislike this.  Please implement extra flags instead.
> 
> > 1) vzctl wants to create CT 101 with specific bitness.  If it is 64, it
> > simply calls prctl(LOCK_BITNESS) and execve's init.  If it is 32, it
> > exec's small 32 bit helper binary that does the same job, but as 32
> > bits.  It is compiled from the same source files, so the helper creation
> > process is trivial.
> 
> I'd rather have a few extra lines of code in the kernel.
> 
> > 2) vzctl wants to create CT 101 with the bitness its /sbin/init is.
> > Then it just looks at /sbin/init and does (1) steps.
> 
> "Looking at" /sbin/init (checking the ELF header?) sounds risky to me.
> This depends on when vzctl does that, though (what privileges it still
> has at that point).

Then probably we don't need "lock to any bitness on exec" at all?  Only
explicit settings in vz config file.  Configs are anyway dependant on
the contained in the container distro.  The setup will happen only once
(at the container setup), but this way of handling removes some code
from the kernel critical pathes.

What's the global role of "lock to any bitness on exec"?  I can see only
one significant case - have one generic config for containers, which
doesn't depend on the bitness.  But then it is still needs some manual
changes because of the CT root compromize threat (unless we don't think
about CT root compromize, only CT daemons/users compromize).

Thanks,

-- 
Vasiliy

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [kernel-hardening] 32/64 bitness restriction for pid namespace
  2011-08-14 12:04                                     ` Solar Designer
  2011-08-14 12:16                                       ` Vasiliy Kulikov
@ 2011-08-15 15:38                                       ` Vasiliy Kulikov
  2011-08-15 21:33                                         ` Solar Designer
                                                           ` (2 more replies)
  1 sibling, 3 replies; 32+ messages in thread
From: Vasiliy Kulikov @ 2011-08-15 15:38 UTC (permalink / raw)
  To: kernel-hardening

Solar,

Given the latest discussion on LKML we're unlikely to push the
restriction upstream.  So, I'm posting the patch here.

I've implemented immediate locking, lock on 32 exec, lock on 64 bit
exec.  A test program can be downloaded from this link:

ftp://ftp.ru.openwall.com/pvt/segoon/pub/bitness-lock-test/

The demonstration:

    + ./lock32
    My bitness is: 32
    Try to do 32 bit syscall...
    getpid() = 2355
    Try to do 64 bit syscall...
    getpid() = 2355
    + ./lock64
    My bitness is: 64
    Try to do 32 bit syscall...
    getpid() = 2356
    Try to do 64 bit syscall...
    getpid() = 2356
    + ./lock32 -l
    My bitness is: 32
    lock32: prctl(PR_BITNESS_LOCK): Permission denied
    + sudo ./lock32 -l
    My bitness is: 32
    Locked.
    Try to do 32 bit syscall...
    getpid() = 2358
    Try to do 64 bit syscall...
    ./test.sh: строка 6:  2358 Убито              sudo ./lock32 -l
    + sudo ./lock64 -l
    My bitness is: 64
    Locked.
    Try to do 32 bit syscall...
    ./test.sh: строка 7:  2359 Убито              sudo ./lock64 -l
    + sudo ./lock32 -e32 ./lock32
    My bitness is: 32
    Lock to 32 bits on exec.
    Try to do execvp()...
    My bitness is: 32
    Try to do 32 bit syscall...
    getpid() = 2360
    Try to do 64 bit syscall...
    ./test.sh: строка 9:  2360 Убито              sudo ./lock32 -e32 ./lock32
    + sudo ./lock32 -e32 ./lock64
    My bitness is: 32
    Lock to 32 bits on exec.
    Try to do execvp()...
    ./lock64: 1: : not found
    ./lock64: 1: @: not found
    ./lock64: 1: : not found
    ./lock64: 1:ELF\x02\x02: not found
    ./lock64: 2: Syntax error: "(" unexpected
    + sudo ./lock32 -e64 ./lock32
    My bitness is: 32
    Lock to 64 bits on exec.
    Try to do execvp()...
    ./lock32: 1: Syntax error: "(" unexpected
    + sudo ./lock32 -e64 ./lock64
    My bitness is: 32
    Lock to 64 bits on exec.
    Try to do execvp()...
    My bitness is: 64
    Try to do 32 bit syscall...
    ./test.sh: строка 12:  2370 Убито              sudo ./lock32 -e64 ./lock64
    + sudo ./lock64 -e32 ./lock32
    My bitness is: 64
    Lock to 32 bits on exec.
    Try to do execvp()...
    My bitness is: 32
    Try to do 32 bit syscall...
    getpid() = 2371
    Try to do 64 bit syscall...
    ./test.sh: строка 14:  2371 Убито              sudo ./lock64 -e32 ./lock32
    + sudo ./lock64 -e32 ./lock64
    My bitness is: 64
    Lock to 32 bits on exec.
    Try to do execvp()...
    ./lock64: 1: : not found
    ./lock64: 1: @: not found
    ./lock64: 1: : not found
    ./lock64: 1:ELF\x02\x02: not found
    ./lock64: 2: Syntax error: "(" unexpected
    + sudo ./lock64 -e64 ./lock32
    My bitness is: 64
    Lock to 64 bits on exec.
    Try to do execvp()...
    ./lock32: 1: Syntax error: "(" unexpected
    + sudo ./lock64 -e64 ./lock64
    My bitness is: 64
    Lock to 64 bits on exec.
    Try to do execvp()...
    My bitness is: 64
    Try to do 32 bit syscall...
    ./test.sh: строка 17:  2381 Убито              sudo ./lock64 -e64 ./lock64
    + sudo ./lock32 -e ./lock32
    My bitness is: 32
    Locked.
    Try to do execvp()...
    My bitness is: 32
    Try to do 32 bit syscall...
    getpid() = 2382
    Try to do 64 bit syscall...
    ./test.sh: строка 19:  2382 Убито              sudo ./lock32 -e ./lock32
    + sudo ./lock32 -e ./lock64
    My bitness is: 32
    Locked.
    Try to do execvp()...
    lock32: execvp: Exec format error
    Second attempt ...
    lock32: execvp: Exec format error
    + sudo ./lock64 -e ./lock32
    My bitness is: 64
    Locked.
    Try to do execvp()...
    ./lock32: 1: Syntax error: "(" unexpected
    + sudo ./lock64 -e ./lock64
    My bitness is: 64
    Locked.
    Try to do execvp()...
    My bitness is: 64
    Try to do 32 bit syscall...
    ./test.sh: строка 22:  2395 Убито              sudo ./lock64 -e ./lock64

Note the strage output of -e32 lock64, -e64 lock32, lock64 -e lock32.
There is a major problem with lock on exec (ptrace output):

    execve("./lock32", ["./lock32", "-e64", "./lock32"], [/* 17 vars */]) = 0
    ...
    prctl(0x23 /* PR_??? */, 0x1, 0x40, 0, 0) = 0
    execve("./lock32", ["./lock32"], [/* 28 vars */]) = -1 ENOEXEC (Exec format error)
    execve("/bin/sh", ["/bin/sh", "./lock32"], [/* 28 vars */]) = 0
    brk(0)                                  = 0x2447000
    ...

So, library function tries to run /bin/sh if no kernel interpreter is
found.  As the first execve(2) failed, the lock on exec is not forced
anymore, but from the application point of view it is the only execve().
For -e lock32 the expectation is not broken, but 32bit ELF is still tried to
be passed to 64bit /bin/sh.

My point is still that we should keep the only flag - lock current
process and implement simple re-exec of vzctl.  But other ways like
workaround of multiple execve() calls are welcome.

--
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index a0e866d..c7960f3 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -152,7 +152,7 @@ ENTRY(ia32_sysenter_target)
  	.previous	
 	GET_THREAD_INFO(%r10)
 	orl    $TS_COMPAT,TI_status(%r10)
-	testl  $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
+	testl  $_TIF_WORK_SYSCALL_ENTRY_32,TI_flags(%r10)
 	CFI_REMEMBER_STATE
 	jnz  sysenter_tracesys
 	cmpq	$(IA32_NR_syscalls-1),%rax
@@ -238,7 +238,7 @@ sysexit_audit:
 
 sysenter_tracesys:
 #ifdef CONFIG_AUDITSYSCALL
-	testl	$(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
+	testl	$(_TIF_WORK_SYSCALL_ENTRY_32 & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
 	jz	sysenter_auditsys
 #endif
 	SAVE_REST
@@ -311,7 +311,7 @@ ENTRY(ia32_cstar_target)
 	.previous	
 	GET_THREAD_INFO(%r10)
 	orl   $TS_COMPAT,TI_status(%r10)
-	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
+	testl $_TIF_WORK_SYSCALL_ENTRY_32,TI_flags(%r10)
 	CFI_REMEMBER_STATE
 	jnz   cstar_tracesys
 	cmpq $IA32_NR_syscalls-1,%rax
@@ -355,7 +355,7 @@ sysretl_audit:
 
 cstar_tracesys:
 #ifdef CONFIG_AUDITSYSCALL
-	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
+	testl $(_TIF_WORK_SYSCALL_ENTRY_32 & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
 	jz cstar_auditsys
 #endif
 	xchgl %r9d,%ebp
@@ -422,7 +422,7 @@ ENTRY(ia32_syscall)
 	SAVE_ARGS 0,1,0
 	GET_THREAD_INFO(%r10)
 	orl   $TS_COMPAT,TI_status(%r10)
-	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
+	testl $_TIF_WORK_SYSCALL_ENTRY_32,TI_flags(%r10)
 	jnz ia32_tracesys
 	cmpq $(IA32_NR_syscalls-1),%rax
 	ja ia32_badsys
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index f2ad216..b292580 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -320,4 +320,10 @@ extern int syscall32_setup_pages(struct linux_binprm *, int exstack);
 extern unsigned long arch_randomize_brk(struct mm_struct *mm);
 #define arch_randomize_brk arch_randomize_brk
 
+extern void arch_post_exec_elf(int retval, int elf_class);
+#define arch_post_exec_elf arch_post_exec_elf
+
+extern void arch_post_execve(void);
+#define arch_post_execve arch_post_execve
+
 #endif /* _ASM_X86_ELF_H */
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 94e7618..e95986e 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -281,6 +281,12 @@ extern int do_get_thread_area(struct task_struct *p, int idx,
 extern int do_set_thread_area(struct task_struct *p, int idx,
 			      struct user_desc __user *info, int can_allocate);
 
+extern int syscall_bitness_check(void);
+
+extern long arch_prctl(int option, unsigned long arg2, unsigned long arg3,
+		unsigned long arg4, unsigned long arg5);
+#define arch_prctl arch_prctl
+
 #endif /* __KERNEL__ */
 
 #endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index a1fe5c1..c4b5464 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -95,6 +95,8 @@ struct thread_info {
 #define TIF_BLOCKSTEP		25	/* set when we want DEBUGCTLMSR_BTF */
 #define TIF_LAZY_MMU_UPDATES	27	/* task is updating the mmu lazily */
 #define TIF_SYSCALL_TRACEPOINT	28	/* syscall tracepoint instrumentation */
+#define TIF_SYSCALL32_DENIED	29	/* 32 bit syscalls are denied */
+#define TIF_SYSCALL64_DENIED	30	/* 64 bit syscalls are denied */
 
 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
@@ -117,11 +119,20 @@ struct thread_info {
 #define _TIF_BLOCKSTEP		(1 << TIF_BLOCKSTEP)
 #define _TIF_LAZY_MMU_UPDATES	(1 << TIF_LAZY_MMU_UPDATES)
 #define _TIF_SYSCALL_TRACEPOINT	(1 << TIF_SYSCALL_TRACEPOINT)
+#define _TIF_SYSCALL32_DENIED	(1 << TIF_SYSCALL32_DENIED)
+#define _TIF_SYSCALL64_DENIED	(1 << TIF_SYSCALL64_DENIED)
 
 /* work to do in syscall_trace_enter() */
-#define _TIF_WORK_SYSCALL_ENTRY	\
+#define _TIF_WORK_SYSCALL_ENTRY_64	\
 	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_AUDIT |	\
-	 _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT)
+	 _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT |	\
+	 _TIF_SYSCALL64_DENIED)
+
+/* work to do in syscall_trace_enter() */
+#define _TIF_WORK_SYSCALL_ENTRY_32	\
+	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_AUDIT |	\
+	 _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT |	\
+	 _TIF_SYSCALL32_DENIED)
 
 /* work to do in syscall_trace_leave() */
 #define _TIF_WORK_SYSCALL_EXIT	\
@@ -259,9 +270,22 @@ static inline void set_restore_sigmask(void)
 	ti->status |= TS_RESTORE_SIGMASK;
 	set_bit(TIF_SIGPENDING, (unsigned long *)&ti->flags);
 }
-#endif	/* !__ASSEMBLY__ */
 
-#ifndef __ASSEMBLY__
+#ifdef CONFIG_IA32_EMULATION
+
+extern void arch_post_fork(struct task_struct *task);
+#define arch_post_fork arch_post_fork
+
+#define BITNESS_LOCK_32 1
+#define BITNESS_LOCK_64 2
+
+struct bitness_lock_on_exec {
+	int lock;
+};
+#define bitness_lock_on_exec bitness_lock_on_exec
+
+#endif /* CONFIG_IA32_EMULATION */
+
 extern void arch_task_cache_init(void);
 extern void free_thread_info(struct thread_info *ti);
 extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src);
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 0410557..a200ff3 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -86,6 +86,7 @@ obj-$(CONFIG_DOUBLEFAULT) 	+= doublefault_32.o
 obj-$(CONFIG_KGDB)		+= kgdb.o
 obj-$(CONFIG_VM86)		+= vm86_32.o
 obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o
+obj-$(CONFIG_SYSCTL)		+= syscall_restrict.o
 
 obj-$(CONFIG_HPET_TIMER) 	+= hpet.o
 obj-$(CONFIG_APB_TIMER)		+= apb_timer.o
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index e13329d..77534b7 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -474,7 +474,7 @@ ENTRY(system_call_after_swapgs)
 	movq  %rcx,RIP-ARGOFFSET(%rsp)
 	CFI_REL_OFFSET rip,RIP-ARGOFFSET
 	GET_THREAD_INFO(%rcx)
-	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx)
+	testl $_TIF_WORK_SYSCALL_ENTRY_64,TI_flags(%rcx)
 	jnz tracesys
 system_call_fastpath:
 	cmpq $__NR_syscall_max,%rax
@@ -578,7 +578,7 @@ sysret_audit:
 	/* Do syscall tracing */
 tracesys:
 #ifdef CONFIG_AUDITSYSCALL
-	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
+	testl $(_TIF_WORK_SYSCALL_ENTRY_64 & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
 	jz auditsys
 #endif
 	SAVE_REST
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 8252879..39d0a85 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1378,7 +1378,16 @@ long syscall_trace_enter(struct pt_regs *regs)
 	if (test_thread_flag(TIF_SINGLESTEP))
 		regs->flags |= X86_EFLAGS_TF;
 
-	/* do the secure computing check first */
+	/*
+	 * Do the syscall bitness check first.
+	 *
+	 * If the bitness is denied, exit immediatelly to reduce
+	 * the size of executed code.
+	 */
+	if (syscall_bitness_check())
+		return -1L;
+
+	/* Then check the syscall number. */
 	secure_computing(regs->orig_ax);
 
 	if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
diff --git a/arch/x86/kernel/syscall_restrict.c b/arch/x86/kernel/syscall_restrict.c
new file mode 100644
index 0000000..6001b3a
--- /dev/null
+++ b/arch/x86/kernel/syscall_restrict.c
@@ -0,0 +1,200 @@
+#include <linux/thread_info.h>
+#include <linux/pid_namespace.h>
+#include <linux/sysctl.h>
+#include <linux/kprobes.h>
+#include <linux/ratelimit.h>
+#include <linux/printk.h>
+#include <linux/kdebug.h>
+#include <linux/elf.h>
+#include <linux/prctl.h>
+#include <linux/binfmts.h>
+#include <asm/kdebug.h>
+#include <asm/compat.h>
+
+#ifdef CONFIG_IA32_EMULATION
+
+int syscall_bitness_check(void)
+{
+	int flag;
+
+	if (is_compat_task())
+		flag = TIF_SYSCALL32_DENIED;
+	else
+		flag = TIF_SYSCALL64_DENIED;
+
+	if (test_thread_flag(flag)) {
+		pr_err_ratelimited("%s[%d]: attempt to do a syscall of denied "
+			"bitness\n", current->comm, task_pid_nr(current));
+		force_sig(SIGKILL, current);
+		return -1;
+	}
+
+	return 0;
+}
+
+static int bits_to_flags(int bits)
+{
+	switch (bits) {
+	case 32:
+		return TIF_SYSCALL64_DENIED;
+	case 64:
+		return TIF_SYSCALL32_DENIED;
+	default:
+		return -EINVAL;
+	}
+}
+
+static int __bitness_lock(int bits)
+{
+	int clear_bit_nr = bits_to_flags(bits);
+
+	if (clear_bit_nr < 0)
+		return clear_bit_nr;
+
+	set_tsk_thread_flag(current, clear_bit_nr);
+	return 0;
+}
+
+static int bitness_set_lock_on_exec(int bits, int val)
+{
+	int mask;
+
+	switch (bits) {
+	case 32:
+		mask = BITNESS_LOCK_32;
+		break;
+	case 64:
+		mask = BITNESS_LOCK_64;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	current->bitness_lock_on_exec.lock &= ~mask;
+	if (val)
+		current->bitness_lock_on_exec.lock |= mask;
+
+	return 0;
+}
+
+static bool elf_may_exec(void)
+{
+	if (test_thread_flag(TIF_SYSCALL64_DENIED))
+		return false;
+
+	/* Either we're don't want to be locked or
+	 * we're ok to lock to the ELF's bitness */
+	if (current->bitness_lock_on_exec.lock == 0 ||
+	    current->bitness_lock_on_exec.lock & BITNESS_LOCK_64)
+		return true;
+
+	return false;
+}
+
+bool compat_elf_may_exec(void)
+{
+	if (test_thread_flag(TIF_SYSCALL32_DENIED))
+		return false;
+
+	/* Either we're don't want to be locked or
+	 * we're ok to lock to the ELF's bitness */
+	if (current->bitness_lock_on_exec.lock == 0 ||
+	    current->bitness_lock_on_exec.lock & BITNESS_LOCK_32)
+		return true;
+
+	return false;
+}
+
+extern struct linux_binfmt elf_format, compat_elf_format, script_format;
+
+bool arch_check_interpreter(struct linux_binfmt *fmt)
+{
+	if (fmt == &compat_elf_format)
+		return compat_elf_may_exec();
+
+	if (fmt == &elf_format)
+		return elf_may_exec();
+
+	/* We're ok with loading script, which interpreter is legitimate ELF */
+	if (fmt == &script_format)
+		return true;
+
+	if (current->bitness_lock_on_exec.lock == 0)
+		return true;
+
+	return false;
+}
+
+/*
+ * We cannot do it in arch_post_exec_elf() as it can be called from
+ * binfmt_script's handler, which may fail. If the call sequence is
+ *
+ *  binfmt_script -> binfmt_elf => FAIL
+ *  binfmt_elf => OK
+ *
+ * We should be locked.  To keep code simple, we just clear ->.bitness.lock
+ * on each execve() regardless of return code.
+ */
+void arch_post_execve(void)
+{
+	current->bitness_lock_on_exec.lock = 0;
+}
+
+void arch_post_exec_elf(int retval, int elf_class)
+{
+	if (retval == 0 && current->bitness_lock_on_exec.lock) {
+		int bits = (elf_class == ELFCLASS32) ? 32 : 64;
+		__bitness_lock(bits);
+	}
+}
+
+#else /* CONFIG_IA32_EMULATION */
+
+bool arch_check_interpreter(struct linux_binfmt *fmt) { return true; }
+void arch_post_execve(void) {}
+void arch_post_exec_elf(int retval, int elf_class) {}
+
+static int bitness_set_lock_on_exec(int bits, int val)
+{
+	if (bits == 64)
+		return 0;
+	else
+		return -EINVAL;
+}
+
+static int __bitness_lock(int bits)
+{
+	if (bits == 64)
+		return 0;
+	else
+		return -EINVAL;
+}
+
+#endif /* CONFIG_IA32_EMULATION */
+
+int current_bitness(void)
+{
+#ifdef CONFIG_IA32_EMULATION
+	if (test_thread_flag(TIF_IA32))
+		return 32;
+	else
+#endif
+		return 64;
+}
+
+long arch_prctl(int option, unsigned long arg2, unsigned long arg3,
+		unsigned long arg4, unsigned long arg5)
+{
+	switch (option) {
+	case PR_BITNESS_LOCK_ON_EXEC:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EACCES;
+		return bitness_set_lock_on_exec(arg3, !!arg2);
+	case PR_BITNESS_LOCK:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EACCES;
+		return __bitness_lock(current_bitness());
+	default:
+		return -EINVAL;
+	}
+}
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index dd0fdfc..d25cd50 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -65,7 +65,7 @@ static int elf_core_dump(struct coredump_params *cprm);
 #define ELF_PAGEOFFSET(_v) ((_v) & (ELF_MIN_ALIGN-1))
 #define ELF_PAGEALIGN(_v) (((_v) + ELF_MIN_ALIGN - 1) & ~(ELF_MIN_ALIGN - 1))
 
-static struct linux_binfmt elf_format = {
+struct linux_binfmt elf_format = {
 	.module		= THIS_MODULE,
 	.load_binary	= load_elf_binary,
 	.load_shlib	= load_elf_library,
@@ -556,6 +556,12 @@ static unsigned long randomize_stack_top(unsigned long stack_top)
 #endif
 }
 
+#if 0
+#ifndef arch_post_exec_elf
+#define arch_post_exec_elf(rc, class)
+#endif
+#endif
+
 static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 {
 	struct file *interpreter = NULL; /* to shut gcc up */
@@ -979,6 +985,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 out:
 	kfree(loc);
 out_ret:
+	arch_post_exec_elf(retval, ELF_CLASS);
 	return retval;
 
 	/* error cleanup */
diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index 396a988..be0e4c5 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -98,7 +98,7 @@ static int load_script(struct linux_binprm *bprm,struct pt_regs *regs)
 	return search_binary_handler(bprm,regs);
 }
 
-static struct linux_binfmt script_format = {
+struct linux_binfmt script_format = {
 	.module		= THIS_MODULE,
 	.load_binary	= load_script,
 };
diff --git a/fs/exec.c b/fs/exec.c
index da80612..93f062b 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1360,6 +1360,8 @@ out:
 }
 EXPORT_SYMBOL(remove_arg_zero);
 
+extern bool arch_check_interpreter(struct linux_binfmt *fmt);
+
 /*
  * cycle the list of binary formats handler, until one recognizes the image
  */
@@ -1390,6 +1392,8 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
 			int (*fn)(struct linux_binprm *, struct pt_regs *) = fmt->load_binary;
 			if (!fn)
 				continue;
+			if (!arch_check_interpreter(fmt))
+				continue;
 			if (!try_module_get(fmt->module))
 				continue;
 			read_unlock(&binfmt_lock);
@@ -1441,11 +1445,19 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
 		break;
 #endif
 	}
+
+	pr_err("search_binary_handler(%u) = %d\n", (int)current->pid, (int)retval);
 	return retval;
 }
 
 EXPORT_SYMBOL(search_binary_handler);
 
+#if 0
+#ifndef arch_post_execve
+#define arch_post_execve()
+#endif
+#endif
+
 /*
  * sys_execve() executes a new program.
  */
@@ -1527,6 +1539,7 @@ static int do_execve_common(const char *filename,
 	current->fs->in_exec = 0;
 	current->in_execve = 0;
 	acct_update_integrals(current);
+	arch_post_execve();
 	free_bprm(bprm);
 	if (displaced)
 		put_files_struct(displaced);
@@ -1556,6 +1569,7 @@ out_files:
 	if (displaced)
 		reset_files_struct(displaced);
 out_ret:
+	arch_post_execve();
 	return retval;
 }
 
diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index a3baeb2..91edb9c 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -102,4 +102,7 @@
 
 #define PR_MCE_KILL_GET 34
 
+#define PR_BITNESS_LOCK_ON_EXEC	35
+#define PR_BITNESS_LOCK		36
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 20b03bf..ee5ba82 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1046,6 +1046,10 @@ partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
 }
 #endif	/* !CONFIG_SMP */
 
+#ifndef bitness_lock_on_exec
+struct bitness_lock_on_exec {};
+#define bitness_lock_on_exec bitness_lock_on_exec
+#endif /* bitness_lock_t */
 
 struct io_context;			/* See blkdev.h */
 
@@ -1405,6 +1409,7 @@ struct task_struct {
 	unsigned int sessionid;
 #endif
 	seccomp_t seccomp;
+	struct bitness_lock_on_exec bitness_lock_on_exec;
 
 /* Thread group tracking */
    	u32 parent_exec_id;
diff --git a/kernel/sys.c b/kernel/sys.c
index a101ba3..e7faa8b 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1644,6 +1644,10 @@ SYSCALL_DEFINE1(umask, int, mask)
 	return mask;
 }
 
+#ifndef arch_prctl
+#define arch_prctl(o, a2, a3, a4, a5) (-EINVAL)
+#endif
+
 SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 		unsigned long, arg4, unsigned long, arg5)
 {
@@ -1793,7 +1797,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 				error = PR_MCE_KILL_DEFAULT;
 			break;
 		default:
-			error = -EINVAL;
+			error = arch_prctl(option, arg2, arg3, arg4, arg5);
 			break;
 	}
 	return error;

-- 
Vasiliy

^ permalink raw reply related	[flat|nested] 32+ messages in thread

* Re: [kernel-hardening] 32/64 bitness restriction for pid namespace
  2011-08-15 15:38                                       ` Vasiliy Kulikov
@ 2011-08-15 21:33                                         ` Solar Designer
  2011-08-16  6:39                                           ` Vasiliy Kulikov
  2011-08-15 21:46                                         ` Solar Designer
  2011-08-18 10:34                                         ` Solar Designer
  2 siblings, 1 reply; 32+ messages in thread
From: Solar Designer @ 2011-08-15 21:33 UTC (permalink / raw)
  To: kernel-hardening

Vasiliy,

On Mon, Aug 15, 2011 at 07:38:36PM +0400, Vasiliy Kulikov wrote:
> Given the latest discussion on LKML we're unlikely to push the
> restriction upstream.  So, I'm posting the patch here.

I felt that you were too quick to give up, but you appear to be right.

Are you proposing this for OpenVZ and distro kernels now?

Thanks,

Alexander

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [kernel-hardening] 32/64 bitness restriction for pid namespace
  2011-08-15 15:38                                       ` Vasiliy Kulikov
  2011-08-15 21:33                                         ` Solar Designer
@ 2011-08-15 21:46                                         ` Solar Designer
  2011-08-16  6:25                                           ` Vasiliy Kulikov
  2011-08-18 10:34                                         ` Solar Designer
  2 siblings, 1 reply; 32+ messages in thread
From: Solar Designer @ 2011-08-15 21:46 UTC (permalink / raw)
  To: kernel-hardening

Vasiliy,

On Mon, Aug 15, 2011 at 07:38:36PM +0400, Vasiliy Kulikov wrote:
> Note the strage output of -e32 lock64, -e64 lock32, lock64 -e lock32.
> There is a major problem with lock on exec (ptrace output):
> 
>     execve("./lock32", ["./lock32", "-e64", "./lock32"], [/* 17 vars */]) = 0
>     ...
>     prctl(0x23 /* PR_??? */, 0x1, 0x40, 0, 0) = 0
>     execve("./lock32", ["./lock32"], [/* 28 vars */]) = -1 ENOEXEC (Exec format error)
>     execve("/bin/sh", ["/bin/sh", "./lock32"], [/* 28 vars */]) = 0
>     brk(0)                                  = 0x2447000
>     ...
> 
> So, library function tries to run /bin/sh if no kernel interpreter is
> found.  As the first execve(2) failed, the lock on exec is not forced
> anymore, but from the application point of view it is the only execve().
> For -e lock32 the expectation is not broken, but 32bit ELF is still tried to
> be passed to 64bit /bin/sh.

That's nasty.

> My point is still that we should keep the only flag - lock current
> process and implement simple re-exec of vzctl.

It's not so simple.  It means, for example, that Owl built for x86_64
should also contain a version of vzctl built for i686 - but it normally
lacks development tools and libraries for that (we don't currently do
multilib within a single build of Owl).

> But other ways like workaround of multiple execve() calls are welcome.

Given your discovery, maybe we should have execve() return an error code
like -EPERM, such that the library would not try the shell?

Thanks,

Alexander

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [kernel-hardening] 32/64 bitness restriction for pid namespace
  2011-08-15 21:46                                         ` Solar Designer
@ 2011-08-16  6:25                                           ` Vasiliy Kulikov
  0 siblings, 0 replies; 32+ messages in thread
From: Vasiliy Kulikov @ 2011-08-16  6:25 UTC (permalink / raw)
  To: kernel-hardening

On Tue, Aug 16, 2011 at 01:46 +0400, Solar Designer wrote:
> > My point is still that we should keep the only flag - lock current
> > process and implement simple re-exec of vzctl.
> 
> It's not so simple.  It means, for example, that Owl built for x86_64
> should also contain a version of vzctl built for i686 - but it normally
> lacks development tools and libraries for that (we don't currently do
> multilib within a single build of Owl).
> 
> > But other ways like workaround of multiple execve() calls are welcome.
> 
> Given your discovery, maybe we should have execve() return an error code
> like -EPERM, such that the library would not try the shell?

Other way - do syscall(__NR_execve, ...) instead of execve(...).  It is
a bit ugly, but given it will be used in only one place (and explicitly
by programs, whithout any wrapper) IMO it's acceptable.

-- 
Vasiliy

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [kernel-hardening] 32/64 bitness restriction for pid namespace
  2011-08-15 21:33                                         ` Solar Designer
@ 2011-08-16  6:39                                           ` Vasiliy Kulikov
  0 siblings, 0 replies; 32+ messages in thread
From: Vasiliy Kulikov @ 2011-08-16  6:39 UTC (permalink / raw)
  To: kernel-hardening

Solar,

On Tue, Aug 16, 2011 at 01:33 +0400, Solar Designer wrote:
> Are you proposing this for OpenVZ and distro kernels now?

For OpenVZ it needs s/CAP_SYS_ADMIN/CAP_VE_SYS_ADMIN/ to be able to use
the feature by in-CT root programs.  But given it doesn't go to
upstream, it's unlikely to be needed.

As for the implementation, it looks it's ready and it passes lock.c
tests.  But, as usual, additional testing doesn't hurt :)

Thanks,

-- 
Vasiliy

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [kernel-hardening] 32/64 bitness restriction for pid namespace
  2011-08-15 15:38                                       ` Vasiliy Kulikov
  2011-08-15 21:33                                         ` Solar Designer
  2011-08-15 21:46                                         ` Solar Designer
@ 2011-08-18 10:34                                         ` Solar Designer
  2011-08-18 14:42                                           ` Vasiliy Kulikov
  2 siblings, 1 reply; 32+ messages in thread
From: Solar Designer @ 2011-08-18 10:34 UTC (permalink / raw)
  To: kernel-hardening

Vasiliy,

On Mon, Aug 15, 2011 at 07:38:36PM +0400, Vasiliy Kulikov wrote:
> Given the latest discussion on LKML we're unlikely to push the
> restriction upstream.  So, I'm posting the patch here.

I feel that it may be desirable to post the patch to LKML as well - not
as RFC, nor PATCH anymore, but merely for the sake of completeness.  We
promised a better patch, so we should provide it, even if we know it's
NAK'ed.  Just word your message such that no one gets the impression
we're pushing this regardless of it having been NAK'ed.  Mention
explicitly that given the discussion so far you do not expect this to be
applied, but you are posting this for the sake of completeness - to have
this proposal archived in full, including the actual patch.

Does this make sense to you?

Alexander

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [kernel-hardening] 32/64 bitness restriction for pid namespace
  2011-08-18 10:34                                         ` Solar Designer
@ 2011-08-18 14:42                                           ` Vasiliy Kulikov
  0 siblings, 0 replies; 32+ messages in thread
From: Vasiliy Kulikov @ 2011-08-18 14:42 UTC (permalink / raw)
  To: kernel-hardening

Solar,

On Thu, Aug 18, 2011 at 14:34 +0400, Solar Designer wrote:
> On Mon, Aug 15, 2011 at 07:38:36PM +0400, Vasiliy Kulikov wrote:
> > Given the latest discussion on LKML we're unlikely to push the
> > restriction upstream.  So, I'm posting the patch here.
> 
> I feel that it may be desirable to post the patch to LKML as well - not
> as RFC, nor PATCH anymore, but merely for the sake of completeness.  We
> promised a better patch, so we should provide it, even if we know it's
> NAK'ed.  Just word your message such that no one gets the impression
> we're pushing this regardless of it having been NAK'ed.  Mention
> explicitly that given the discussion so far you do not expect this to be
> applied, but you are posting this for the sake of completeness - to have
> this proposal archived in full, including the actual patch.
> 
> Does this make sense to you?

Just sent it with minor corrections to be compilable on non-x86.

Thanks,

-- 
Vasiliy

^ permalink raw reply	[flat|nested] 32+ messages in thread

end of thread, other threads:[~2011-08-18 14:42 UTC | newest]

Thread overview: 32+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-08-07 11:00 [kernel-hardening] 32/64 bitness restriction for pid namespace Vasiliy Kulikov
2011-08-08 17:39 ` [kernel-hardening] " Vasiliy Kulikov
2011-08-10  9:52   ` Vasiliy Kulikov
2011-08-10 13:03     ` [kernel-hardening] " Solar Designer
2011-08-10 13:27       ` Vasiliy Kulikov
2011-08-10 14:26         ` Solar Designer
2011-08-10 15:02           ` Vasiliy Kulikov
2011-08-10 15:40             ` Solar Designer
2011-08-10 16:21               ` Vasiliy Kulikov
2011-08-10 16:42                 ` Solar Designer
2011-08-12 12:07                   ` Vasiliy Kulikov
2011-08-12 12:23                     ` Solar Designer
2011-08-13 15:12                       ` Vasiliy Kulikov
2011-08-13 15:19                         ` Solar Designer
2011-08-13 16:55                           ` Vasiliy Kulikov
2011-08-13 17:31                             ` Vasiliy Kulikov
2011-08-13 19:25                               ` Solar Designer
2011-08-13 19:22                             ` Solar Designer
2011-08-14  9:50                             ` Solar Designer
2011-08-14 10:16                               ` Vasiliy Kulikov
2011-08-14 11:29                                 ` Solar Designer
2011-08-14 11:55                                   ` Vasiliy Kulikov
2011-08-14 12:04                                     ` Solar Designer
2011-08-14 12:16                                       ` Vasiliy Kulikov
2011-08-15 15:38                                       ` Vasiliy Kulikov
2011-08-15 21:33                                         ` Solar Designer
2011-08-16  6:39                                           ` Vasiliy Kulikov
2011-08-15 21:46                                         ` Solar Designer
2011-08-16  6:25                                           ` Vasiliy Kulikov
2011-08-18 10:34                                         ` Solar Designer
2011-08-18 14:42                                           ` Vasiliy Kulikov
2011-08-12  9:09                 ` Vasiliy Kulikov

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.