All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 2/7] tracing: split out syscall_trace_enter construction
@ 2011-04-28  3:08 Will Drewry
  2011-04-28  3:08 ` [PATCH 3/7] seccomp_filter: Enable ftrace-based system call filtering Will Drewry
                   ` (4 more replies)
  0 siblings, 5 replies; 406+ messages in thread
From: Will Drewry @ 2011-04-28  3:08 UTC (permalink / raw)
  To: linux-kernel
  Cc: kees.cook, eparis, agl, mingo, jmorris, rostedt, Will Drewry,
	Frederic Weisbecker, Ingo Molnar

perf appears to be the primary consumer of the CONFIG_FTRACE_SYSCALLS
infrastructure.  As such, many of the helpers that target perf can be
split into a perf-focused helper and a generic CONFIG_FTRACE_SYSCALLS
consumer interface.

This change splits the construction of the syscall_trace_enter record
for current out of perf_syscall_enter into two helpers:
- ftrace_syscall_enter_state
- ftrace_syscall_enter_state_size

And adds another helper for completeness:
- ftrace_syscall_exit_state_size

These helpers allow for shared code between perf ftrace events and
any other consumers of CONFIG_FTRACE_SYSCALLS events.  The proposed
seccomp_filter patches use this code.

Signed-off-by: Will Drewry <wad@chromium.org>
---
 include/trace/syscall.h       |    4 ++
 kernel/trace/trace_syscalls.c |   96 +++++++++++++++++++++++++++++++++++------
 2 files changed, 86 insertions(+), 14 deletions(-)

diff --git a/include/trace/syscall.h b/include/trace/syscall.h
index 31966a4..242ae04 100644
--- a/include/trace/syscall.h
+++ b/include/trace/syscall.h
@@ -41,6 +41,10 @@ extern int reg_event_syscall_exit(struct ftrace_event_call *call);
 extern void unreg_event_syscall_exit(struct ftrace_event_call *call);
 extern int
 ftrace_format_syscall(struct ftrace_event_call *call, struct trace_seq *s);
+extern int ftrace_syscall_enter_state(u8 *buf, size_t available,
+				      struct trace_entry **entry);
+extern size_t ftrace_syscall_enter_state_size(int nb_args);
+extern size_t ftrace_syscall_exit_state_size(void);
 enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags,
 				      struct trace_event *event);
 enum print_line_t print_syscall_exit(struct trace_iterator *iter, int flags,
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index ee7b5a0..f37f120 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -95,7 +95,7 @@ find_syscall_meta(unsigned long syscall)
 	return NULL;
 }
 
-static struct syscall_metadata *syscall_nr_to_meta(int nr)
+struct syscall_metadata *syscall_nr_to_meta(int nr)
 {
 	if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
 		return NULL;
@@ -498,7 +498,7 @@ static int sys_perf_refcount_exit;
 static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 {
 	struct syscall_metadata *sys_data;
-	struct syscall_trace_enter *rec;
+	void *buf;
 	struct hlist_head *head;
 	int syscall_nr;
 	int rctx;
@@ -513,25 +513,22 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 		return;
 
 	/* get the size after alignment with the u32 buffer size field */
-	size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
-	size = ALIGN(size + sizeof(u32), sizeof(u64));
-	size -= sizeof(u32);
+	size = ftrace_syscall_enter_state_size(sys_data->nb_args);
 
 	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
 		      "perf buffer not large enough"))
 		return;
 
-	rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
-				sys_data->enter_event->event.type, regs, &rctx);
-	if (!rec)
+	buf = perf_trace_buf_prepare(size, sys_data->enter_event->event.type,
+				     regs, &rctx);
+	if (!buf)
 		return;
 
-	rec->nr = syscall_nr;
-	syscall_get_arguments(current, regs, 0, sys_data->nb_args,
-			       (unsigned long *)&rec->args);
+	/* The only error conditions in this helper are handled above. */
+	ftrace_syscall_enter_state(buf, size, NULL);
 
 	head = this_cpu_ptr(sys_data->enter_event->perf_events);
-	perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
+	perf_trace_buf_submit(buf, size, rctx, 0, 1, regs, head);
 }
 
 int perf_sysenter_enable(struct ftrace_event_call *call)
@@ -587,8 +584,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
 		return;
 
 	/* We can probably do that at build time */
-	size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
-	size -= sizeof(u32);
+	size = ftrace_syscall_exit_state_size();
 
 	/*
 	 * Impossible, but be paranoid with the future
@@ -688,3 +684,75 @@ static int syscall_exit_register(struct ftrace_event_call *event,
 	}
 	return 0;
 }
+
+/* ftrace_syscall_enter_state_size - returns the state size required.
+ *
+ * @nb_args: number of system call args expected.
+ *           a negative value implies the maximum allowed.
+ */
+size_t ftrace_syscall_enter_state_size(int nb_args)
+{
+	/* syscall_get_arguments only supports up to 6 arguments. */
+	int arg_count = (nb_args >= 0 ? nb_args : 6);
+	size_t size = (sizeof(unsigned long) * arg_count) +
+		      sizeof(struct syscall_trace_enter);
+	size = ALIGN(size + sizeof(u32), sizeof(u64));
+	size -= sizeof(u32);
+	return size;
+}
+EXPORT_SYMBOL_GPL(ftrace_syscall_enter_state_size);
+
+size_t ftrace_syscall_exit_state_size(void)
+{
+	return ALIGN(sizeof(struct syscall_trace_exit) + sizeof(u32),
+		     sizeof(u64)) - sizeof(u32);
+}
+EXPORT_SYMBOL_GPL(ftrace_syscall_exit_state_size);
+
+/* ftrace_syscall_enter_state - build state for filter matching
+ *
+ * @buf: buffer to populate with current task state for matching
+ * @available: size of @buf in bytes; BUG()s if it is too small.
+ * @entry: optional out-pointer to the state's trace_entry; may be NULL.
+ *
+ * Returns 0 on success, or -EINVAL if current's syscall number is
+ * negative or has no syscall metadata.
+ */
+int ftrace_syscall_enter_state(u8 *buf, size_t available,
+			       struct trace_entry **entry)
+{
+	struct syscall_trace_enter *sys_enter;
+	struct syscall_metadata *sys_data;
+	size_t size;
+	int syscall_nr;
+	struct pt_regs *regs = task_pt_regs(current);
+
+	syscall_nr = syscall_get_nr(current, regs);
+	if (syscall_nr < 0)
+		return -EINVAL;
+
+	sys_data = syscall_nr_to_meta(syscall_nr);
+	if (!sys_data)
+		return -EINVAL;
+
+	/* Determine the actual size needed via the shared helper so this
+	 * check cannot drift from how callers size @buf; size_t also keeps
+	 * the comparison against @available unsigned.
+	 */
+	size = ftrace_syscall_enter_state_size(sys_data->nb_args);
+
+	BUG_ON(size > available);
+	sys_enter = (struct syscall_trace_enter *)buf;
+
+	/* Populating the struct trace_entry is left to the caller, but
+	 * a pointer is returned to encourage opacity.
+	 */
+	if (entry)
+		*entry = &sys_enter->ent;
+
+	sys_enter->nr = syscall_nr;
+	syscall_get_arguments(current, regs, 0, sys_data->nb_args,
+			      sys_enter->args);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ftrace_syscall_enter_state);
-- 
1.7.0.4


^ permalink raw reply related	[flat|nested] 406+ messages in thread

* [PATCH 3/7] seccomp_filter: Enable ftrace-based system call filtering
  2011-04-28  3:08 [PATCH 2/7] tracing: split out syscall_trace_enter construction Will Drewry
@ 2011-04-28  3:08 ` Will Drewry
  2011-04-28 13:50   ` Steven Rostedt
                     ` (5 more replies)
  2011-04-28  3:08 ` [PATCH 4/7] seccomp_filter: add process state reporting Will Drewry
                   ` (3 subsequent siblings)
  4 siblings, 6 replies; 406+ messages in thread
From: Will Drewry @ 2011-04-28  3:08 UTC (permalink / raw)
  To: linux-kernel
  Cc: kees.cook, eparis, agl, mingo, jmorris, rostedt, Will Drewry,
	Frederic Weisbecker, Ingo Molnar, Andrew Morton, Tejun Heo,
	Michal Marek, Oleg Nesterov, Roland McGrath, Peter Zijlstra,
	Jiri Slaby, David Howells, Serge E. Hallyn

This change adds a new seccomp mode based on the work by
agl@chromium.org. This mode comes with a bitmask of NR_syscalls size and
an optional linked list of seccomp_filter objects. When in mode 2, all
system calls are first checked against the bitmask to determine if they
are allowed or denied.  If allowed, the list of filters is checked for
the given syscall number. If all filter predicates for the system call
match or the system call was allowed without restriction, the process
continues. Otherwise, it is killed and a KERN_INFO notification is
posted.

The filter language itself is provided by the ftrace filter engine.
Related patches tweak the perf filter trace and free code to allow the
calls to be shared. Filters inherit their understanding of types and arguments
for each system call from the CONFIG_FTRACE_SYSCALLS subsystem which
predefines this information in syscall_metadata associated enter_event
(and exit_event) structures.

The result is that a process may reduce its available interfaces to
the kernel through prctl() without knowing the appropriate system call
number a priori and with the flexibility of filtering based on
register-stored arguments.  (String checks suffer from TOCTOU issues and
should be left to LSMs to provide policy for! Don't get greedy :)

A sample filterset for a process that only needs to interact over stdin
and stdout and exit cleanly is shown below:
  sys_read: fd == 0
  sys_write: fd == 1
  sys_exit_group: 1

The filters may be specified once prior to entering the reduced access
state:
  prctl(PR_SET_SECCOMP, 2, filters);
If prctl() is in the allowed bitmask, the process may further reduce
privileges by dropping system calls from the allowed set.

The only other twist is that it is possible to delay enforcement by one
system call by supplying an "on_next_syscall: 1" 'filter'.  This allows
for a launcher process to fork(), prctl(), then execve() leaving the
launched binary in a filtered state.

Implementation-wise, seccomp.c now uses seccomp_state struct which is
managed using RCU primitives.  It contains the system call bitmask and
a linked list of seccomp_filters (which are also managed as an RCU
list).  All mutations (barring one optional bit flip) to the
seccomp_state are always done on a duplicate of the current state value
which is swapped in prior to use.

Signed-off-by: Will Drewry <wad@chromium.org>
---
 include/linux/seccomp.h |   98 +++++++++++-
 include/trace/syscall.h |    2 +
 kernel/Makefile         |    1 +
 kernel/fork.c           |    8 +
 kernel/seccomp.c        |  144 +++++++++++++++--
 kernel/seccomp_filter.c |  428 +++++++++++++++++++++++++++++++++++++++++++++++
 kernel/sys.c            |   11 +-
 kernel/trace/Kconfig    |   10 +
 8 files changed, 687 insertions(+), 15 deletions(-)
 create mode 100644 kernel/seccomp_filter.c

diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index 167c333..16703c4 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -4,10 +4,33 @@
 
 #ifdef CONFIG_SECCOMP
 
+#include <linux/list.h>
 #include <linux/thread_info.h>
+#include <linux/types.h>
 #include <asm/seccomp.h>
 
-typedef struct { int mode; } seccomp_t;
+/**
+ * struct seccomp_state - the state of a seccomp'ed process
+ *
+ * @mode:
+ *     if this is 1, the process is under standard seccomp rules
+ *             is 2, the process is only allowed to make system calls where
+ *                   the corresponding bit is set in bitmask and any
+ *                   associated filters evaluate successfully.
+ * @usage: number of references to the current instance.
+ * @bitmask: a mask of allowed or filtered system calls and additional flags.
+ * @filter_count: number of seccomp filters in @filters.
+ * @filters: list of seccomp_filter entries for  system calls.
+ */
+struct seccomp_state {
+	uint16_t mode;
+	atomic_t usage;
+	DECLARE_BITMAP(bitmask, NR_syscalls + 1);
+	int filter_count;
+	struct list_head filters;
+};
+
+typedef struct { struct seccomp_state *state; } seccomp_t;
 
 extern void __secure_computing(int);
 static inline void secure_computing(int this_syscall)
@@ -16,8 +39,14 @@ static inline void secure_computing(int this_syscall)
 		__secure_computing(this_syscall);
 }
 
+extern struct seccomp_state *seccomp_state_new(void);
+extern struct seccomp_state *seccomp_state_dup(const struct seccomp_state *);
+extern struct seccomp_state *get_seccomp_state(struct seccomp_state *);
+extern void put_seccomp_state(struct seccomp_state *);
 extern long prctl_get_seccomp(void);
-extern long prctl_set_seccomp(unsigned long);
+extern long prctl_set_seccomp(unsigned long, char *);
+
+#define SECCOMP_MAX_FILTER_LENGTH 16384
 
 #else /* CONFIG_SECCOMP */
 
@@ -27,16 +56,79 @@ typedef struct { } seccomp_t;
 
 #define secure_computing(x) do { } while (0)
 
+#define SECCOMP_MAX_FILTER_LENGTH 0
+
 static inline long prctl_get_seccomp(void)
 {
 	return -EINVAL;
 }
 
-static inline long prctl_set_seccomp(unsigned long arg2)
+static inline long prctl_set_seccomp(unsigned long arg2, char *filters)
 {
 	return -EINVAL;
 }
 
+static inline struct seccomp_state *seccomp_state_new(void)
+{
+	return NULL;
+}
+
+static inline struct seccomp_state *seccomp_state_dup(
+					const struct seccomp_state *state)
+{
+	return NULL;
+}
+
+static inline struct seccomp_state *get_seccomp_state(
+					struct seccomp_state *state)
+{
+	return NULL;
+}
+
+static inline void put_seccomp_state(struct seccomp_state *state)
+{
+}
+
 #endif /* CONFIG_SECCOMP */
 
+#if defined(CONFIG_SECCOMP) && defined(CONFIG_SECCOMP_FILTER)
+
+extern int seccomp_copy_all_filters(struct list_head *,
+				    const struct list_head *);
+extern void seccomp_drop_all_filters(struct seccomp_state *);
+
+extern int seccomp_parse_filters(struct seccomp_state *, char *);
+extern int seccomp_test_filters(struct seccomp_state *, int);
+
+extern int seccomp_show_filters(struct seccomp_state *, struct seq_file *);
+
+#else  /* CONFIG_SECCOMP && CONFIG_SECCOMP_FILTER */
+
+static inline int seccomp_test_filters(struct seccomp_state *state, int nr)
+{
+	return -EINVAL;
+}
+
+static inline int seccomp_parse_filters(struct seccomp_state *state,
+					char *filters)
+{
+	return -EINVAL;
+}
+
+static inline int seccomp_show_filters(struct seccomp_state *state,
+				       struct seq_file *m)
+{
+	return -EINVAL;
+}
+
+static inline int seccomp_copy_all_filters(struct list_head *dst,
+					   const struct list_head *src)
+{
+	return 0;	/* no filters to copy without CONFIG_SECCOMP_FILTER */
+}
+
+static inline void seccomp_drop_all_filters(struct seccomp_state *state) { }
+
+#endif  /* CONFIG_SECCOMP && CONFIG_SECCOMP_FILTER */
+
 #endif /* _LINUX_SECCOMP_H */
diff --git a/include/trace/syscall.h b/include/trace/syscall.h
index 242ae04..1f72fce 100644
--- a/include/trace/syscall.h
+++ b/include/trace/syscall.h
@@ -35,6 +35,8 @@ struct syscall_metadata {
 extern unsigned long arch_syscall_addr(int nr);
 extern int init_syscall_trace(struct ftrace_event_call *call);
 
+extern struct syscall_metadata *syscall_nr_to_meta(int);
+
 extern int reg_event_syscall_enter(struct ftrace_event_call *call);
 extern void unreg_event_syscall_enter(struct ftrace_event_call *call);
 extern int reg_event_syscall_exit(struct ftrace_event_call *call);
diff --git a/kernel/Makefile b/kernel/Makefile
index 85cbfb3..a4b21fb 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -81,6 +81,7 @@ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
 obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
+obj-$(CONFIG_SECCOMP_FILTER) += seccomp_filter.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
 obj-$(CONFIG_TREE_RCU) += rcutree.o
 obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
diff --git a/kernel/fork.c b/kernel/fork.c
index e7548de..bdcf70b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -34,6 +34,7 @@
 #include <linux/cgroup.h>
 #include <linux/security.h>
 #include <linux/hugetlb.h>
+#include <linux/seccomp.h>
 #include <linux/swap.h>
 #include <linux/syscalls.h>
 #include <linux/jiffies.h>
@@ -169,6 +170,9 @@ void free_task(struct task_struct *tsk)
 	free_thread_info(tsk->stack);
 	rt_mutex_debug_task_free(tsk);
 	ftrace_graph_exit_task(tsk);
+#ifdef CONFIG_SECCOMP
+	put_seccomp_state(tsk->seccomp.state);
+#endif
 	free_task_struct(tsk);
 }
 EXPORT_SYMBOL(free_task);
@@ -280,6 +284,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 	if (err)
 		goto out;
 
+#ifdef CONFIG_SECCOMP
+	tsk->seccomp.state = get_seccomp_state(orig->seccomp.state);
+#endif
+
 	setup_thread_stack(tsk, orig);
 	clear_user_return_notifier(tsk);
 	clear_tsk_need_resched(tsk);
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 57d4b13..1bee87c 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -8,10 +8,11 @@
 
 #include <linux/seccomp.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/compat.h>
 
 /* #define SECCOMP_DEBUG 1 */
-#define NR_SECCOMP_MODES 1
+#define NR_SECCOMP_MODES 2
 
 /*
  * Secure computing mode 1 allows only read/write/exit/sigreturn.
@@ -32,9 +33,11 @@ static int mode1_syscalls_32[] = {
 
 void __secure_computing(int this_syscall)
 {
-	int mode = current->seccomp.mode;
+	int mode = -1;
 	int * syscall;
-
+	/* Do we need an RCU read lock to access current's state? */
+	if (current->seccomp.state)
+		mode = current->seccomp.state->mode;
 	switch (mode) {
 	case 1:
 		syscall = mode1_syscalls;
@@ -47,6 +50,16 @@ void __secure_computing(int this_syscall)
 				return;
 		} while (*++syscall);
 		break;
+	case 2:
+#ifdef CONFIG_COMPAT
+		if (is_compat_task())
+			/* XXX: No compat support yet. */
+			break;
+#endif
+		if (!seccomp_test_filters(current->seccomp.state,
+					  this_syscall))
+			return;
+		break;
 	default:
 		BUG();
 	}
@@ -57,30 +70,139 @@ void __secure_computing(int this_syscall)
 	do_exit(SIGKILL);
 }
 
+/* seccomp_state_new - allocate a new state object. */
+struct seccomp_state *seccomp_state_new()
+{
+	struct seccomp_state *new = kzalloc(sizeof(struct seccomp_state),
+					    GFP_KERNEL);
+	if (!new)
+		return NULL;
+	atomic_set(&new->usage, 1);
+	INIT_LIST_HEAD(&new->filters);
+	return new;
+}
+
+/* seccomp_state_dup - copies an existing state object. */
+struct seccomp_state *seccomp_state_dup(const struct seccomp_state *orig)
+{
+	int err;
+	struct seccomp_state *new = seccomp_state_new();
+
+	err = -ENOMEM;
+	if (!new)
+		goto fail;
+	new->mode = orig->mode;
+	memcpy(new->bitmask, orig->bitmask, sizeof(new->bitmask));
+	err = seccomp_copy_all_filters(&new->filters,
+				       &orig->filters);
+	if (err)
+		goto fail;
+	new->filter_count = orig->filter_count;
+
+	return new;
+fail:
+	put_seccomp_state(new);
+	return NULL;
+}
+
+/* get_seccomp_state - increments the reference count of @orig */
+struct seccomp_state *get_seccomp_state(struct seccomp_state *orig)
+{
+	if (!orig)
+		return NULL;
+	atomic_inc(&orig->usage);
+	return orig;
+}
+
+static void __put_seccomp_state(struct seccomp_state *orig)
+{
+	WARN_ON(atomic_read(&orig->usage));
+	seccomp_drop_all_filters(orig);
+	kfree(orig);
+}
+
+/* put_seccomp_state - decrements the reference count of @orig and may free. */
+void put_seccomp_state(struct seccomp_state *orig)
+{
+	if (!orig)
+		return;
+
+	if (atomic_dec_and_test(&orig->usage))
+		__put_seccomp_state(orig);
+}
+
 long prctl_get_seccomp(void)
 {
-	return current->seccomp.mode;
+	if (!current->seccomp.state)
+		return 0;
+	return current->seccomp.state->mode;
 }
 
-long prctl_set_seccomp(unsigned long seccomp_mode)
+long prctl_set_seccomp(unsigned long seccomp_mode, char *filters)
 {
 	long ret;
+	struct seccomp_state *state, *orig_state;
 
-	/* can set it only once to be even more secure */
+	rcu_read_lock();
+	orig_state = get_seccomp_state(rcu_dereference(current->seccomp.state));
+	rcu_read_unlock();
+
+	/* mode 1 can only be set once, but mode 2 may have access reduced. */
 	ret = -EPERM;
-	if (unlikely(current->seccomp.mode))
+	if (orig_state && orig_state->mode != 2 && seccomp_mode != 2)
 		goto out;
 
 	ret = -EINVAL;
-	if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) {
-		current->seccomp.mode = seccomp_mode;
-		set_thread_flag(TIF_SECCOMP);
+	if (!seccomp_mode || seccomp_mode > NR_SECCOMP_MODES)
+		goto out;
+
+	/* Prepare to modify the seccomp state by dropping the shared
+	 * reference after duplicating it.
+	 */
+	state = (orig_state ? seccomp_state_dup(orig_state) :
+			      seccomp_state_new());
+	if (!state) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = 0;
+	switch (seccomp_mode) {
+	case 1:
 #ifdef TIF_NOTSC
 		disable_TSC();
 #endif
+		state->mode = seccomp_mode;
+		set_thread_flag(TIF_SECCOMP);
 		ret = 0;
+		break;
+	case 2:
+		if (filters) {
+			ret = seccomp_parse_filters(state, filters);
+			/* No partial applications. */
+			if (ret)
+				goto free_state;
+		}
+
+		if (!state->mode) {
+			state->mode = seccomp_mode;
+			set_thread_flag(TIF_SECCOMP);
+		}
+		break;
+	default:
+		ret = -EINVAL;
 	}
 
- out:
+	rcu_assign_pointer(current->seccomp.state, state);
+	synchronize_rcu();
+	put_seccomp_state(orig_state);  /* for the get */
+
+out:
+	put_seccomp_state(orig_state);  /* for the task */
+	return ret;
+
+free_state:
+	put_seccomp_state(orig_state);  /* for the get */
+	put_seccomp_state(state);  /* drop the dup */
 	return ret;
 }
diff --git a/kernel/seccomp_filter.c b/kernel/seccomp_filter.c
new file mode 100644
index 0000000..8f7878b
--- /dev/null
+++ b/kernel/seccomp_filter.c
@@ -0,0 +1,428 @@
+/* filter engine-based seccomp system call filtering.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) 2011 The Chromium OS Authors <chromium-os-dev@chromium.org>
+ */
+
+#include <linux/seccomp.h>
+#include <linux/seq_file.h>
+#include <linux/sched.h>
+#include <linux/compat.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+#include <asm/syscall.h>
+#include <trace/syscall.h>
+
+#define SECCOMP_MAX_FILTER_COUNT 16
+#define SECCOMP_DIRECTIVE_ON_NEXT "on_next_syscall"
+#define SECCOMP_ON_NEXT_BIT NR_syscalls
+
+struct seccomp_filter {
+	struct list_head list;
+	struct rcu_head rcu;
+	int syscall_nr;
+	struct syscall_metadata *data;
+	struct event_filter *event_filter;
+};
+
+static struct seccomp_filter *alloc_seccomp_filter(int syscall_nr,
+						   const char *filter_string)
+{
+	int err;
+	struct seccomp_filter *filter = kzalloc(sizeof(struct seccomp_filter),
+						GFP_KERNEL);
+	if (!filter)
+		goto fail;
+
+	filter->syscall_nr = syscall_nr;
+	filter->data = syscall_nr_to_meta(syscall_nr);
+	if (!filter->data)
+		goto fail;
+
+	err = ftrace_parse_filter(&filter->event_filter,
+				  filter->data->enter_event->event.type,
+				  filter_string);
+	if (err)
+		goto fail;
+
+	return filter;
+
+fail:
+	kfree(filter);
+	return NULL;
+}
+
+static void free_seccomp_filter(struct seccomp_filter *filter)
+{
+	ftrace_free_filter(filter->event_filter);
+	kfree(filter);
+}
+
+static void free_seccomp_filter_rcu(struct rcu_head *head)
+{
+	free_seccomp_filter(container_of(head,
+					 struct seccomp_filter,
+					 rcu));
+}
+
+static struct seccomp_filter *copy_seccomp_filter(struct seccomp_filter *orig)
+{
+	return alloc_seccomp_filter(orig->syscall_nr,
+				ftrace_get_filter_string(orig->event_filter));
+}
+
+/* Safely drops all filters for a given syscall. */
+static void drop_matching_filters(struct seccomp_state *state, int syscall_nr)
+{
+	struct list_head *this, *temp;
+	list_for_each_safe(this, temp, &state->filters) {
+		struct seccomp_filter *f = list_entry(this,
+						      struct seccomp_filter,
+						      list);
+		if (f->syscall_nr == syscall_nr) {
+			WARN_ON(state->filter_count == 0);
+			state->filter_count--;
+			list_del_rcu(this);
+			call_rcu(&f->rcu, free_seccomp_filter_rcu);
+		}
+	}
+}
+
+static int add_filter(struct seccomp_state *state, int syscall_nr,
+		      char *filter_string)
+{
+	struct syscall_metadata *data;
+	struct seccomp_filter *filter;
+
+	if (state->filter_count == SECCOMP_MAX_FILTER_COUNT - 1)
+		return -EINVAL;
+
+	/* This check will catch flag bits, like on_next_syscall. */
+	data = syscall_nr_to_meta(syscall_nr);
+	if (!data)
+		return -EINVAL;
+
+	filter = alloc_seccomp_filter(syscall_nr,
+				    filter_string);
+	if (!filter)
+		return -EINVAL;
+
+	state->filter_count++;
+	list_add_tail_rcu(&filter->list, &state->filters);
+	return 0;
+}
+
+static int syscall_name_to_nr(const char *name)
+{
+	int i;
+	for (i = 0; i < NR_syscalls; i++) {
+		struct syscall_metadata *meta = syscall_nr_to_meta(i);
+		if (!meta)
+			continue;
+		if (!strcmp(meta->name, name))
+			return i;
+	}
+	return -1;
+}
+
+static int directive_to_bit(const char *syscall)
+{
+	int syscall_nr = -1;
+	if (!strcmp(syscall, SECCOMP_DIRECTIVE_ON_NEXT))
+		return SECCOMP_ON_NEXT_BIT;
+	syscall_nr = syscall_name_to_nr(syscall);
+	return syscall_nr;
+}
+
+/* 1 on match, 0 otherwise. */
+static int filter_match_current(struct seccomp_filter *filter)
+{
+	int err;
+	uint8_t syscall_state[64];
+
+	memset(syscall_state, 0, sizeof(syscall_state));
+
+	/* The generic tracing entry can remain zeroed. */
+	err = ftrace_syscall_enter_state(syscall_state, sizeof(syscall_state),
+					 NULL);
+	if (err)
+		return 0;
+
+	return filter_match_preds(filter->event_filter, syscall_state);
+}
+
+/**
+ * parse_seccomp_filter() - apply one "directive: filter" rule to @state
+ * @state: the seccomp_state to update.
+ * @filter_spec: a "directive: filter_string" rule; modified in place.
+ */
+static int parse_seccomp_filter(struct seccomp_state *state,
+				char *filter_spec)
+{
+	char *filter_string;
+	char *syscall = filter_spec;
+	int syscall_nr = -1;
+	int ret = -EINVAL;
+
+	if (!state)
+		goto done;
+
+	if (!filter_spec)
+		goto done;
+
+	/* Expected format is
+	 * directive: filter_string
+	 * where directive may be a system call name (sys_*) or
+	 * on_next_syscall.
+	 */
+	filter_string = strchr(filter_spec, ':');
+	if (!filter_string)
+		goto done;
+
+	*filter_string++ = '\0';
+
+	/* Map the directive to a syscall number or the on_next flag bit. */
+	syscall_nr = directive_to_bit(syscall);
+	if (syscall_nr < 0) {
+		goto done;
+	}
+
+	/* Short-circuit filter parsing and check for "0" to clear. */
+	ret = 0;
+	if (!strcmp(strstrip(filter_string), "0")) {
+		clear_bit(syscall_nr, state->bitmask);
+		drop_matching_filters(state, syscall_nr);
+		goto done;
+	}
+	/* Do not allow adding allowed syscalls or filters once a
+	 * secure computing mode is enabled (only the "0" removal above).
+	 */
+	ret = -EPERM;
+	if (state->mode)
+		goto done;
+
+	/* Short-circuit parsing for the allow case. */
+	if (!strcmp(strstrip(filter_string), "1"))
+		goto allow;
+
+	/* Attempt to create a new filter, but don't do it if it doesn't parse.
+	 */
+	/* TODO either use append_filter_string or replace old matches. */
+	if (add_filter(state, syscall_nr, filter_string)) {
+		ret = -EINVAL;
+		goto done;
+	}
+
+allow:
+	ret = 0;
+	set_bit(syscall_nr, state->bitmask);
+
+done:
+	return ret;
+}
+
+#ifndef KSTK_EIP
+#define KSTK_EIP(x) 0L
+#endif
+
+static void log_failure(int syscall)
+{
+	const char *syscall_name = "unknown";
+	struct syscall_metadata *data = syscall_nr_to_meta(syscall);
+	if (data)
+		syscall_name = data->name;
+	printk(KERN_INFO
+		"%s[%d]: system call %d (%s) blocked at ip:%lx\n",
+		current->comm, task_pid_nr(current), syscall, syscall_name,
+		KSTK_EIP(current));
+}
+
+/* seccomp_drop_all_filters - cleans up the filter list
+ *
+ * @state: the seccomp_state to destroy the filters in.
+ */
+void seccomp_drop_all_filters(struct seccomp_state *state)
+{
+	struct list_head *this, *temp;
+	state->filter_count = 0;
+	if (list_empty(&state->filters))
+		return;
+	list_for_each_safe(this, temp, &state->filters) {
+		struct seccomp_filter *f = list_entry(this,
+						      struct seccomp_filter,
+						      list);
+		list_del_rcu(this);
+		/* Schedules freeing on the RCU. */
+		call_rcu(&f->rcu, free_seccomp_filter_rcu);
+	}
+}
+EXPORT_SYMBOL_GPL(seccomp_drop_all_filters);
+
+/* seccomp_copy_all_filters - copies all filters from src to dst.
+ *
+ * @dst: the list_head for seccomp_filters to populate.
+ * @src: the list_head for seccomp_filters to copy from.
+ * Returns 0 on success (including an empty @src) or -ENOMEM on failure.
+ */
+int seccomp_copy_all_filters(struct list_head *dst,
+			     const struct list_head *src)
+{
+	struct seccomp_filter *filter;
+	int ret = 0;
+	BUG_ON(!dst || !src);
+	/* Nothing to copy; return before the lock so 'done' stays balanced. */
+	if (list_empty(src))
+		return 0;
+	rcu_read_lock();	/* NOTE(review): copy allocs use GFP_KERNEL */
+	list_for_each_entry(filter, src, list) {
+		struct seccomp_filter *new_filter = copy_seccomp_filter(filter);
+		if (!new_filter) {
+			ret = -ENOMEM;
+			goto done;
+		}
+		list_add_tail_rcu(&new_filter->list, dst);
+	}
+done:
+	rcu_read_unlock();
+	return ret;
+}
+EXPORT_SYMBOL_GPL(seccomp_copy_all_filters);
+
+/* seccomp_show_filters - prints the filter state to a seq_file
+ * @state: the seccomp_state to enumerate the filter and bitmask of.
+ * @m: the prepared seq_file to receive the data.
+ * Returns 0 on a successful write.
+ */
+int seccomp_show_filters(struct seccomp_state *state, struct seq_file *m)
+{
+	int i = 0;
+	if (!state) {
+		return 0;
+	}
+
+	if (test_bit(SECCOMP_ON_NEXT_BIT, state->bitmask))
+		seq_printf(m, SECCOMP_DIRECTIVE_ON_NEXT ": 1\n");
+	/* Lazy but effective. */
+	do {
+		struct syscall_metadata *meta;
+		struct seccomp_filter *filter;
+		int filtered = 0;
+		if (!test_bit(i, state->bitmask))
+			continue;
+		rcu_read_lock();
+		list_for_each_entry_rcu(filter, &state->filters, list) {
+			if (filter->syscall_nr == i) {
+				filtered = 1;
+				seq_printf(m, "%s: %s\n",
+				   filter->data->name,
+				   ftrace_get_filter_string(
+					filter->event_filter));
+			}
+		}
+		rcu_read_unlock();
+
+		if (!filtered) {
+			meta = syscall_nr_to_meta(i);
+			BUG_ON(!meta);
+			seq_printf(m, "%s: 1\n", meta->name);
+		}
+	} while (++i < NR_syscalls);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(seccomp_show_filters);
+
+/* seccomp_test_filters - tests 'current' against the given syscall
+ *
+ * @state: seccomp_state of current to use.
+ * @syscall: number of the system call to test
+ * Returns 0 if allowed, -EINVAL if @syscall is out of range, else -EACCES.
+ */
+int seccomp_test_filters(struct seccomp_state *state, int syscall)
+{
+	struct seccomp_filter *filter = NULL;
+	int ret = 0;
+	if (syscall < 0 || syscall >= NR_syscalls) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* The non-atomic version is used.  Since the goal of this bit is to
+	 * provide a means to set a filter set prior to exec*(), there
+	 * should not be a race nor should it matter if one occurs.
+	 */
+	if (__test_and_clear_bit(SECCOMP_ON_NEXT_BIT, state->bitmask)) {
+		ret = 0;
+		goto out;
+	}
+
+	if (!test_bit(syscall, state->bitmask)) {
+		ret = -EACCES;
+		goto out;
+	}
+
+	/* Check if the syscall is filtered, if not, allow it. */
+	rcu_read_lock();
+	/* XXX: using two bitmasks would avoid searching the filter list
+	 *      unnecessarily.
+	 */
+	list_for_each_entry_rcu(filter, &state->filters, list) {
+		/* Multiple filters are allowed per system call.
+		 * They are logically ANDed in order of addition.
+		 */
+		if (filter->syscall_nr == syscall) {
+			if (!filter_match_current(filter)) {
+				ret = -EACCES;
+				goto out_unlock;
+			}
+		}
+	}
+
+out_unlock:
+	rcu_read_unlock();
+out:
+	if (ret)
+		log_failure(syscall);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(seccomp_test_filters);
+
+/* seccomp_parse_filters - populates state with seccomp_filters
+ *
+ * @state: seccomp state to manage the filters on.
+ * @buf: list of seccomp filters of the format:
+ *       directive: ftrace_filter_string
+ * Returns 0 on success or non-zero if any filter is invalid.
+ *
+ * A directive may be any valid syscalls enter event name,
+ * like sys_newuname, or SECCOMP_DIRECTIVE_ON_NEXT.
+ *
+ * See parse_seccomp_filter for exact behavior.
+ */
+int seccomp_parse_filters(struct seccomp_state *state, char *buf)
+{
+	char *filter, *next;
+	int err = 0;
+	next = buf;
+	while ((filter = strsep(&next, "\n")) != NULL) {
+		if (!filter[0])
+			continue;
+		if ((err = parse_seccomp_filter(state, filter)))
+			break;
+	}
+	return err;
+}
+EXPORT_SYMBOL_GPL(seccomp_parse_filters);
diff --git a/kernel/sys.c b/kernel/sys.c
index af468ed..1d3f27e 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1620,6 +1620,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 {
 	struct task_struct *me = current;
 	unsigned char comm[sizeof(me->comm)];
+	char *seccomp_filter = NULL;
 	long error;
 
 	error = security_task_prctl(option, arg2, arg3, arg4, arg5);
@@ -1703,7 +1704,15 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 			error = prctl_get_seccomp();
 			break;
 		case PR_SET_SECCOMP:
-			error = prctl_set_seccomp(arg2);
+			if (arg3 && SECCOMP_MAX_FILTER_LENGTH) {
+				seccomp_filter = strndup_user(
+						   (char __user *)arg3,
+						   SECCOMP_MAX_FILTER_LENGTH);
+				if (IS_ERR(seccomp_filter)) /* never NULL */
+					return PTR_ERR(seccomp_filter);
+			}
+			error = prctl_set_seccomp(arg2, seccomp_filter);
+			kfree(seccomp_filter);
 			break;
 		case PR_GET_TSC:
 			error = GET_TSC_CTL(arg2);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 61d7d59..e21b9eb 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -240,6 +240,16 @@ config FTRACE_SYSCALLS
 	help
 	  Basic tracer to catch the syscall entry and exit events.
 
+config SECCOMP_FILTER
+	bool "Enable seccomp trace event-based filtering"
+	depends on FTRACE_SYSCALLS
+	select SECCOMP
+	help
+	  Per-process, inherited system call filtering using shared code
+	  across seccomp and ftrace_syscalls.
+
+	  See Documentation/trace/seccomp_filter.txt for more detail.
+
 config TRACE_BRANCH_PROFILING
 	bool
 	select GENERIC_TRACER
-- 
1.7.0.4


^ permalink raw reply related	[flat|nested] 406+ messages in thread

* [PATCH 4/7] seccomp_filter: add process state reporting
  2011-04-28  3:08 [PATCH 2/7] tracing: split out syscall_trace_enter construction Will Drewry
  2011-04-28  3:08 ` [PATCH 3/7] seccomp_filter: Enable ftrace-based system call filtering Will Drewry
@ 2011-04-28  3:08 ` Will Drewry
  2011-04-28  3:21   ` KOSAKI Motohiro
  2011-05-12  3:04   ` [PATCH 4/5] v2 " Will Drewry
  2011-04-28  3:08 ` [PATCH 5/7] seccomp_filter: Document what seccomp_filter is and how it works Will Drewry
                   ` (2 subsequent siblings)
  4 siblings, 2 replies; 406+ messages in thread
From: Will Drewry @ 2011-04-28  3:08 UTC (permalink / raw)
  To: linux-kernel
  Cc: kees.cook, eparis, agl, mingo, jmorris, rostedt, Will Drewry,
	Andrew Morton, Alexey Dobriyan, David Howells, Al Viro,
	David Rientjes, KOSAKI Motohiro, Stephen Wilson

Adds seccomp and seccomp_filter status reporting to proc.
/proc/<pid>/status will include a Seccomp field, and
/proc/<pid>/seccomp_filter will provide read-only access
to the current filter and bitmask set for seccomp_filters.

Signed-off-by: Will Drewry <wad@chromium.org>
---
 fs/proc/array.c |   21 +++++++++++++++++++++
 fs/proc/base.c  |   25 +++++++++++++++++++++++++
 2 files changed, 46 insertions(+), 0 deletions(-)

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 5e4f776..c35ec60 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -77,6 +77,7 @@
 #include <linux/cpuset.h>
 #include <linux/rcupdate.h>
 #include <linux/delayacct.h>
+#include <linux/seccomp.h>
 #include <linux/seq_file.h>
 #include <linux/pid_namespace.h>
 #include <linux/ptrace.h>
@@ -337,6 +338,19 @@ static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
 	seq_putc(m, '\n');
 }
 
+static void task_show_seccomp(struct seq_file *m, struct task_struct *p) {
+#ifdef CONFIG_SECCOMP
+	struct seccomp_state *st;
+	int seccomp_mode;	/* 0 is reported when the task has no state */
+	rcu_read_lock();
+	st = get_seccomp_state(rcu_dereference(p->seccomp.state));
+	seccomp_mode = st ? st->mode : 0;
+	rcu_read_unlock();
+	put_seccomp_state(st);	/* balances get_seccomp_state() above */
+	seq_printf(m, "Seccomp:\t%d\n", seccomp_mode);
+#endif
+}
+
 int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
 			struct pid *pid, struct task_struct *task)
 {
@@ -354,6 +368,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
 	task_cpus_allowed(m, task);
 	cpuset_task_status_allowed(m, task);
 	task_context_switch_counts(m, task);
+	task_show_seccomp(m, task);
 	return 0;
 }
 
@@ -544,3 +559,9 @@ int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns,
 
 	return 0;
 }
+
+int proc_pid_seccomp(struct seq_file *m, struct pid_namespace *ns,
+			struct pid *pid, struct task_struct *task)
+{
+	return 0;	/* stub: writes nothing to @m and reports success */
+}
diff --git a/fs/proc/base.c b/fs/proc/base.c
index dfa5327..4b6f0c7 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -73,6 +73,7 @@
 #include <linux/security.h>
 #include <linux/ptrace.h>
 #include <linux/tracehook.h>
+#include <linux/seccomp.h>
 #include <linux/cgroup.h>
 #include <linux/cpuset.h>
 #include <linux/audit.h>
@@ -579,6 +580,24 @@ static int proc_pid_syscall(struct task_struct *task, char *buffer)
 }
 #endif /* CONFIG_HAVE_ARCH_TRACEHOOK */
 
+#ifdef CONFIG_SECCOMP_FILTER
+/* Print the task's current seccomp filter set to @m.  The state pointer is
+ * read with rcu_dereference() and a reference is taken under rcu_read_lock().
+ */
+int proc_pid_seccomp_filter_show(struct seq_file *m, struct pid_namespace *ns,
+				 struct pid *pid, struct task_struct *task)
+{
+	struct seccomp_state *state;
+
+	rcu_read_lock();
+	state = get_seccomp_state(rcu_dereference(task->seccomp.state));
+	rcu_read_unlock();
+	seccomp_show_filters(state, m);
+	put_seccomp_state(state);
+	return 0;
+}
+#endif
+
 /************************************************************************/
 /*                       Here the fs part begins                        */
 /************************************************************************/
@@ -2838,6 +2857,9 @@ static const struct pid_entry tgid_base_stuff[] = {
 #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
 	INF("syscall",    S_IRUGO, proc_pid_syscall),
 #endif
+#ifdef CONFIG_SECCOMP_FILTER
+	ONE("seccomp_filter",     S_IRUSR, proc_pid_seccomp_filter_show),
+#endif
 	INF("cmdline",    S_IRUGO, proc_pid_cmdline),
 	ONE("stat",       S_IRUGO, proc_tgid_stat),
 	ONE("statm",      S_IRUGO, proc_pid_statm),
@@ -3180,6 +3202,9 @@ static const struct pid_entry tid_base_stuff[] = {
 #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
 	INF("syscall",   S_IRUGO, proc_pid_syscall),
 #endif
+#ifdef CONFIG_SECCOMP_FILTER
+	ONE("seccomp_filter",     S_IRUSR, proc_pid_seccomp_filter_show),
+#endif
 	INF("cmdline",   S_IRUGO, proc_pid_cmdline),
 	ONE("stat",      S_IRUGO, proc_tid_stat),
 	ONE("statm",     S_IRUGO, proc_pid_statm),
-- 
1.7.0.4


^ permalink raw reply related	[flat|nested] 406+ messages in thread

* [PATCH 5/7] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-04-28  3:08 [PATCH 2/7] tracing: split out syscall_trace_enter construction Will Drewry
  2011-04-28  3:08 ` [PATCH 3/7] seccomp_filter: Enable ftrace-based system call filtering Will Drewry
  2011-04-28  3:08 ` [PATCH 4/7] seccomp_filter: add process state reporting Will Drewry
@ 2011-04-28  3:08 ` Will Drewry
  2011-04-28  7:06   ` Ingo Molnar
  2011-04-28 15:46   ` Randy Dunlap
  2011-04-28  3:08 ` [PATCH 6/7] include/linux/syscalls.h: add __ layer of macros with return types Will Drewry
  2011-04-28  3:08 ` [PATCH 7/7] arch/x86: hook int returning system calls Will Drewry
  4 siblings, 2 replies; 406+ messages in thread
From: Will Drewry @ 2011-04-28  3:08 UTC (permalink / raw)
  To: linux-kernel
  Cc: kees.cook, eparis, agl, mingo, jmorris, rostedt, Will Drewry,
	Randy Dunlap

Adds a text file covering what CONFIG_SECCOMP_FILTER is, how it is
implemented presently, and what it may be used for.  In addition,
the limitations and caveats of the proposed implementation are
included.

Signed-off-by: Will Drewry <wad@chromium.org>
---
 Documentation/trace/seccomp_filter.txt |   75 ++++++++++++++++++++++++++++++++
 1 files changed, 75 insertions(+), 0 deletions(-)
 create mode 100644 Documentation/trace/seccomp_filter.txt

diff --git a/Documentation/trace/seccomp_filter.txt b/Documentation/trace/seccomp_filter.txt
new file mode 100644
index 0000000..6a0fd33
--- /dev/null
+++ b/Documentation/trace/seccomp_filter.txt
@@ -0,0 +1,75 @@
+		Seccomp filtering
+		=================
+
+Introduction
+------------
+
+A large number of system calls are exposed to every userland process
+with many of them going unused for the entire lifetime of the
+application.  As system calls change and mature, bugs are found and
+quashed.  A certain subset of userland applications benefits from having
+a reduced set of available system calls.  The reduced set shrinks the
+total kernel surface exposed to the application.  System call filtering
+is meant for use with those applications.
+
+The implementation currently leverages both the existing seccomp
+infrastructure and the kernel tracing infrastructure.  By centralizing
+hooks for attack surface reduction in seccomp, it is possible to assure
+attention to security that is less relevant in normal ftrace scenarios,
+such as time of check, time of use attacks.  However, ftrace provides a
+rich, human-friendly environment for specifying system calls by name and
+expected arguments.  (As such, this requires FTRACE_SYSCALLS.)
+
+
+What it isn't
+-------------
+
+System call filtering isn't a sandbox.  It provides a clearly defined
+mechanism for minimizing the exposed kernel surface.  Beyond that, policy for
+logical behavior and information flow should be managed with an LSM of your
+choosing.
+
+
+Usage
+-----
+
+An additional seccomp mode is exposed through mode '2'.  This mode
+depends on CONFIG_SECCOMP_FILTER which in turn depends on
+CONFIG_FTRACE_SYSCALLS.
+
+A collection of filters may be supplied via prctl, and the current set of
+filters is exposed in /proc/<pid>/seccomp_filter.
+
+For instance,
+  const char filters[] =
+    "sys_read: (fd == 1) || (fd == 2)\n"
+    "sys_write: (fd == 0)\n"
+    "sys_exit: 1\n"
+    "sys_exit_group: 1\n"
+    "on_next_syscall: 1";
+  prctl(PR_SET_SECCOMP, 2, filters);
+
+This will set up system call filters for read, write, and exit where reading can
+be done only from fds 1 and 2 and writing to fd 0.  The "on_next_syscall" directive tells
+seccomp to not enforce the ruleset until after the next system call is run.  This allows
+for launchers to apply system call filters to a binary before executing it.
+
+Once enabled, the access may only be reduced.  For example, a set of filters may be:
+
+  sys_read: 1
+  sys_write: 1
+  sys_mmap: 1
+  sys_prctl: 1
+
+Then it may call the following to drop mmap access:
+  prctl(PR_SET_SECCOMP, 2, "sys_mmap: 0");
+
+
+Caveats
+-------
+
+The system call names come from ftrace events.  At present, many system
+calls are not hooked - such as x86's ptregs wrapped system calls.
+
+In addition, compat tasks will not be supported until the sys32 system
+calls are hooked.
-- 
1.7.0.4


^ permalink raw reply related	[flat|nested] 406+ messages in thread

* [PATCH 6/7] include/linux/syscalls.h: add __ layer of macros with return types.
  2011-04-28  3:08 [PATCH 2/7] tracing: split out syscall_trace_enter construction Will Drewry
                   ` (2 preceding siblings ...)
  2011-04-28  3:08 ` [PATCH 5/7] seccomp_filter: Document what seccomp_filter is and how it works Will Drewry
@ 2011-04-28  3:08 ` Will Drewry
  2011-04-28  3:08 ` [PATCH 7/7] arch/x86: hook int returning system calls Will Drewry
  4 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-04-28  3:08 UTC (permalink / raw)
  To: linux-kernel
  Cc: kees.cook, eparis, agl, mingo, jmorris, rostedt, Will Drewry,
	Steven Rostedt, Frederic Weisbecker, Masami Hiramatsu,
	Mathieu Desnoyers

This change addresses two issues directly:
1. The inability for ftrace to hook system calls that don't return
   a long.
2. The fact that SYSCALL_DEFINE() does not touch ftrace at all

1 is fixed by adding __SYSCALL_DEFINEx/0 and __SYSCALL_TRACEx/0
macros which lie underneath the normal calls,
SYSCALL_DEFINE0 and SYSCALL_DEFINE[1-6].  All existing calls will
continue to work normally and SYSCALL_DEFINE calls will become
ftrace-able but without argument inspection support.

2 is addressed by separating out SYSCALL_TRACE0 and pulling it under
SYSCALL_DEFINE.  It means that calls that may lack argument
introspection will still be traceable in a binary way without any
additional code changes.

There are still some challenges for wrapping calls that have a
trampoline directly in the assembly, like the ptregs calls in
arch/x86/kernel (e.g., sys_clone).  Given that the arguments to the
function do not directly map to those of the system call, the main
macros are not friendly for use.  However, the SYSCALL_TRACE0 macro in
this change could be used as a placeholder (declared above the function
definition) to allow limited tracing functionality until a better
solution is presented.

Signed-off-by: Will Drewry <wad@chromium.org>
---
 include/linux/syscalls.h |   52 +++++++++++++++++++++++++++++++---------------
 1 files changed, 35 insertions(+), 17 deletions(-)

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 83ecc17..1c4a3fb 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -171,7 +171,7 @@ extern struct trace_event_functions exit_syscall_print_funcs;
 	  __attribute__((section("__syscalls_metadata")))	\
 	 *__p_syscall_meta_##sname = &__syscall_meta_##sname;
 
-#define SYSCALL_DEFINE0(sname)					\
+#define __SYSCALL_TRACE0(linkage, ret, sname)			\
 	SYSCALL_TRACE_ENTER_EVENT(_##sname);			\
 	SYSCALL_TRACE_EXIT_EVENT(_##sname);			\
 	static struct syscall_metadata __used			\
@@ -185,12 +185,16 @@ extern struct trace_event_functions exit_syscall_print_funcs;
 	};							\
 	static struct syscall_metadata __used			\
 	  __attribute__((section("__syscalls_metadata")))	\
-	 *__p_syscall_meta_##sname = &__syscall_meta__##sname;	\
-	asmlinkage long sys_##sname(void)
+	 *__p_syscall_meta_##sname = &__syscall_meta__##sname;
 #else
-#define SYSCALL_DEFINE0(name)	   asmlinkage long sys_##name(void)
+#define __SYSCALL_TRACE0(linkage, ret, sname)
 #endif
 
+#define __SYSCALL_DEFINE0(linkage, ret, sname)			\
+	__SYSCALL_TRACE0(linkage, ret, sname)			\
+	linkage ret sys_##sname(void)
+
+#define SYSCALL_DEFINE0(name) __SYSCALL_DEFINE0(asmlinkage, long, name)
 #define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__)
 #define SYSCALL_DEFINE2(name, ...) SYSCALL_DEFINEx(2, _##name, __VA_ARGS__)
 #define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)
@@ -214,39 +218,53 @@ extern struct trace_event_functions exit_syscall_print_funcs;
 
 #ifdef CONFIG_FTRACE_SYSCALLS
 #define SYSCALL_DEFINEx(x, sname, ...)				\
+	__SYSCALL_DEFINEx(asmlinkage, long, x, sname, __VA_ARGS__)
+
+#define __SYSCALL_TRACEx(linkage, ret, x, sname, ...)	\
 	static const char *types_##sname[] = {			\
 		__SC_STR_TDECL##x(__VA_ARGS__)			\
 	};							\
 	static const char *args_##sname[] = {			\
 		__SC_STR_ADECL##x(__VA_ARGS__)			\
 	};							\
-	SYSCALL_METADATA(sname, x);				\
-	__SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
+	SYSCALL_METADATA(sname, x);				
 #else
 #define SYSCALL_DEFINEx(x, sname, ...)				\
-	__SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
+	__SYSCALL_DEFINEx(asmlinkage, long, x, sname, __VA_ARGS__)
+#define __SYSCALL_TRACEx(linkage, ret, x, sname, ...)
 #endif
 
 #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
 
-#define SYSCALL_DEFINE(name) static inline long SYSC_##name
-
-#define __SYSCALL_DEFINEx(x, name, ...)					\
-	asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__));		\
-	static inline long SYSC##name(__SC_DECL##x(__VA_ARGS__));	\
-	asmlinkage long SyS##name(__SC_LONG##x(__VA_ARGS__))		\
+#define SYSCALL_DEFINE(name)	/* long-returning wrapper */	\
+	__SYSCALL_DEFINE(long, name)
+#define __SYSCALL_DEFINE(ret, name)	/* trace metadata + SYSC_ stub */ \
+	__SYSCALL_TRACE0(, ret, name)					\
+	static inline ret SYSC_##name
+
+#define __SYSCALL_DEFINEx(linkage, ret, x, name, ...)			\
+	__SYSCALL_TRACEx(linkage, ret, x, name, __VA_ARGS__)		\
+	linkage ret sys##name(__SC_DECL##x(__VA_ARGS__));		\
+	static inline ret SYSC##name(__SC_DECL##x(__VA_ARGS__));	\
+	linkage ret SyS##name(__SC_LONG##x(__VA_ARGS__))		\
 	{								\
 		__SC_TEST##x(__VA_ARGS__);				\
-		return (long) SYSC##name(__SC_CAST##x(__VA_ARGS__));	\
+		return (ret) SYSC##name(__SC_CAST##x(__VA_ARGS__));	\
 	}								\
 	SYSCALL_ALIAS(sys##name, SyS##name);				\
-	static inline long SYSC##name(__SC_DECL##x(__VA_ARGS__))
+	static inline ret SYSC##name(__SC_DECL##x(__VA_ARGS__))
 
 #else /* CONFIG_HAVE_SYSCALL_WRAPPERS */
 
-#define SYSCALL_DEFINE(name) asmlinkage long sys_##name
-#define __SYSCALL_DEFINEx(x, name, ...)					\
-	asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__))
+#define SYSCALL_DEFINE(name)						\
+	__SYSCALL_DEFINE(asmlinkage, long, name)
+#define __SYSCALL_DEFINE(linkage, ret, name)				\
+	__SYSCALL_TRACE0(linkage, ret, name)				\
+	linkage ret sys_##name
+
+#define __SYSCALL_DEFINEx(linkage, ret, x, name, ...)			\
+	__SYSCALL_TRACEx(linkage, ret, x, name, __VA_ARGS__)		\
+	linkage ret sys##name(__SC_DECL##x(__VA_ARGS__))
 
 #endif /* CONFIG_HAVE_SYSCALL_WRAPPERS */
 
-- 
1.7.0.4


^ permalink raw reply related	[flat|nested] 406+ messages in thread

* [PATCH 7/7] arch/x86: hook int returning system calls
  2011-04-28  3:08 [PATCH 2/7] tracing: split out syscall_trace_enter construction Will Drewry
                   ` (3 preceding siblings ...)
  2011-04-28  3:08 ` [PATCH 6/7] include/linux/syscalls.h: add __ layer of macros with return types Will Drewry
@ 2011-04-28  3:08 ` Will Drewry
  4 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-04-28  3:08 UTC (permalink / raw)
  To: linux-kernel
  Cc: kees.cook, eparis, agl, mingo, jmorris, rostedt, Will Drewry,
	Thomas Gleixner, Ingo Molnar, H. Peter Anvin

Using the newly specified __SYSCALL_DEFINEx helpers, redefine
int-returning system calls with the macro to enable ftrace access.

Signed-off-by: Will Drewry <wad@chromium.org>
---
 arch/x86/kernel/ldt.c |    5 +++--
 arch/x86/kernel/tls.c |    7 +++++--
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index ea69726..3f1160c 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -10,6 +10,7 @@
 #include <linux/gfp.h>
 #include <linux/sched.h>
 #include <linux/string.h>
+#include <linux/syscalls.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
 #include <linux/vmalloc.h>
@@ -245,8 +246,8 @@ out:
 	return error;
 }
 
-asmlinkage int sys_modify_ldt(int func, void __user *ptr,
-			      unsigned long bytecount)
+__SYSCALL_DEFINEx(asmlinkage, int, 3, _modify_ldt,
+		  int, func, void __user *, ptr, unsigned long, bytecount)
 {
 	int ret = -ENOSYS;
 
diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c
index 6bb7b85..0f27a6b 100644
--- a/arch/x86/kernel/tls.c
+++ b/arch/x86/kernel/tls.c
@@ -1,6 +1,7 @@
 #include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/sched.h>
+#include <linux/syscalls.h>
 #include <linux/user.h>
 #include <linux/regset.h>
 
@@ -90,7 +91,8 @@ int do_set_thread_area(struct task_struct *p, int idx,
 	return 0;
 }
 
-asmlinkage int sys_set_thread_area(struct user_desc __user *u_info)
+__SYSCALL_DEFINEx(asmlinkage, int, 1, _set_thread_area,
+		  struct user_desc __user *, u_info)
 {
 	int ret = do_set_thread_area(current, -1, u_info, 1);
 	asmlinkage_protect(1, ret, u_info);
@@ -140,7 +142,8 @@ int do_get_thread_area(struct task_struct *p, int idx,
 	return 0;
 }
 
-asmlinkage int sys_get_thread_area(struct user_desc __user *u_info)
+__SYSCALL_DEFINEx(asmlinkage, int, 1, _get_thread_area,
+		  struct user_desc __user *, u_info)
 {
 	int ret = do_get_thread_area(current, -1, u_info);
 	asmlinkage_protect(1, ret, u_info);
-- 
1.7.0.4


^ permalink raw reply related	[flat|nested] 406+ messages in thread

* Re: [PATCH 4/7] seccomp_filter: add process state reporting
  2011-04-28  3:08 ` [PATCH 4/7] seccomp_filter: add process state reporting Will Drewry
@ 2011-04-28  3:21   ` KOSAKI Motohiro
  2011-04-28  3:24     ` Will Drewry
  2011-05-12  3:04   ` [PATCH 4/5] v2 " Will Drewry
  1 sibling, 1 reply; 406+ messages in thread
From: KOSAKI Motohiro @ 2011-04-28  3:21 UTC (permalink / raw)
  To: Will Drewry
  Cc: kosaki.motohiro, linux-kernel, kees.cook, eparis, agl, mingo,
	jmorris, rostedt, Andrew Morton, Alexey Dobriyan, David Howells,
	Al Viro, David Rientjes, Stephen Wilson

> Adds seccomp and seccomp_filter status reporting to proc.
> /proc/<pid>/status will include a Seccomp field, and
> /proc/<pid>/seccomp_filter will provide read-only access
> to the current filter and bitmask set for seccomp_filters.
> 
> Signed-off-by: Will Drewry <wad@chromium.org>
> ---
>  fs/proc/array.c |   21 +++++++++++++++++++++
>  fs/proc/base.c  |   25 +++++++++++++++++++++++++
>  2 files changed, 46 insertions(+), 0 deletions(-)

I'm not againt seccomp_filter. but I dislike to increase /proc/<pid>/status mess.
1) it's read from a lot of applications, I don't want to worry about performance
thing. 2) 99.99% user never use seccomp. this field is useless for them.

Can't you make individual seccomp specific file?




^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 4/7] seccomp_filter: add process state reporting
  2011-04-28  3:21   ` KOSAKI Motohiro
@ 2011-04-28  3:24     ` Will Drewry
  2011-04-28  3:40       ` Al Viro
  2011-04-28 22:54       ` James Morris
  0 siblings, 2 replies; 406+ messages in thread
From: Will Drewry @ 2011-04-28  3:24 UTC (permalink / raw)
  To: KOSAKI Motohiro
  Cc: linux-kernel, kees.cook, eparis, agl, mingo, jmorris, rostedt,
	Andrew Morton, Alexey Dobriyan, David Howells, Al Viro,
	David Rientjes, Stephen Wilson

On Wed, Apr 27, 2011 at 10:21 PM, KOSAKI Motohiro
<kosaki.motohiro@jp.fujitsu.com> wrote:
>> Adds seccomp and seccomp_filter status reporting to proc.
>> /proc/<pid>/status will include a Seccomp field, and
>> /proc/<pid>/seccomp_filter will provide read-only access
>> to the current filter and bitmask set for seccomp_filters.
>>
>> Signed-off-by: Will Drewry <wad@chromium.org>
>> ---
>>  fs/proc/array.c |   21 +++++++++++++++++++++
>>  fs/proc/base.c  |   25 +++++++++++++++++++++++++
>>  2 files changed, 46 insertions(+), 0 deletions(-)
>
> I'm not againt seccomp_filter. but I dislike to increase /proc/<pid>/status mess.
> 1) it's read from a lot of applications, I don't want to worry about performance
> thing. 2) 99.99% user never use seccomp. this field is useless for them.
>
> Can't you make individual seccomp specific file?

Definitely.  Would it make sense to have /proc/<pid>/seccomp and
/proc/<pid>/seccomp_filter?

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 4/7] seccomp_filter: add process state reporting
  2011-04-28  3:24     ` Will Drewry
@ 2011-04-28  3:40       ` Al Viro
  2011-04-28  3:43         ` Will Drewry
  2011-04-28 22:54       ` James Morris
  1 sibling, 1 reply; 406+ messages in thread
From: Al Viro @ 2011-04-28  3:40 UTC (permalink / raw)
  To: Will Drewry
  Cc: KOSAKI Motohiro, linux-kernel, kees.cook, eparis, agl, mingo,
	jmorris, rostedt, Andrew Morton, Alexey Dobriyan, David Howells,
	David Rientjes, Stephen Wilson

On Wed, Apr 27, 2011 at 10:24:20PM -0500, Will Drewry wrote:

> Definitely.  Would it make sense to have /proc/<pid>/seccomp and
> /proc/<pid>/seccomp_filter?

Just one question: WTF bother with S_IRUSR?  Note that it's absolutely
_useless_ in procfs; any kind of permission checks must be done in
read(2) time since doing it in open(2) is worthless.  Consider execve()
on suid binary; oops, there goes your uid and there goes the effect
of checks done by open(2).  And if you *do* checks in read(2), why bother
duplicating them in open(2)?

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 4/7] seccomp_filter: add process state reporting
  2011-04-28  3:40       ` Al Viro
@ 2011-04-28  3:43         ` Will Drewry
  0 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-04-28  3:43 UTC (permalink / raw)
  To: Al Viro
  Cc: KOSAKI Motohiro, linux-kernel, kees.cook, eparis, agl, mingo,
	jmorris, rostedt, Andrew Morton, Alexey Dobriyan, David Howells,
	David Rientjes, Stephen Wilson

On Wed, Apr 27, 2011 at 10:40 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> On Wed, Apr 27, 2011 at 10:24:20PM -0500, Will Drewry wrote:
>
>> Definitely.  Would it make sense to have /proc/<pid>/seccomp and
>> /proc/<pid>/seccomp_filter?
>
> Just one question: WTF bother with S_IRUSR?  Note that it's absolutely
> _useless_ in procfs; any kind of permission checks must be done in
> read(2) time since doing it in open(2) is worthless.  Consider execve()
> on suid binary; oops, there goes your uid and there goes the effect
> of checks done by open(2).  And if you *do* checks in read(2), why bother
> duplicating them in open(2)?

In earlier versions I was allowing filter/bitmask updating via the
proc file (which I nixed :).  Is S_IRUGO preferred? I don't see any
crazy information leakage by sharing the filters/ruleset.  I'll fold
it into the next version of this patch.

thanks!
will

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 5/7] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-04-28  3:08 ` [PATCH 5/7] seccomp_filter: Document what seccomp_filter is and how it works Will Drewry
@ 2011-04-28  7:06   ` Ingo Molnar
  2011-04-28 14:56     ` Eric Paris
  2011-04-28 17:43     ` Serge E. Hallyn
  2011-04-28 15:46   ` Randy Dunlap
  1 sibling, 2 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-04-28  7:06 UTC (permalink / raw)
  To: Will Drewry
  Cc: linux-kernel, kees.cook, eparis, agl, jmorris, rostedt,
	Randy Dunlap, Linus Torvalds, Andrew Morton, Tom Zanussi,
	Frédéric Weisbecker, Arnaldo Carvalho de Melo,
	Peter Zijlstra, Thomas Gleixner


* Will Drewry <wad@chromium.org> wrote:

> +A collection of filters may be supplied via prctl, and the current set of
> +filters is exposed in /proc/<pid>/seccomp_filter.
> +
> +For instance,
> +  const char filters[] =
> +    "sys_read: (fd == 1) || (fd == 2)\n"
> +    "sys_write: (fd == 0)\n"
> +    "sys_exit: 1\n"
> +    "sys_exit_group: 1\n"
> +    "on_next_syscall: 1";
> +  prctl(PR_SET_SECCOMP, 2, filters);
> +
> +This will setup system call filters for read, write, and exit where reading can
> +be done only from fds 1 and 2 and writing to fd 0.  The "on_next_syscall" directive tells
> +seccomp to not enforce the ruleset until after the next system call is run.  This allows
> +for launchers to apply system call filters to a binary before executing it.
> +
> +Once enabled, the access may only be reduced.  For example, a set of filters may be:
> +
> +  sys_read: 1
> +  sys_write: 1
> +  sys_mmap: 1
> +  sys_prctl: 1
> +
> +Then it may call the following to drop mmap access:
> +  prctl(PR_SET_SECCOMP, 2, "sys_mmap: 0");

Ok, color me thoroughly impressed - AFAICS you implemented my suggestions in:

  http://lwn.net/Articles/332974/

and you made it work in practice!

We could split out the ftrace filter engine some more and make it more 
independent of ftrace. It's basically an in-kernel interpreter able to run off 
tracepoints.

I've Cc:-ed Linus and Andrew: are you guys opposed to such flexible, dynamic 
filters conceptually? I think we should really think hard about the actual ABI 
as this could easily spread to more applications than Chrome/Chromium.

Btw., i also think that such an approach is actually the sane(r) design to 
implement security modules: using such filters is far more flexible than the 
typical LSM approach of privileged user-space uploading various nasty objects 
into kernel space and implementing silly (and limited and intrusive) hooks 
there, like SElinux and the other security modules do.

This approach also has the ability to become recursive (gets inherited by child 
tasks, which could add their own filters) and unprivileged - unlike LSMs.

I like this *a lot* more than any security sandboxing approach i've seen 
before.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/7] seccomp_filter: Enable ftrace-based system call filtering
  2011-04-28  3:08 ` [PATCH 3/7] seccomp_filter: Enable ftrace-based system call filtering Will Drewry
@ 2011-04-28 13:50   ` Steven Rostedt
  2011-04-28 15:30     ` Will Drewry
  2011-04-28 14:29   ` Frederic Weisbecker
                     ` (4 subsequent siblings)
  5 siblings, 1 reply; 406+ messages in thread
From: Steven Rostedt @ 2011-04-28 13:50 UTC (permalink / raw)
  To: Will Drewry
  Cc: linux-kernel, kees.cook, eparis, agl, mingo, jmorris,
	Frederic Weisbecker, Ingo Molnar, Andrew Morton, Tejun Heo,
	Michal Marek, Oleg Nesterov, Roland McGrath, Peter Zijlstra,
	Jiri Slaby, David Howells, Serge E. Hallyn

On Wed, 2011-04-27 at 22:08 -0500, Will Drewry wrote:

> -typedef struct { int mode; } seccomp_t;
> +/**
> + * struct seccomp_state - the state of a seccomp'ed process
> + *
> + * @mode:
> + *     if this is 1, the process is under standard seccomp rules
> + *             is 2, the process is only allowed to make system calls where
> + *                   the corresponding bit is set in bitmask and any
> + *                   associated filters evaluate successfully.

I hate arbitrary numbers. This is not a big deal, but can we create an
enum or define that puts names for the modes.

SECCOMP_MODE_BASIC
SECCOMP_MODE_FILTER

??

> + * @usage: number of references to the current instance.
> + * @bitmask: a mask of allowed or filtered system calls and additional flags.
> + * @filter_count: number of seccomp filters in @filters.
> + * @filters: list of seccomp_filter entries for  system calls.
> + */
> +struct seccomp_state {
> +	uint16_t mode;
> +	atomic_t usage;
> +	DECLARE_BITMAP(bitmask, NR_syscalls + 1);
> +	int filter_count;
> +	struct list_head filters;
> +};
> +
> +typedef struct { struct seccomp_state *state; } seccomp_t;
>  
>  extern void __secure_computing(int);
>  static inline void secure_computing(int this_syscall)
> @@ -16,8 +39,14 @@ static inline void secure_computing(int this_syscall)
>  		__secure_computing(this_syscall);
>  }
>  


> diff --git a/kernel/fork.c b/kernel/fork.c
> index e7548de..bdcf70b 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -34,6 +34,7 @@
>  #include <linux/cgroup.h>
>  #include <linux/security.h>
>  #include <linux/hugetlb.h>
> +#include <linux/seccomp.h>
>  #include <linux/swap.h>
>  #include <linux/syscalls.h>
>  #include <linux/jiffies.h>
> @@ -169,6 +170,9 @@ void free_task(struct task_struct *tsk)
>  	free_thread_info(tsk->stack);
>  	rt_mutex_debug_task_free(tsk);
>  	ftrace_graph_exit_task(tsk);
> +#ifdef CONFIG_SECCOMP
> +	put_seccomp_state(tsk->seccomp.state);
> +#endif
>  	free_task_struct(tsk);
>  }
>  EXPORT_SYMBOL(free_task);
> @@ -280,6 +284,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
>  	if (err)
>  		goto out;
>  
> +#ifdef CONFIG_SECCOMP
> +	tsk->seccomp.state = get_seccomp_state(orig->seccomp.state);
> +#endif

I wonder if we could macro these to get rid of the #ifdefs in fork.c.

#define put_seccomp_state(state)	do { } while (0)
#define assign_seccomp_state(src, state) do { } while (0)

??

> +
>  	setup_thread_stack(tsk, orig);
>  	clear_user_return_notifier(tsk);
>  	clear_tsk_need_resched(tsk);
> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> index 57d4b13..1bee87c 100644
> --- a/kernel/seccomp.c
> +++ b/kernel/seccomp.c
> @@ -8,10 +8,11 @@
>  
>  #include <linux/seccomp.h>
>  #include <linux/sched.h>
> +#include <linux/slab.h>
>  #include <linux/compat.h>
>  
>  /* #define SECCOMP_DEBUG 1 */
> -#define NR_SECCOMP_MODES 1
> +#define NR_SECCOMP_MODES 2
>  
>  /*
>   * Secure computing mode 1 allows only read/write/exit/sigreturn.
> @@ -32,9 +33,11 @@ static int mode1_syscalls_32[] = {
>  
>  void __secure_computing(int this_syscall)
>  {
> -	int mode = current->seccomp.mode;
> +	int mode = -1;
>  	int * syscall;
> -
> +	/* Do we need an RCU read lock to access current's state? */

I'm actually confused to why you are using RCU. What are you protecting.
Currently, I see the state is always accessed from current->seccomp. But
current should not be fighting with itself.

Maybe I'm missing something.

-- Steve

> +	if (current->seccomp.state)
> +		mode = current->seccomp.state->mode;
>  	switch (mode) {
>  	case 1:
>  		syscall = mode1_syscalls;
> @@ -47,6 +50,16 @@ void __secure_computing(int this_syscall)
>  				return;
>  		} while (*++syscall);
>  		break;
> +	case 2:
> +#ifdef CONFIG_COMPAT
> +		if (is_compat_task())
> +			/* XXX: No compat support yet. */
> +			break;
> +#endif
> +		if (!seccomp_test_filters(current->seccomp.state,
> +					  this_syscall))
> +			return;
> +		break;
>  	default:
>  		BUG();
>  	}
> @@ -57,30 +70,139 @@ void __secure_computing(int this_syscall)
>  	do_exit(SIGKILL);



^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/7] seccomp_filter: Enable ftrace-based system call filtering
  2011-04-28  3:08 ` [PATCH 3/7] seccomp_filter: Enable ftrace-based system call filtering Will Drewry
  2011-04-28 13:50   ` Steven Rostedt
@ 2011-04-28 14:29   ` Frederic Weisbecker
  2011-04-28 15:15     ` Will Drewry
  2011-04-28 15:12   ` Frederic Weisbecker
                     ` (3 subsequent siblings)
  5 siblings, 1 reply; 406+ messages in thread
From: Frederic Weisbecker @ 2011-04-28 14:29 UTC (permalink / raw)
  To: Will Drewry
  Cc: linux-kernel, kees.cook, eparis, agl, mingo, jmorris, rostedt,
	Ingo Molnar, Andrew Morton, Tejun Heo, Michal Marek,
	Oleg Nesterov, Roland McGrath, Peter Zijlstra, Jiri Slaby,
	David Howells, Serge E. Hallyn

On Wed, Apr 27, 2011 at 10:08:47PM -0500, Will Drewry wrote:
> This change adds a new seccomp mode based on the work by
> agl@chromium.org. This mode comes with a bitmask of NR_syscalls size and
> an optional linked list of seccomp_filter objects. When in mode 2, all

Since you now use the filters, why not use them to filter syscalls
entirely rather than using a bitmap of allowed syscalls?

You have the "nr" field in syscall tracepoints.

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 5/7] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-04-28  7:06   ` Ingo Molnar
@ 2011-04-28 14:56     ` Eric Paris
  2011-04-28 18:37       ` Will Drewry
  2011-04-28 17:43     ` Serge E. Hallyn
  1 sibling, 1 reply; 406+ messages in thread
From: Eric Paris @ 2011-04-28 14:56 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Will Drewry, linux-kernel, kees.cook, agl, jmorris, rostedt,
	Randy Dunlap, Linus Torvalds, Andrew Morton, Tom Zanussi,
	Frédéric Weisbecker, Arnaldo Carvalho de Melo,
	Peter Zijlstra, Thomas Gleixner

On Thu, 2011-04-28 at 09:06 +0200, Ingo Molnar wrote:
> * Will Drewry <wad@chromium.org> wrote:
> 
> > +A collection of filters may be supplied via prctl, and the current set of
> > +filters is exposed in /proc/<pid>/seccomp_filter.
> > +
> > +For instance,
> > +  const char filters[] =
> > +    "sys_read: (fd == 1) || (fd == 2)\n"
> > +    "sys_write: (fd == 0)\n"
> > +    "sys_exit: 1\n"
> > +    "sys_exit_group: 1\n"
> > +    "on_next_syscall: 1";
> > +  prctl(PR_SET_SECCOMP, 2, filters);
> > +
> > +This will setup system call filters for read, write, and exit where reading can
> > +be done only from fds 1 and 2 and writing to fd 0.  The "on_next_syscall" directive tells
> > +seccomp to not enforce the ruleset until after the next system call is run.  This allows
> > +for launchers to apply system call filters to a binary before executing it.
> > +
> > +Once enabled, the access may only be reduced.  For example, a set of filters may be:
> > +
> > +  sys_read: 1
> > +  sys_write: 1
> > +  sys_mmap: 1
> > +  sys_prctl: 1
> > +
> > +Then it may call the following to drop mmap access:
> > +  prctl(PR_SET_SECCOMP, 2, "sys_mmap: 0");
> 
> Ok, color me thoroughly impressed

Me too!

> I've Cc:-ed Linus and Andrew: are you guys opposed to such flexible, dynamic 
> filters conceptually? I think we should really think hard about the actual ABI 
> as this could easily spread to more applications than Chrome/Chromium.

I'll definitely port QEMU to use this new interface rather than my more
rigid flexible (haha "rigid flexible") seccomp.  I'll see if I run into
any issues with this ABI in that porting...

> Btw., i also think that such an approach is actually the sane(r) design to 
> implement security modules: using such filters is far more flexible than the 
> typical LSM approach of privileged user-space uploading various nasty objects 
> into kernel space and implementing silly (and limited and intrusive) hooks 
> there, like SElinux and the other security modules do.

Then you are wrong.  There's no question that this interface can provide
great extensions to the current discretionary functionality provided by
legacy security controls but if you actually want to mediate what tasks
can do to other tasks or can do to arbitrary objects on the system this
doesn't cut it.  Every system call that takes or uses a structure as an
argument or that uses copy_from_user (for something other than just
unparsed data) is uncontrollable.

This approach is great and with careful coding of userspace apps can be
made very useful in constraining those apps, but a replacement for
mandatory access control it is not.

> This approach also has the ability to become recursive (gets inherited by child 
> tasks, which could add their own filters) and unprivileged - unlike LSMs.

LSMs have that ability.  There's nothing to prevent a module loading
service to allow unpriv applications to further constrain themselves.
It's just the difference between DAC and MAC.  You are clearly a DAC guy,
and there is no question this change is great in that mindset,  but you
don't seem to understand either the flexibility of the LSM or the
purpose of some of the modules implemented on top of the LSM.

> I like this *a lot* more than any security sandboxing approach i've seen 
> before.

I like this *a lot*.  It will be a HUGE addition to the security
sandboxing approaches I've seen before.

-Eric


^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/7] seccomp_filter: Enable ftrace-based system call filtering
  2011-04-28  3:08 ` [PATCH 3/7] seccomp_filter: Enable ftrace-based system call filtering Will Drewry
  2011-04-28 13:50   ` Steven Rostedt
  2011-04-28 14:29   ` Frederic Weisbecker
@ 2011-04-28 15:12   ` Frederic Weisbecker
  2011-04-28 15:20     ` Frederic Weisbecker
  2011-04-28 15:29     ` Will Drewry
  2011-04-28 16:28   ` Steven Rostedt
                     ` (2 subsequent siblings)
  5 siblings, 2 replies; 406+ messages in thread
From: Frederic Weisbecker @ 2011-04-28 15:12 UTC (permalink / raw)
  To: Will Drewry
  Cc: linux-kernel, kees.cook, eparis, agl, mingo, jmorris, rostedt,
	Ingo Molnar, Andrew Morton, Tejun Heo, Michal Marek,
	Oleg Nesterov, Roland McGrath, Peter Zijlstra, Jiri Slaby,
	David Howells, Serge E. Hallyn

On Wed, Apr 27, 2011 at 10:08:47PM -0500, Will Drewry wrote:
> This change adds a new seccomp mode based on the work by
> agl@chromium.org. This mode comes with a bitmask of NR_syscalls size and
> an optional linked list of seccomp_filter objects. When in mode 2, all
> system calls are first checked against the bitmask to determine if they
> are allowed or denied.  If allowed, the list of filters is checked for
> the given syscall number. If all filter predicates for the system call
> match or the system call was allowed without restriction, the process
> continues. Otherwise, it is killed and a KERN_INFO notification is
> posted.
> 
> The filter language itself is provided by the ftrace filter engine.
> Related patches tweak to the perf filter trace and free allow the calls
> to be shared. Filters inherit their understanding of types and arguments
> for each system call from the CONFIG_FTRACE_SYSCALLS subsystem which
> predefines this information in syscall_metadata associated enter_event
> (and exit_event) structures.
> 
> The result is that a process may reduce its available interfaces to
> the kernel through prctl() without knowing the appropriate system call
> number a priori and with the flexibility of filtering based on
> register-stored arguments.  (String checks suffer from TOCTOU issues and
> should be left to LSMs to provide policy for! Don't get greedy :)
> 
> A sample filterset for a process that only needs to interact over stdin
> and stdout and exit cleanly is shown below:
>   sys_read: fd == 0
>   sys_write: fd == 1
>   sys_exit_group: 1
> 
> The filters may be specified once prior to entering the reduced access
> state:
>   prctl(PR_SET_SECCOMP, 2, filters);

Instead of having such a multiline filter definition with syscall
names prepended, it would be nicer to make the parsing simpler.

You could have either:

	prctl(PR_SET_SECCOMP, mode);
	/* Works only if we are in mode 2 */
	prctl(PR_SET_SECCOMP_FILTER, syscall_nr, filter);

or:
	/*
	 * If mode == 2, set the filter to syscall_nr
	 * Call this again for each syscall that needs a filter.
	 * If a filter was previously set on the targeted syscall,
	 * it will be overwritten.
	 */
	prctl(PR_SET_SECCOMP, mode, syscall_nr, filter);

One can erase a previous filter by setting the new filter "1".

Also, instead of having a bitmap of syscalls to accept, you could
simply set "0" as a filter to those you want to deactivate:

prctl(PR_SET_SECCOMP, 2, 1, 0); <- deactivate the syscall_nr 1

Hm?

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/7] seccomp_filter: Enable ftrace-based system call filtering
  2011-04-28 14:29   ` Frederic Weisbecker
@ 2011-04-28 15:15     ` Will Drewry
  2011-04-28 15:57       ` Frederic Weisbecker
  0 siblings, 1 reply; 406+ messages in thread
From: Will Drewry @ 2011-04-28 15:15 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: linux-kernel, kees.cook, eparis, agl, mingo, jmorris, rostedt,
	Ingo Molnar, Andrew Morton, Tejun Heo, Michal Marek,
	Oleg Nesterov, Roland McGrath, Peter Zijlstra, Jiri Slaby,
	David Howells, Serge E. Hallyn

On Thu, Apr 28, 2011 at 9:29 AM, Frederic Weisbecker <fweisbec@gmail.com> wrote:
> On Wed, Apr 27, 2011 at 10:08:47PM -0500, Will Drewry wrote:
>> This change adds a new seccomp mode based on the work by
>> agl@chromium.org. This mode comes with a bitmask of NR_syscalls size and
>> an optional linked list of seccomp_filter objects. When in mode 2, all
>
> Since you now use the filters. Why not using them to filter syscalls
> entirely rather than using a bitmap of allowed syscalls?

The current approach just uses a linked list of filters.  While a more
efficient data structure could be used, the bitmask provides a quick
binary decision, and optimizes for the relatively common case where
there won't be many non-binary filters to evaluate so we don't have to
walk the list for a larger number of yes/no decisions versus more
complex predicates.  Though that may be a short-sighted view! I'm
happy to change it up.

> You have the "nr" field in syscall tracepoints.

I'm not sure I follow.  Do you mean moving entirely to using the
actual tracepoint infrastructure instead of using the seccomp hooks,
or just looking up proper filter by syscall nr?  If there's a sane and
better way to do the latter, I'm all ears :)  As far as using the
tracepoints themselves, I looked to how the perf/ftrace interactions
worked and while I could've registered with the syscalls tracepoints
for enter and exit, it would mean later evaluation of the system call
interception, possibly out-of-order with respect to other registered
event sinks, and there is complexity in just killing current from
within the notifier-like list registered syscall events (as Eric Paris
ran into when expanding filtering into perf itself).  To get around
that, the tracepoint handler would have to pump the data somewhere
else (like it does for perf), and it just seemed messy.  I think it's
doable, but I don't know that the pure syscall tracepoint
infrastructure should be burdened with the added requirements that
come with seccomp-filtering.   If I didn't properly understand the
code, though, please set me on the right path.

thanks!
will

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/7] seccomp_filter: Enable ftrace-based system call filtering
  2011-04-28 15:12   ` Frederic Weisbecker
@ 2011-04-28 15:20     ` Frederic Weisbecker
  2011-04-28 15:29     ` Will Drewry
  1 sibling, 0 replies; 406+ messages in thread
From: Frederic Weisbecker @ 2011-04-28 15:20 UTC (permalink / raw)
  To: Will Drewry
  Cc: linux-kernel, kees.cook, eparis, agl, mingo, jmorris, rostedt,
	Ingo Molnar, Andrew Morton, Tejun Heo, Michal Marek,
	Oleg Nesterov, Roland McGrath, Peter Zijlstra, Jiri Slaby,
	David Howells, Serge E. Hallyn

On Thu, Apr 28, 2011 at 05:12:44PM +0200, Frederic Weisbecker wrote:
> On Wed, Apr 27, 2011 at 10:08:47PM -0500, Will Drewry wrote:
> > This change adds a new seccomp mode based on the work by
> > agl@chromium.org. This mode comes with a bitmask of NR_syscalls size and
> > an optional linked list of seccomp_filter objects. When in mode 2, all
> > system calls are first checked against the bitmask to determine if they
> > are allowed or denied.  If allowed, the list of filters is checked for
> > the given syscall number. If all filter predicates for the system call
> > match or the system call was allowed without restriction, the process
> > continues. Otherwise, it is killed and a KERN_INFO notification is
> > posted.
> > 
> > The filter language itself is provided by the ftrace filter engine.
> > Related patches tweak to the perf filter trace and free allow the calls
> > to be shared. Filters inherit their understanding of types and arguments
> > for each system call from the CONFIG_FTRACE_SYSCALLS subsystem which
> > predefines this information in syscall_metadata associated enter_event
> > (and exit_event) structures.
> > 
> > The result is that a process may reduce its available interfaces to
> > the kernel through prctl() without knowing the appropriate system call
> > number a priori and with the flexibility of filtering based on
> > register-stored arguments.  (String checks suffer from TOCTOU issues and
> > should be left to LSMs to provide policy for! Don't get greedy :)
> > 
> > A sample filterset for a process that only needs to interact over stdin
> > and stdout and exit cleanly is shown below:
> >   sys_read: fd == 0
> >   sys_write: fd == 1
> >   sys_exit_group: 1
> > 
> > The filters may be specified once prior to entering the reduced access
> > state:
> >   prctl(PR_SET_SECCOMP, 2, filters);
> 
> Instead of having such multiline filter definition with syscall
> names prepended, it would be nicer to make the parsing simplier.
> 
> You could have either:
> 
> 	prctl(PR_SET_SECCOMP, mode);
> 	/* Works only if we are in mode 2 */
> 	prctl(PR_SET_SECCOMP_FILTER, syscall_nr, filter);
> 
> or:
> 	/*
> 	 * If mode == 2, set the filter to syscall_nr
> 	 * Recall this for each syscall that need a filter.
> 	 * If a filter was previously set on the targeted syscall,
> 	 * it will be overwritten.
> 	 */
> 	prctl(PR_SET_SECCOMP, mode, syscall_nr, filter);
> 
> One can erase a previous filter by setting the new filter "1".
> 
> Also, instead of having a bitmap of syscall to accept. You could
> simply set "0" as a filter to those you want to deactivate:
> 
> prctl(PR_SET_SECCOMP, 2, 1, 0); <- deactivate the syscall_nr 1


I meant "0" and not 0. Because a NULL filter would actually mean we
don't have a filter, which would be the same as "1".

> 
> Hm?

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/7] seccomp_filter: Enable ftrace-based system call filtering
  2011-04-28 15:12   ` Frederic Weisbecker
  2011-04-28 15:20     ` Frederic Weisbecker
@ 2011-04-28 15:29     ` Will Drewry
  2011-04-28 16:13       ` Frederic Weisbecker
  1 sibling, 1 reply; 406+ messages in thread
From: Will Drewry @ 2011-04-28 15:29 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: linux-kernel, kees.cook, eparis, agl, mingo, jmorris, rostedt,
	Ingo Molnar, Andrew Morton, Tejun Heo, Michal Marek,
	Oleg Nesterov, Roland McGrath, Peter Zijlstra, Jiri Slaby,
	David Howells, Serge E. Hallyn

On Thu, Apr 28, 2011 at 10:12 AM, Frederic Weisbecker
<fweisbec@gmail.com> wrote:
> On Wed, Apr 27, 2011 at 10:08:47PM -0500, Will Drewry wrote:
>> This change adds a new seccomp mode based on the work by
>> agl@chromium.org. This mode comes with a bitmask of NR_syscalls size and
>> an optional linked list of seccomp_filter objects. When in mode 2, all
>> system calls are first checked against the bitmask to determine if they
>> are allowed or denied.  If allowed, the list of filters is checked for
>> the given syscall number. If all filter predicates for the system call
>> match or the system call was allowed without restriction, the process
>> continues. Otherwise, it is killed and a KERN_INFO notification is
>> posted.
>>
>> The filter language itself is provided by the ftrace filter engine.
>> Related patches tweak to the perf filter trace and free allow the calls
>> to be shared. Filters inherit their understanding of types and arguments
>> for each system call from the CONFIG_FTRACE_SYSCALLS subsystem which
>> predefines this information in syscall_metadata associated enter_event
>> (and exit_event) structures.
>>
>> The result is that a process may reduce its available interfaces to
>> the kernel through prctl() without knowing the appropriate system call
>> number a priori and with the flexibility of filtering based on
>> register-stored arguments.  (String checks suffer from TOCTOU issues and
>> should be left to LSMs to provide policy for! Don't get greedy :)
>>
>> A sample filterset for a process that only needs to interact over stdin
>> and stdout and exit cleanly is shown below:
>>   sys_read: fd == 0
>>   sys_write: fd == 1
>>   sys_exit_group: 1
>>
>> The filters may be specified once prior to entering the reduced access
>> state:
>>   prctl(PR_SET_SECCOMP, 2, filters);
>
> Instead of having such multiline filter definition with syscall
> names prepended, it would be nicer to make the parsing simplier.
>
> You could have either:
>
>        prctl(PR_SET_SECCOMP, mode);
>        /* Works only if we are in mode 2 */
>        prctl(PR_SET_SECCOMP_FILTER, syscall_nr, filter);

It'd need to be syscall_name instead of syscall_nr.  Otherwise we're
right back to where Adam's patch was 2+ years ago :)  Using the event
names from the syscalls infrastructure means the consumer of the
interface doesn't need to be confident of the syscall number.  That
said, it would be nice to be able to specify the number as well.  If
there were no complaints, it'd be nice to support both, imo.

> or:
>        /*
>         * If mode == 2, set the filter to syscall_nr
>         * Recall this for each syscall that need a filter.
>         * If a filter was previously set on the targeted syscall,
>         * it will be overwritten.
>         */
>        prctl(PR_SET_SECCOMP, mode, syscall_nr, filter);
>
> One can erase a previous filter by setting the new filter "1".
>
> Also, instead of having a bitmap of syscall to accept. You could
> simply set "0" as a filter to those you want to deactivate:
>
> prctl(PR_SET_SECCOMP, 2, 1, 0); <- deactivate the syscall_nr 1
>
> Hm?

I like the simplicity in not needing to parse anything extra, but it
does add the need for extra state - either a bit or a new field - to
represent "enabled/enforcing".

The only way to do it without a third mode would be to take a
blacklist model - where all syscalls are allowed by default and the
caller has to enumerate them all and drop them. That would definitely
not be the right approach :)

If a new bit of state was added,  it could be used as:
  prctl(PR_SET_SECCOMP, 2);
  prctl(PR_SET_SECCOMP, 2, "sys_read", "fd == 1");  /* add a read filter */
  prctl(PR_SET_SECCOMP, 2, "sys_write", "fd == 0");  /* add a write filter */
  ...
  prctl(PR_SET_SECCOMP, 2, "sys_read", "0");  /* clear the sys_read
filters and block it */  (or NULL?)
  prctl(PR_SET_SECCOMP, 2, "enable");  /* Start enforcing */
  prctl(PR_SET_SECCOMP, 2, "sys_write", "0");  /* Reduce attack
surface on the fly */


As to the "0" filter instead of a bitmask, would it make sense to just
cut over to an hlist now and drop the bitmask?  It looks like perf
uses that model, and I'd hope it wouldn't incur too much additional
overhead.  (The linked list approach now is certainly not scalable for
a large number of filters!)

If that interface seems sane, I can certainly start exploring it and
see if I hit any surprises (and put it in the next version of the
patch :).  I think it'll simplify a fair amount of the add/drop code!

thanks!
will

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/7] seccomp_filter: Enable ftrace-based system call filtering
  2011-04-28 13:50   ` Steven Rostedt
@ 2011-04-28 15:30     ` Will Drewry
  2011-04-28 16:20       ` Serge E. Hallyn
  2011-04-28 16:56       ` Steven Rostedt
  0 siblings, 2 replies; 406+ messages in thread
From: Will Drewry @ 2011-04-28 15:30 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: linux-kernel, kees.cook, eparis, agl, mingo, jmorris,
	Frederic Weisbecker, Ingo Molnar, Andrew Morton, Tejun Heo,
	Michal Marek, Oleg Nesterov, Peter Zijlstra, Jiri Slaby,
	David Howells, Serge E. Hallyn

On Thu, Apr 28, 2011 at 8:50 AM, Steven Rostedt <rostedt@goodmis.org> wrote:
> On Wed, 2011-04-27 at 22:08 -0500, Will Drewry wrote:
>
>> -typedef struct { int mode; } seccomp_t;
>> +/**
>> + * struct seccomp_state - the state of a seccomp'ed process
>> + *
>> + * @mode:
>> + *     if this is 1, the process is under standard seccomp rules
>> + *             is 2, the process is only allowed to make system calls where
>> + *                   the corresponding bit is set in bitmask and any
>> + *                   associated filters evaluate successfully.
>
> I hate arbitrary numbers. This is not a big deal, but can we create an
> enum or define that puts names for the modes.
>
> SECCOMP_MODE_BASIC
> SECCOMP_MODE_FILTER
>
> ??

Works for me.

>> + * @usage: number of references to the current instance.
>> + * @bitmask: a mask of allowed or filtered system calls and additional flags.
>> + * @filter_count: number of seccomp filters in @filters.
>> + * @filters: list of seccomp_filter entries for  system calls.
>> + */
>> +struct seccomp_state {
>> +     uint16_t mode;
>> +     atomic_t usage;
>> +     DECLARE_BITMAP(bitmask, NR_syscalls + 1);
>> +     int filter_count;
>> +     struct list_head filters;
>> +};
>> +
>> +typedef struct { struct seccomp_state *state; } seccomp_t;
>>
>>  extern void __secure_computing(int);
>>  static inline void secure_computing(int this_syscall)
>> @@ -16,8 +39,14 @@ static inline void secure_computing(int this_syscall)
>>               __secure_computing(this_syscall);
>>  }
>>
>
>
>> diff --git a/kernel/fork.c b/kernel/fork.c
>> index e7548de..bdcf70b 100644
>> --- a/kernel/fork.c
>> +++ b/kernel/fork.c
>> @@ -34,6 +34,7 @@
>>  #include <linux/cgroup.h>
>>  #include <linux/security.h>
>>  #include <linux/hugetlb.h>
>> +#include <linux/seccomp.h>
>>  #include <linux/swap.h>
>>  #include <linux/syscalls.h>
>>  #include <linux/jiffies.h>
>> @@ -169,6 +170,9 @@ void free_task(struct task_struct *tsk)
>>       free_thread_info(tsk->stack);
>>       rt_mutex_debug_task_free(tsk);
>>       ftrace_graph_exit_task(tsk);
>> +#ifdef CONFIG_SECCOMP
>> +     put_seccomp_state(tsk->seccomp.state);
>> +#endif
>>       free_task_struct(tsk);
>>  }
>>  EXPORT_SYMBOL(free_task);
>> @@ -280,6 +284,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
>>       if (err)
>>               goto out;
>>
>> +#ifdef CONFIG_SECCOMP
>> +     tsk->seccomp.state = get_seccomp_state(orig->seccomp.state);
>> +#endif
>
> I wonder if we could macro these to get rid of the #ifdefs in fork.c.
>
> #define put_seccomp_state(state)        do { } while (0)
> #define assign_seccomp_state(src, state) do { } while (0)
>
> ??

Makes sense to me.  I'll add it to the next update.

>> +
>>       setup_thread_stack(tsk, orig);
>>       clear_user_return_notifier(tsk);
>>       clear_tsk_need_resched(tsk);
>> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
>> index 57d4b13..1bee87c 100644
>> --- a/kernel/seccomp.c
>> +++ b/kernel/seccomp.c
>> @@ -8,10 +8,11 @@
>>
>>  #include <linux/seccomp.h>
>>  #include <linux/sched.h>
>> +#include <linux/slab.h>
>>  #include <linux/compat.h>
>>
>>  /* #define SECCOMP_DEBUG 1 */
>> -#define NR_SECCOMP_MODES 1
>> +#define NR_SECCOMP_MODES 2
>>
>>  /*
>>   * Secure computing mode 1 allows only read/write/exit/sigreturn.
>> @@ -32,9 +33,11 @@ static int mode1_syscalls_32[] = {
>>
>>  void __secure_computing(int this_syscall)
>>  {
>> -     int mode = current->seccomp.mode;
>> +     int mode = -1;
>>       int * syscall;
>> -
>> +     /* Do we need an RCU read lock to access current's state? */
>
> I'm actually confused to why you are using RCU. What are you protecting.
> Currently, I see the state is always accessed from current->seccomp. But
> current should not be fighting with itself.
>
> Maybe I'm missing something.

I'm sure it's me that's missing something.  I believe the seccomp
pointer can be accessed from:
- current
- via /proc/<pid>/seccomp_filter (read-only)

Given those cases, would it make sense to ditch the RCU interface for it?

Thanks!
will

>
>> +     if (current->seccomp.state)
>> +             mode = current->seccomp.state->mode;
>>       switch (mode) {
>>       case 1:
>>               syscall = mode1_syscalls;
>> @@ -47,6 +50,16 @@ void __secure_computing(int this_syscall)
>>                               return;
>>               } while (*++syscall);
>>               break;
>> +     case 2:
>> +#ifdef CONFIG_COMPAT
>> +             if (is_compat_task())
>> +                     /* XXX: No compat support yet. */
>> +                     break;
>> +#endif
>> +             if (!seccomp_test_filters(current->seccomp.state,
>> +                                       this_syscall))
>> +                     return;
>> +             break;
>>       default:
>>               BUG();
>>       }
>> @@ -57,30 +70,139 @@ void __secure_computing(int this_syscall)
>>       do_exit(SIGKILL);
>
>
>

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 5/7] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-04-28  3:08 ` [PATCH 5/7] seccomp_filter: Document what seccomp_filter is and how it works Will Drewry
  2011-04-28  7:06   ` Ingo Molnar
@ 2011-04-28 15:46   ` Randy Dunlap
  2011-04-28 18:23     ` Will Drewry
  1 sibling, 1 reply; 406+ messages in thread
From: Randy Dunlap @ 2011-04-28 15:46 UTC (permalink / raw)
  To: Will Drewry; +Cc: linux-kernel, kees.cook, eparis, agl, mingo, jmorris, rostedt

On Wed, 27 Apr 2011 22:08:49 -0500 Will Drewry wrote:

> Adds a text file covering what CONFIG_SECCOMP_FILTER is, how it is
> implemented presently, and what it may be used for.  In addition,
> the limitations and caveats of the proposed implementation are
> included.
> 
> Signed-off-by: Will Drewry <wad@chromium.org>
> ---
>  Documentation/trace/seccomp_filter.txt |   75 ++++++++++++++++++++++++++++++++
>  1 files changed, 75 insertions(+), 0 deletions(-)
>  create mode 100644 Documentation/trace/seccomp_filter.txt
> 
> diff --git a/Documentation/trace/seccomp_filter.txt b/Documentation/trace/seccomp_filter.txt
> new file mode 100644
> index 0000000..6a0fd33
> --- /dev/null
> +++ b/Documentation/trace/seccomp_filter.txt
> @@ -0,0 +1,75 @@
> +		Seccomp filtering
> +		=================
> +
> +Introduction
> +------------
> +
> +A large number of system calls are exposed to every userland process
> +with many of them going unused for the entire lifetime of the
> +application.  As system calls change and mature, bugs are found and
> +quashed.  A certain subset of userland applications benefit by having
> +a reduce set of available system calls.  The reduced set reduces the

     reduced

> +total kernel surface exposed to the application.  System call filtering
> +is meant for use with those applications.
> +
> +The implementation currently leverages both the existing seccomp
> +infrastructure and the kernel tracing infrastructure.  By centralizing
> +hooks for attack surface reduction in seccomp, it is possible to assure
> +attention to security that is less relevant in normal ftrace scenarios,
> +such as time of check, time of use attacks.  However, ftrace provides a
> +rich, human-friendly environment for specifying system calls by name and
> +expected arguments.  (As such, this requires FTRACE_SYSCALLS.)
> +
> +
> +What it isn't
> +-------------
> +
> +System call filtering isn't a sandbox.  It provides a clearly defined
> +mechanism for minimizing the exposed kernel surface.  Beyond that, policy for
> +logical behavior and information flow should be managed with an LSM of your
> +choosing.
> +
> +
> +Usage
> +-----
> +
> +An additional seccomp mode is exposed through mode '2'.  This mode
> +depends on CONFIG_SECCOMP_FILTER which in turn depends on
> +CONFIG_FTRACE_SYSCALLS.
> +
> +A collection of filters may be supplied via prctl, and the current set of
> +filters is exposed in /proc/<pid>/seccomp_filter.
> +
> +For instance,
> +  const char filters[] =
> +    "sys_read: (fd == 1) || (fd == 2)\n"
> +    "sys_write: (fd == 0)\n"
> +    "sys_exit: 1\n"
> +    "sys_exit_group: 1\n"
> +    "on_next_syscall: 1";
> +  prctl(PR_SET_SECCOMP, 2, filters);
> +
> +This will setup system call filters for read, write, and exit where reading can
> +be done only from fds 1 and 2 and writing to fd 0.  The "on_next_syscall" directive tells
> +seccomp to not enforce the ruleset until after the next system call is run.  This allows
> +for launchers to apply system call filters to a binary before executing it.
> +
> +Once enabled, the access may only be reduced.  For example, a set of filters may be:
> +
> +  sys_read: 1
> +  sys_write: 1
> +  sys_mmap: 1
> +  sys_prctl: 1
> +
> +Then it may call the following to drop mmap access:
> +  prctl(PR_SET_SECCOMP, 2, "sys_mmap: 0");
> +
> +
> +Caveats
> +-------
> +
> +The system call names come from ftrace events.  At present, many system
> +calls are not hooked - such as x86's ptregs wrapped system calls.
> +
> +In addition compat_task()s will not be supported until a sys32s begin
> +being hooked.

Last sentence is hard to read IMO:
a. what are compat_task()s?
b. what is a sys32s begin?
c. awkward wording, maybe change to:   until a sys32s begin has been hooked.


thanks,
---
~Randy
*** Remember to use Documentation/SubmitChecklist when testing your code ***

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/7] seccomp_filter: Enable ftrace-based system call filtering
  2011-04-28 15:15     ` Will Drewry
@ 2011-04-28 15:57       ` Frederic Weisbecker
  2011-04-28 16:05         ` Will Drewry
  0 siblings, 1 reply; 406+ messages in thread
From: Frederic Weisbecker @ 2011-04-28 15:57 UTC (permalink / raw)
  To: Will Drewry
  Cc: linux-kernel, kees.cook, eparis, agl, mingo, jmorris, rostedt,
	Ingo Molnar, Andrew Morton, Tejun Heo, Michal Marek,
	Oleg Nesterov, Roland McGrath, Peter Zijlstra, Jiri Slaby,
	David Howells, Serge E. Hallyn

On Thu, Apr 28, 2011 at 10:15:04AM -0500, Will Drewry wrote:
> On Thu, Apr 28, 2011 at 9:29 AM, Frederic Weisbecker <fweisbec@gmail.com> wrote:
> > On Wed, Apr 27, 2011 at 10:08:47PM -0500, Will Drewry wrote:
> >> This change adds a new seccomp mode based on the work by
> >> agl@chromium.org. This mode comes with a bitmask of NR_syscalls size and
> >> an optional linked list of seccomp_filter objects. When in mode 2, all
> >
> > Since you now use the filters. Why not using them to filter syscalls
> > entirely rather than using a bitmap of allowed syscalls?
> 
> The current approach just uses a linked list of filters.  While a more
> efficient data structure could be used, the bitmask provides a quick
> binary decision, and optimizes for the relatively common case where
> there won't be many non-binary filters to evaluate so we don't have to
> walk the list for a larger number of yes/no decisions versus more
> complex predicates.  Though that may be a short-sighted view! I'm
> happy to change it up.

Well, using a hlist that points to the filters may not be that much slower.
Dunno, that needs to be measured perhaps.

No big deal for now.

> 
> > You have the "nr" field in syscall tracepoints.
> 
> I'n not sure I follow.  Do you mean moving entirely to using the
> actual tracepoint infrastructure instead of using the seccomp hooks,
> or just looking up proper filter by syscall nr?  If there's a sane and
> better way to do the latter, I'm all ears :)  As far as using the
> tracepoints themselves, I looked to how the perf/ftrace interactions
> worked and while I could've registered with the syscalls tracepoints
> for enter and exit, it would mean later evaluation of the system call
> interception, possibly out-of-order with respect to other registered
> event sinks, and there is complexity in just killing current from
> within the notifier-like list registered syscall events (as Eric Paris
> ran into when expanding filtering into perf itself).  To get around
> that, the tracepoint handler would have to pump the data somewhere
> else (like it does for perf), and it just seemed messy.  I think it's
> doable, but I don't know that the pure syscall tracepoint
> infrastructure should be burdened with the added requirements that
> come with seccomp-filtering.   If I didn't properly understand the
> code, though, please set me on the right path.

No, my bad I was confused. I always post questions that show my
misunderstanding of a new (or not) patchset. It's like a tradition ;)

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/7] seccomp_filter: Enable ftrace-based system call filtering
  2011-04-28 15:57       ` Frederic Weisbecker
@ 2011-04-28 16:05         ` Will Drewry
  0 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-04-28 16:05 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: linux-kernel, kees.cook, eparis, agl, mingo, jmorris, rostedt,
	Ingo Molnar, Andrew Morton, Tejun Heo, Michal Marek,
	Oleg Nesterov, Roland McGrath, Peter Zijlstra, Jiri Slaby,
	David Howells, Serge E. Hallyn

On Thu, Apr 28, 2011 at 10:57 AM, Frederic Weisbecker
<fweisbec@gmail.com> wrote:
> On Thu, Apr 28, 2011 at 10:15:04AM -0500, Will Drewry wrote:
>> On Thu, Apr 28, 2011 at 9:29 AM, Frederic Weisbecker <fweisbec@gmail.com> wrote:
>> > On Wed, Apr 27, 2011 at 10:08:47PM -0500, Will Drewry wrote:
>> >> This change adds a new seccomp mode based on the work by
>> >> agl@chromium.org. This mode comes with a bitmask of NR_syscalls size and
>> >> an optional linked list of seccomp_filter objects. When in mode 2, all
>> >
>> > Since you now use the filters. Why not using them to filter syscalls
>> > entirely rather than using a bitmap of allowed syscalls?
>>
>> The current approach just uses a linked list of filters.  While a more
>> efficient data structure could be used, the bitmask provides a quick
>> binary decision, and optimizes for the relatively common case where
>> there won't be many non-binary filters to evaluate so we don't have to
>> walk the list for a larger number of yes/no decisions versus more
>> complex predicates.  Though that may be a short-sighted view! I'm
>> happy to change it up.
>
> Well, using a hlist that points to the filters may not be that much slower.
> Dunno, that needs to be measured perhaps.
>
> No big deal for now.

Cool - that makes sense.  I just haven't used hlist before and was
reticent to dive in given how much other new territory this was.  I'll
check it out, though.

>>
>> > You have the "nr" field in syscall tracepoints.
>>
>> I'm not sure I follow.  Do you mean moving entirely to using the
>> actual tracepoint infrastructure instead of using the seccomp hooks,
>> or just looking up proper filter by syscall nr?  If there's a sane and
>> better way to do the latter, I'm all ears :)  As far as using the
>> tracepoints themselves, I looked to how the perf/ftrace interactions
>> worked and while I could've registered with the syscalls tracepoints
>> for enter and exit, it would mean later evaluation of the system call
>> interception, possibly out-of-order with respect to other registered
>> event sinks, and there is complexity in just killing current from
>> within the notifier-like list registered syscall events (as Eric Paris
>> ran into when expanding filtering into perf itself).  To get around
>> that, the tracepoint handler would have to pump the data somewhere
>> else (like it does for perf), and it just seemed messy.  I think it's
>> doable, but I don't know that the pure syscall tracepoint
>> infrastructure should be burdened with the added requirements that
>> come with seccomp-filtering.   If I didn't properly understand the
>> code, though, please set me on the right path.
>
> No, my bad I was confused. I always post questions that show my
> misunderstanding of a new (or not) patchset. It's like a tradition ;)

Awesome - I was getting worried I'd missed something terribly obvious!

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/7] seccomp_filter: Enable ftrace-based system call filtering
  2011-04-28 15:29     ` Will Drewry
@ 2011-04-28 16:13       ` Frederic Weisbecker
  2011-04-28 16:48         ` Will Drewry
  0 siblings, 1 reply; 406+ messages in thread
From: Frederic Weisbecker @ 2011-04-28 16:13 UTC (permalink / raw)
  To: Will Drewry
  Cc: linux-kernel, kees.cook, eparis, agl, mingo, jmorris, rostedt,
	Ingo Molnar, Andrew Morton, Tejun Heo, Michal Marek,
	Oleg Nesterov, Roland McGrath, Peter Zijlstra, Jiri Slaby,
	David Howells, Serge E. Hallyn

On Thu, Apr 28, 2011 at 10:29:11AM -0500, Will Drewry wrote:
> On Thu, Apr 28, 2011 at 10:12 AM, Frederic Weisbecker
> <fweisbec@gmail.com> wrote:
> > Instead of having such multiline filter definition with syscall
> > names prepended, it would be nicer to make the parsing simpler.
> >
> > You could have either:
> >
> >        prctl(PR_SET_SECCOMP, mode);
> >        /* Works only if we are in mode 2 */
> >        prctl(PR_SET_SECCOMP_FILTER, syscall_nr, filter);
> 
> It'd need to be syscall_name instead of syscall_nr.  Otherwise we're
> right back to where Adam's patch was 2+ years ago :)  Using the event
> names from the syscalls infrastructure means the consumer of the
> interface doesn't need to be confident of the syscall number.

Is it really a problem? There are libraries that can resolve that. Of
course I can't recall their name.

> 
> > or:
> >        /*
> >         * If mode == 2, set the filter to syscall_nr
> >         * Recall this for each syscall that need a filter.
> >         * If a filter was previously set on the targeted syscall,
> >         * it will be overwritten.
> >         */
> >        prctl(PR_SET_SECCOMP, mode, syscall_nr, filter);
> >
> > One can erase a previous filter by setting the new filter "1".
> >
> > Also, instead of having a bitmap of syscall to accept. You could
> > simply set "0" as a filter to those you want to deactivate:
> >
> > prctl(PR_SET_SECCOMP, 2, 1, 0); <- deactivate the syscall_nr 1
> >
> > Hm?
> 
> I like the simplicity in not needing to parse anything extra, but it
> does add the need for extra state - either a bit or a new field - to
> represent "enabled/enforcing".

And by the way I'm really puzzled about these. I don't understand
well why we need this.

As for the enable_on_next_syscall. The documentation says
it's useful if you want the filter to only apply to the
child. So if fork immediately follows, you will be able to fork
but if the child doesn't have the right to exec, it won't be able
to do so. Same for the mmap() that involves...

So I'm a bit confused about that.

But yeah if that's really needed, it looks to me better to
reduce the parsing and cut it that way:

	prctl(PR_SET_SECCOMP, 2, syscall_name_or_nr, filter);
	prctl(PR_SECCOMP_APPLY_FILTERS, enable_on_next_syscall?)

or something...

> 
> The only way to do it without a third mode would be to take a
> blacklist model - where all syscalls are allowed by default and the
> caller has to enumerate them all and drop them. That would definitely
> not be the right approach :)
> 
> If a new bit of state was added,  it could be used as:
>   prctl(PR_SET_SECCOMP, 2);
>   prctl(PR_SET_SECCOMP, 2, "sys_read", "fd == 1");  /* add a read filter */
>   prctl(PR_SET_SECCOMP, 2, "sys_write", "fd == 0");  /* add a write filter */
>   ...
>   prctl(PR_SET_SECCOMP, 2, "sys_read", "0");  /* clear the sys_read
> filters and block it */  (or NULL?)
>   prctl(PR_SET_SECCOMP, 2, "enable");  /* Start enforcing */
>   prctl(PR_SET_SECCOMP, 2, "sys_write", "0");  /* Reduce attack
> surface on the fly */
> 
> 
> As to the "0" filter instead of a bitmask, would it make sense to just
> cut over to an hlist now and drop the bitmask?
> It looks like perf
> uses that model, and I'd hope it wouldn't incur too much additional
> overhead.  (The linked list approach now is certainly not scalable for
> a large number of filters!)

The linked list certainly doesn't scale there. But either a hlist for
everything, or a hlist + bitmap to check if the syscall is enabled,
why not. Maybe start with a pure hlist for any filters and if performance
issues that really matter are pinpointed, then move the "1" and "0"
implementation to a bitmap.

My guess is that doesn't really matter. If it's "1" then you can
just have an empty set of filters for the syscall and it goes ahead
quickly. If it's "0" then the app fails, which is not what I would
call a fast path.

> If that interface seems sane, I can certainly start exploring it and
> see if I hit any surprises (and put it in the next version of the
> patch :).  I think it'll simplify a fair amount of the add/drop code!

Yup.

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/7] seccomp_filter: Enable ftrace-based system call filtering
  2011-04-28 15:30     ` Will Drewry
@ 2011-04-28 16:20       ` Serge E. Hallyn
  2011-04-28 16:56       ` Steven Rostedt
  1 sibling, 0 replies; 406+ messages in thread
From: Serge E. Hallyn @ 2011-04-28 16:20 UTC (permalink / raw)
  To: Will Drewry
  Cc: Steven Rostedt, linux-kernel, kees.cook, eparis, agl, mingo,
	jmorris, Frederic Weisbecker, Ingo Molnar, Andrew Morton,
	Tejun Heo, Michal Marek, Oleg Nesterov, Peter Zijlstra,
	Jiri Slaby, David Howells

Quoting Will Drewry (wad@chromium.org):
> >>  void __secure_computing(int this_syscall)
> >>  {
> >> -     int mode = current->seccomp.mode;
> >> +     int mode = -1;
> >>       int * syscall;
> >> -
> >> +     /* Do we need an RCU read lock to access current's state? */
> >
> > I'm actually confused to why you are using RCU. What are you protecting.
> > Currently, I see the state is always accessed from current->seccomp. But
> > current should not be fighting with itself.
> >
> > Maybe I'm missing something.
> 
> I'm sure it's me that's missing something.  I believe the seccomp
> pointer can be accessed from:
> - current
> - via /proc/<pid>/seccomp_filter (read-only)
> 
> Given those cases, would it make sense to ditch the RCU interface for it?

ISTM you need them to protect the reader.

-serge

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/7] seccomp_filter: Enable ftrace-based system call filtering
  2011-04-28  3:08 ` [PATCH 3/7] seccomp_filter: Enable ftrace-based system call filtering Will Drewry
                     ` (2 preceding siblings ...)
  2011-04-28 15:12   ` Frederic Weisbecker
@ 2011-04-28 16:28   ` Steven Rostedt
  2011-04-28 16:53     ` Will Drewry
  2011-04-28 16:55   ` Serge E. Hallyn
  2011-05-03  8:39   ` Avi Kivity
  5 siblings, 1 reply; 406+ messages in thread
From: Steven Rostedt @ 2011-04-28 16:28 UTC (permalink / raw)
  To: Will Drewry
  Cc: linux-kernel, kees.cook, eparis, agl, mingo, jmorris,
	Frederic Weisbecker, Ingo Molnar, Andrew Morton, Tejun Heo,
	Michal Marek, Oleg Nesterov, Roland McGrath, Peter Zijlstra,
	Jiri Slaby, David Howells, Serge E. Hallyn

On Wed, 2011-04-27 at 22:08 -0500, Will Drewry wrote:

> The only other twist is that it is possible to delay enforcement by one
> system call by supplying a "on_next_syscall: 1" 'filter'.  This allows
> for a launcher process to fork(), prctl(), then execve() leaving the
> launched binary in a filtered state.

I wonder if the more "unixy" thing to do is, instead of on_next_syscall,
have "enable_on_exec". Where the user could do multiple syscalls but the
filter will not take place until an exec is made?

-- Steve



^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/7] seccomp_filter: Enable ftrace-based system call filtering
  2011-04-28 16:13       ` Frederic Weisbecker
@ 2011-04-28 16:48         ` Will Drewry
  2011-04-28 17:36           ` Frederic Weisbecker
  0 siblings, 1 reply; 406+ messages in thread
From: Will Drewry @ 2011-04-28 16:48 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: linux-kernel, kees.cook, eparis, agl, mingo, jmorris, rostedt,
	Ingo Molnar, Andrew Morton, Tejun Heo, Michal Marek,
	Oleg Nesterov, Roland McGrath, Peter Zijlstra, Jiri Slaby,
	David Howells, Serge E. Hallyn

On Thu, Apr 28, 2011 at 11:13 AM, Frederic Weisbecker
<fweisbec@gmail.com> wrote:
> On Thu, Apr 28, 2011 at 10:29:11AM -0500, Will Drewry wrote:
>> On Thu, Apr 28, 2011 at 10:12 AM, Frederic Weisbecker
>> <fweisbec@gmail.com> wrote:
>> > Instead of having such multiline filter definition with syscall
>> > names prepended, it would be nicer to make the parsing simpler.
>> >
>> > You could have either:
>> >
>> >        prctl(PR_SET_SECCOMP, mode);
>> >        /* Works only if we are in mode 2 */
>> >        prctl(PR_SET_SECCOMP_FILTER, syscall_nr, filter);
>>
>> It'd need to be syscall_name instead of syscall_nr.  Otherwise we're
>> right back to where Adam's patch was 2+ years ago :)  Using the event
>> names from the syscalls infrastructure means the consumer of the
>> interface doesn't need to be confident of the syscall number.
>
> Is it really a problem?

I'd prefer using numbers myself since it gives more flexibility around
filtering entries that haven't yet made it into ftrace syscalls hooks.
However, I think there's some complexity in ensuring it matches the
host kernel.  (It would also allow compat_task support which doesn't
seem to be possible now.)

> There are libraries that can resolve that. Of course I can't recall their name.

asm-generic/unistd.h will provide the mapping, if available. It would
mean that any helper libraries would need to be matched to the locally
running kernel.

If syscall names and numbers are supported, then it would mean that
this change could do both what Adam's original bitmask patch did, by
default, and when CONFIG_FTRACE_SYSCALLS is enabled, it could do name
resolution and apply more complex filters based on the syscalls
events.

Would that make sense?  I wonder if the inconsistency between
sometimes using names and filters and sometimes using numbers and 0|1
would become onerous if userspace applications started using this.
Maybe something like
  prctl(PR_SECCOMP_HAS_FILTERS);
would be desirable?  I dunno.

>> > or:
>> >        /*
>> >         * If mode == 2, set the filter to syscall_nr
>> >         * Recall this for each syscall that need a filter.
>> >         * If a filter was previously set on the targeted syscall,
>> >         * it will be overwritten.
>> >         */
>> >        prctl(PR_SET_SECCOMP, mode, syscall_nr, filter);
>> >
>> > One can erase a previous filter by setting the new filter "1".
>> >
>> > Also, instead of having a bitmap of syscall to accept. You could
>> > simply set "0" as a filter to those you want to deactivate:
>> >
>> > prctl(PR_SET_SECCOMP, 2, 1, 0); <- deactivate the syscall_nr 1
>> >
>> > Hm?
>>
>> I like the simplicity in not needing to parse anything extra, but it
>> does add the need for extra state - either a bit or a new field - to
>> represent "enabled/enforcing".
>
> And by the way I'm really puzzled about these. I don't understand
> well why we need this.

Right now, if you are in seccomp mode !=0 , your system calls will be
hooked (if TIF_SECCOMP is set too). The current patch begins filtering
immediately.  And once filtering is enabled, the process is not
allowed to expand its access to the kernel system calls - only shrink
it.  An "enabled" bit would be needed to tell it that even though it
is running in mode=2, it should/shouldn't kill the process for system
call filter violations and it should allow filters to be added, not
just dropped.  It's why the current patch does it in one step: set the
filters and the mode.  While an enabled bit could be hidden behind
whether TIF_SECCOMP is applied, I'm not sure if that would just be
confusing.  There'd still be a need to APPLY_FILTERS to get the
TIF_SECCOMP flag set to start hooking the system calls.

> As for the enable_on_next_syscall. The documentation says
> it's useful if you want the filter to only apply to the
> child. So if fork immediately follows, you will be able to fork
> but if the child doesn't have the right to exec, it won't be able
> to do so. Same for the mmap() that involves...
>
> So I'm a bit confused about that.

Here's an example:
  http://static.dataspill.org/seccomp_filter/v1/seccomp_launcher.c

You'd use it by calling fork() ... prctl(..., ON_NEXT_SYSCALL) then
execve since you'll already have fork()d.  It seems to work, but maybe
I'm missing something :)

> But yeah if that's really needed, it looks to me better to
> reduce the parsing and cut it that way:
>
>        prctl(PR_SET_SECCOMP, 2, syscall_name_or_nr, filter);
>        prctl(PR_SECCOMP_APPLY_FILTERS, enable_on_next_syscall?)
>
> or something...

Cool - if there are any other flags, it could be something like:
  prctl(PR_SECCOMP_SET_FLAGS, SECCOMP_ENFORCE|SECCOMP_DELAYED_ENFORCEMENT);

But only if there are other flags worth considering. I had a few
others in mind, but now I've completely forgotten :/

>>
>> The only way to do it without a third mode would be to take a
>> blacklist model - where all syscalls are allowed by default and the
>> caller has to enumerate them all and drop them. That would definitely
>> not be the right approach :)
>>
>> If a new bit of state was added,  it could be used as:
>>   prctl(PR_SET_SECCOMP, 2);
>>   prctl(PR_SET_SECCOMP, 2, "sys_read", "fd == 1");  /* add a read filter */
>>   prctl(PR_SET_SECCOMP, 2, "sys_write", "fd == 0");  /* add a write filter */
>>   ...
>>   prctl(PR_SET_SECCOMP, 2, "sys_read", "0");  /* clear the sys_read
>> filters and block it */  (or NULL?)
>>   prctl(PR_SET_SECCOMP, 2, "enable");  /* Start enforcing */
>>   prctl(PR_SET_SECCOMP, 2, "sys_write", "0");  /* Reduce attack
>> surface on the fly */
>>
>>
>> As to the "0" filter instead of a bitmask, would it make sense to just
>> cut over to an hlist now and drop the bitmask?
>> It looks like perf
>> uses that model, and I'd hope it wouldn't incur too much additional
>> overhead.  (The linked list approach now is certainly not scalable for
>> a large number of filters!)
>
> The linked list certainly doesn't scale there. But either a hlist for
> everything, or a hlist + bitmap to check if the syscall is enabled,
> why not. May be start with a pure hlist for any filters and if performance
> issues that really matter are pinpointed, then move the "1" and "0"
> implementation to a bitmap.
>
> My guess is that doesn't really matter. If it's "1" then you can
> just have an empty set of filters for the syscall and it goes ahead
> quickly. If it's "0" then the app fails, which is not what I would
> call a fast path.

Sounds reasonable to me. I'll start to iterate on the suggested
changes and see what emerges.

>> If that interface seems sane, I can certainly start exploring it and
>> see if I hit any surprises (and put it in the next version of the
>> patch :).  I think it'll simplify a fair amount of the add/drop code!
>
> Yup.

Thanks!
will

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/7] seccomp_filter: Enable ftrace-based system call filtering
  2011-04-28 16:28   ` Steven Rostedt
@ 2011-04-28 16:53     ` Will Drewry
  0 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-04-28 16:53 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: linux-kernel, kees.cook, eparis, agl, mingo, jmorris,
	Frederic Weisbecker, Ingo Molnar, Andrew Morton, Tejun Heo,
	Michal Marek, Oleg Nesterov, Roland McGrath, Peter Zijlstra,
	Jiri Slaby, David Howells, Serge E. Hallyn

On Thu, Apr 28, 2011 at 11:28 AM, Steven Rostedt <rostedt@goodmis.org> wrote:
> On Wed, 2011-04-27 at 22:08 -0500, Will Drewry wrote:
>
>> The only other twist is that it is possible to delay enforcement by one
>> system call by supplying a "on_next_syscall: 1" 'filter'.  This allows
>> for a launcher process to fork(), prctl(), then execve() leaving the
>> launched binary in a filtered state.
>
> I wonder if the more "unixy" thing to do is, instead of on_next_syscall,
> have "enable_on_exec". Where the user could do multiple syscalls but the
> filter will not take place until an exec is made?

That's what it was originally, but since ftrace syscalls doesn't wrap
sys_execve on x86, I opted to just say "next syscall".  But of course
I can just check the _NR_execve == syscall_nr and do the right thing.
Duh.

thanks!

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/7] seccomp_filter: Enable ftrace-based system call filtering
  2011-04-28  3:08 ` [PATCH 3/7] seccomp_filter: Enable ftrace-based system call filtering Will Drewry
                     ` (3 preceding siblings ...)
  2011-04-28 16:28   ` Steven Rostedt
@ 2011-04-28 16:55   ` Serge E. Hallyn
  2011-04-28 17:16     ` Steven Rostedt
  2011-05-03  8:39   ` Avi Kivity
  5 siblings, 1 reply; 406+ messages in thread
From: Serge E. Hallyn @ 2011-04-28 16:55 UTC (permalink / raw)
  To: Will Drewry
  Cc: linux-kernel, kees.cook, eparis, agl, mingo, jmorris, rostedt,
	Frederic Weisbecker, Ingo Molnar, Andrew Morton, Tejun Heo,
	Michal Marek, Oleg Nesterov, Roland McGrath, Peter Zijlstra,
	Jiri Slaby, David Howells

Quoting Will Drewry (wad@chromium.org):

Thanks very much for going through with this, Will.  I really hope to see
this go upstream soon.

> This change adds a new seccomp mode based on the work by
> agl@chromium.org. This mode comes with a bitmask of NR_syscalls size and
> an optional linked list of seccomp_filter objects. When in mode 2, all
> system calls are first checked against the bitmask to determine if they
> are allowed or denied.  If allowed, the list of filters is checked for
> the given syscall number. If all filter predicates for the system call
> match or the system call was allowed without restriction, the process
> continues. Otherwise, it is killed and a KERN_INFO notification is
> posted.
> 
> The filter language itself is provided by the ftrace filter engine.
> Related patches tweak the perf filter trace and free to allow the calls
> to be shared. Filters inherit their understanding of types and arguments
> for each system call from the CONFIG_FTRACE_SYSCALLS subsystem which
> predefines this information in syscall_metadata associated enter_event
> (and exit_event) structures.
> 
> The result is that a process may reduce its available interfaces to
> the kernel through prctl() without knowing the appropriate system call
> number a priori and with the flexibility of filtering based on
> register-stored arguments.  (String checks suffer from TOCTOU issues and
> should be left to LSMs to provide policy for! Don't get greedy :)
> 
> A sample filterset for a process that only needs to interact over stdin
> and stdout and exit cleanly is shown below:
>   sys_read: fd == 0
>   sys_write: fd == 1
>   sys_exit_group: 1
> 
> The filters may be specified once prior to entering the reduced access
> state:
>   prctl(PR_SET_SECCOMP, 2, filters);
> If prctl() is in the allowed bitmask, the process may further reduce
> privileges by dropping system calls from the allowed set.
> 
> The only other twist is that it is possible to delay enforcement by one
> system call by supplying a "on_next_syscall: 1" 'filter'.  This allows
> for a launcher process to fork(), prctl(), then execve() leaving the
> launched binary in a filtered state.
> 
> Implementation-wise, seccomp.c now uses seccomp_state struct which is
> managed using RCU primitives.  It contains the system call bitmask and
> a linked list of seccomp_filters (which are also managed as an RCU
> list).  All mutations (barring one optional bit flip) to the
> seccomp_state are always done on a duplicate of the current state value
> which is swapped on prior to use.
> 
> Signed-off-by: Will Drewry <wad@chromium.org>

...

>  void __secure_computing(int this_syscall)
>  {
> -	int mode = current->seccomp.mode;
> +	int mode = -1;
>  	int * syscall;
> -
> +	/* Do we need an RCU read lock to access current's state? */

Nope.

...

> +struct seccomp_state *get_seccomp_state(struct seccomp_state *orig)
> +{
> +	if (!orig)
> +		return NULL;
> +	atomic_inc(&orig->usage);
> +	return orig;
> +}
> +
> +static void __put_seccomp_state(struct seccomp_state *orig)
> +{
> +	WARN_ON(atomic_read(&orig->usage));
> +	seccomp_drop_all_filters(orig);
> +	kfree(orig);
> +}
> +
> +/* put_seccomp_state - decrements the reference count of @orig and may free. */
> +void put_seccomp_state(struct seccomp_state *orig)
> +{
> +	if (!orig)
> +		return;
> +
> +	if (atomic_dec_and_test(&orig->usage))
> +		__put_seccomp_state(orig);
> +}
> +
>  long prctl_get_seccomp(void)
>  {
> -	return current->seccomp.mode;
> +	if (!current->seccomp.state)
> +		return 0;
> +	return current->seccomp.state->mode;
>  }
>  
> -long prctl_set_seccomp(unsigned long seccomp_mode)
> +long prctl_set_seccomp(unsigned long seccomp_mode, char *filters)
>  {
>  	long ret;
> +	struct seccomp_state *state, *orig_state;
>  
> -	/* can set it only once to be even more secure */
> +	rcu_read_lock();
> +	orig_state = get_seccomp_state(rcu_dereference(current->seccomp.state));
> +	rcu_read_unlock();
> +
> +	/* mode 1 can only be set once, but mode 2 may have access reduced. */
>  	ret = -EPERM;
> -	if (unlikely(current->seccomp.mode))
> +	if (orig_state && orig_state->mode != 2 && seccomp_mode != 2)
>  		goto out;
>  
>  	ret = -EINVAL;
> -	if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) {
> -		current->seccomp.mode = seccomp_mode;
> -		set_thread_flag(TIF_SECCOMP);
> +	if (!seccomp_mode || seccomp_mode > NR_SECCOMP_MODES)
> +		goto out;
> +
> +	/* Prepare to modify the seccomp state by dropping the shared
> +	 * reference after duplicating it.
> +	 */
> +	state = (orig_state ? seccomp_state_dup(orig_state) :
> +			      seccomp_state_new());
> +	if (!state) {
> +		ret = -ENOMEM;
> +		goto out;
> +	}
> +
> +	ret = 0;
> +	switch (seccomp_mode) {
> +	case 1:
>  #ifdef TIF_NOTSC
>  		disable_TSC();
>  #endif
> +		state->mode = seccomp_mode;
> +		set_thread_flag(TIF_SECCOMP);
>  		ret = 0;
> +		break;
> +	case 2:
> +		if (filters) {
> +			ret = seccomp_parse_filters(state, filters);
> +			/* No partial applications. */
> +			if (ret)
> +				goto free_state;
> +		}
> +
> +		if (!state->mode) {
> +			state->mode = seccomp_mode;
> +			set_thread_flag(TIF_SECCOMP);
> +		}
> +		break;
> +	default:
> +		ret = -EINVAL;
>  	}
>  
> - out:
> +	rcu_assign_pointer(current->seccomp.state, state);
> +	synchronize_rcu();
> +	put_seccomp_state(orig_state);  /* for the get */
> +
> +out:
> +	put_seccomp_state(orig_state);  /* for the task */
> +	return ret;
> +
> +free_state:
> +	put_seccomp_state(orig_state);  /* for the get */
> +	put_seccomp_state(state);  /* drop the dup */
>  	return ret;
>  }

This looks exactly right.  The only case where put_seccomp_state()
might actually lead to freeing the state is where the current's
state gets reassigned.  So you need to synchronize_rcu() before
that (as you do).  The other cases will only decrement the usage
counter, can race with a reader doing (inc; get) but not with a
final free, which can only be done here.

(Rambling above is just me persuading myself)

> diff --git a/kernel/seccomp_filter.c b/kernel/seccomp_filter.c

Unfortunately your use of filters doesn't seem exactly right.

> +/* seccomp_copy_all_filters - copies all filters from src to dst.
> + *
> + * @dst: the list_head for seccomp_filters to populate.
> + * @src: the list_head for seccomp_filters to copy from.
> + * Returns non-zero on failure.
> + */
> +int seccomp_copy_all_filters(struct list_head *dst,
> +			     const struct list_head *src)
> +{
> +	struct seccomp_filter *filter;
> +	int ret = 0;
> +	BUG_ON(!dst || !src);
> +	if (list_empty(src))
> +		goto done;
> +	rcu_read_lock();
> +	list_for_each_entry(filter, src, list) {
> +		struct seccomp_filter *new_filter = copy_seccomp_filter(filter);

copy_seccomp_filter() causes kzalloc to be called.  You can't do that under
rcu_read_lock().

I actually thought you were going to be more extreme about the seccomp
state than you are:  I thought you were going to tie a filter list to
seccomp state.  So adding or removing a filter would have required
duping the seccomp state, duping all the filters, making the change in
the copy, and then swapping the new state into place.  Slow in the
hopefully rare update case, but safe.

You don't have to do that, but then I'm pretty sure you'll need to add
reference counts to each filter and use rcu cycles to keep a reader from
having the filter disappear mid-read.

-serge

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/7] seccomp_filter: Enable ftrace-based system call filtering
  2011-04-28 15:30     ` Will Drewry
  2011-04-28 16:20       ` Serge E. Hallyn
@ 2011-04-28 16:56       ` Steven Rostedt
  2011-04-28 18:02         ` Will Drewry
  1 sibling, 1 reply; 406+ messages in thread
From: Steven Rostedt @ 2011-04-28 16:56 UTC (permalink / raw)
  To: Will Drewry
  Cc: linux-kernel, kees.cook, eparis, agl, mingo, jmorris,
	Frederic Weisbecker, Ingo Molnar, Andrew Morton, Tejun Heo,
	Michal Marek, Oleg Nesterov, Peter Zijlstra, Jiri Slaby,
	David Howells, Serge E. Hallyn

On Thu, 2011-04-28 at 10:30 -0500, Will Drewry wrote:

> >> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> >> index 57d4b13..1bee87c 100644
> >> --- a/kernel/seccomp.c
> >> +++ b/kernel/seccomp.c
> >> @@ -8,10 +8,11 @@
> >>
> >>  #include <linux/seccomp.h>
> >>  #include <linux/sched.h>
> >> +#include <linux/slab.h>
> >>  #include <linux/compat.h>
> >>
> >>  /* #define SECCOMP_DEBUG 1 */
> >> -#define NR_SECCOMP_MODES 1
> >> +#define NR_SECCOMP_MODES 2
> >>
> >>  /*
> >>   * Secure computing mode 1 allows only read/write/exit/sigreturn.
> >> @@ -32,9 +33,11 @@ static int mode1_syscalls_32[] = {
> >>
> >>  void __secure_computing(int this_syscall)
> >>  {
> >> -     int mode = current->seccomp.mode;
> >> +     int mode = -1;
> >>       int * syscall;
> >> -
> >> +     /* Do we need an RCU read lock to access current's state? */
> >
> > I'm actually confused to why you are using RCU. What are you protecting.
> > Currently, I see the state is always accessed from current->seccomp. But
> > current should not be fighting with itself.
> >
> > Maybe I'm missing something.
> 
> I'm sure it's me that's missing something.  I believe the seccomp
> pointer can be accessed from:
> - current
> - via /proc/<pid>/seccomp_filter (read-only)
> 
> Given those cases, would it make sense to ditch the RCU interface for it?

Looking at this in a bit more detail. I think you can ditch the
rcu_readlocks where current accesses its own seccomp state. As current
is the one that duplicates it (and ups the refcount) on fork, and
current won't free it until after it performs a rcu_synchronization. No
one else can free current's seccomp state while current has a reference
to it.

You still need the rcu_readlocks on the readers for the proc system.
Otherwise the handle can be freed while it's still in use. With the
rcu_readlocks, these readers will always get the refcount before current
frees it. And then the dec_and_test should work as expected when the
readers do the put.

-- Steve



^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/7] seccomp_filter: Enable ftrace-based system call filtering
  2011-04-28 16:55   ` Serge E. Hallyn
@ 2011-04-28 17:16     ` Steven Rostedt
  2011-04-28 17:39       ` Serge E. Hallyn
  0 siblings, 1 reply; 406+ messages in thread
From: Steven Rostedt @ 2011-04-28 17:16 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: Will Drewry, linux-kernel, kees.cook, eparis, agl, mingo,
	jmorris, Frederic Weisbecker, Ingo Molnar, Andrew Morton,
	Tejun Heo, Michal Marek, Oleg Nesterov, Roland McGrath,
	Peter Zijlstra, Jiri Slaby, David Howells

On Thu, 2011-04-28 at 11:55 -0500, Serge E. Hallyn wrote:

> ...
> 
> >  void __secure_computing(int this_syscall)
> >  {
> > -	int mode = current->seccomp.mode;
> > +	int mode = -1;
> >  	int * syscall;
> > -
> > +	/* Do we need an RCU read lock to access current's state? */
> 
> Nope.

Correct.

> > - out:
> > +	rcu_assign_pointer(current->seccomp.state, state);
> > +	synchronize_rcu();
> > +	put_seccomp_state(orig_state);  /* for the get */
> > +
> > +out:
> > +	put_seccomp_state(orig_state);  /* for the task */
> > +	return ret;
> > +
> > +free_state:
> > +	put_seccomp_state(orig_state);  /* for the get */
> > +	put_seccomp_state(state);  /* drop the dup */
> >  	return ret;
> >  }
> 
> This looks exactly right.  The only case where put_seccomp_state()
> might actually lead to freeing the state is where the current's
> state gets reassigned.  So you need to synchronize_rcu() before
> that (as you do).  The other cases will only decrement the usage
> counter, can race with a reader doing (inc; get) but not with a
> final free, which can only be done here.

Technically incorrect ;)

"final free, which can only be done here."

This is not the only place that a free will happen. But the code is
correct none-the-less.

Reader on another CPU ups the orig_state refcount under rcu_readlock,
but after it ups the refcount it releases the rcu_readlock and continues
to read this state.

Current on this CPU calls this function, does the synchronize_rcu(), and
calls put on the state. But since the reader still has a ref count on
it, it does not get freed here.

When the reader is finally done with the state it calls the put() which
does the final free on it.

The code still looks correct, I'm just nitpicking your analysis.

> 
> (Rambling above is just me pursuading myself)

Me rambling too.

> 
> > diff --git a/kernel/seccomp_filter.c b/kernel/seccomp_filter.c
> 
> Unfortunately your use of filters doesn't seem exactly right.
> 
> > +/* seccomp_copy_all_filters - copies all filters from src to dst.
> > + *
> > + * @dst: the list_head for seccomp_filters to populate.
> > + * @src: the list_head for seccomp_filters to copy from.
> > + * Returns non-zero on failure.
> > + */
> > +int seccomp_copy_all_filters(struct list_head *dst,
> > +			     const struct list_head *src)
> > +{
> > +	struct seccomp_filter *filter;
> > +	int ret = 0;
> > +	BUG_ON(!dst || !src);
> > +	if (list_empty(src))
> > +		goto done;
> > +	rcu_read_lock();
> > +	list_for_each_entry(filter, src, list) {
> > +		struct seccomp_filter *new_filter = copy_seccomp_filter(filter);
> 
> copy_seccomp_filter() causes kzalloc to be called.  You can't do that under
> rcu_read_lock().

Unless you change the kzalloc to do GFP_ATOMIC. Not sure I'd recommend
doing that.

> 
> I actually thought you were going to be more extreme about the seccomp
> state than you are:  I thought you were going to tie a filter list to
> seccomp state.  So adding or removing a filter would have required
> duping the seccomp state, duping all the filters, making the change in
> the copy, and then swapping the new state into place.  Slow in the
> hopefully rare update case, but safe.
> 
> You don't have to do that, but then I'm pretty sure you'll need to add
> reference counts to each filter and use rcu cycles to a reader from
> having the filter disappear mid-read.

Or you can preallocate the new filters, call rcu_read_lock(), check if
the number of old filters is the same or less, if more, call
rcu_read_unlock, and try allocating more, and then call rcu_read_lock()
again and repeat. Then just copy the filters to the preallocate ones.
rcu_read_unlock() and then free any unused allocated filters.

Maybe a bit messy, but not that bad.

-- Steve



^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/7] seccomp_filter: Enable ftrace-based system call filtering
  2011-04-28 16:48         ` Will Drewry
@ 2011-04-28 17:36           ` Frederic Weisbecker
  2011-04-28 18:21             ` Will Drewry
  0 siblings, 1 reply; 406+ messages in thread
From: Frederic Weisbecker @ 2011-04-28 17:36 UTC (permalink / raw)
  To: Will Drewry
  Cc: linux-kernel, kees.cook, eparis, agl, mingo, jmorris, rostedt,
	Ingo Molnar, Andrew Morton, Tejun Heo, Michal Marek,
	Oleg Nesterov, Roland McGrath, Peter Zijlstra, Jiri Slaby,
	David Howells, Serge E. Hallyn

On Thu, Apr 28, 2011 at 11:48:47AM -0500, Will Drewry wrote:
> On Thu, Apr 28, 2011 at 11:13 AM, Frederic Weisbecker
> <fweisbec@gmail.com> wrote:
> > On Thu, Apr 28, 2011 at 10:29:11AM -0500, Will Drewry wrote:
> >> On Thu, Apr 28, 2011 at 10:12 AM, Frederic Weisbecker
> >> <fweisbec@gmail.com> wrote:
> >> > Instead of having such multiline filter definition with syscall
> >> > names prepended, it would be nicer to make the parsing simplier.
> >> >
> >> > You could have either:
> >> >
> >> >        prctl(PR_SET_SECCOMP, mode);
> >> >        /* Works only if we are in mode 2 */
> >> >        prctl(PR_SET_SECCOMP_FILTER, syscall_nr, filter);
> >>
> >> It'd need to be syscall_name instead of syscall_nr.  Otherwise we're
> >> right back to where Adam's patch was 2+ years ago :)  Using the event
> >> names from the syscalls infrastructure means the consumer of the
> >> interface doesn't need to be confident of the syscall number.
> >
> > Is it really a problem?
> 
> I'd prefer using numbers myself since it gives more flexibility around
> filtering entries that haven't yet made it into ftrace syscalls hooks.
> However, I think there's some complexity in ensuring it matches the
> host kernel.  (It would also allow compat_task support which doesn't
> seem to be possible now.)

You'd have some problems with compat and syscall naming, I think some
of them are not the same between compat architectures.

Note that your new prctl() request is turning syscall handler names (in-kernel API)
into a user ABI. We won't be able to change the name of those kernel handlers
after that. I'm not sure that's desired.

OTOH, syscall numbers are a user ABI.


> 
> > There are libraries that can resolve that. Of course I can't recall their name.
> 
> asm-generic/unistd.h will provide the mapping, if available. It would
> mean that any helper libraries would need to be matched to the locally
> running kernel.

Yeah. But syscall numbers don't move inside a given architecture (could they?)
Or if so then you'd need a dynamic library.


> >> > or:
> >> >        /*
> >> >         * If mode == 2, set the filter to syscall_nr
> >> >         * Recall this for each syscall that need a filter.
> >> >         * If a filter was previously set on the targeted syscall,
> >> >         * it will be overwritten.
> >> >         */
> >> >        prctl(PR_SET_SECCOMP, mode, syscall_nr, filter);
> >> >
> >> > One can erase a previous filter by setting the new filter "1".
> >> >
> >> > Also, instead of having a bitmap of syscall to accept. You could
> >> > simply set "0" as a filter to those you want to deactivate:
> >> >
> >> > prctl(PR_SET_SECCOMP, 2, 1, 0); <- deactivate the syscall_nr 1
> >> >
> >> > Hm?
> >>
> >> I like the simplicity in not needing to parse anything extra, but it
> >> does add the need for extra state - either a bit or a new field - to
> >> represent "enabled/enforcing".
> >
> > And by the way I'm really puzzled about these. I don't understand
> > well why we need this.
> 
> Right now, if you are in seccomp mode !=0 , your system calls will be
> hooked (if TIF_SECCOMP is set too). The current patch begins filtering
> immediately.  And once filtering is enabled, the process is not
> allowed to expand its access to the kernel system cals - only shrink
> it.  An "enabled" bit would be needed to tell it that even though it
> is running in mode=2, it should/shouldn't kill the process for system
> call filter violations and it should allow filters to be added, not
> just dropped.  It's why the current patch does it in one step: set the
> filters and the mode.  While an enabled bit could be hidden behind
> whether TIF_SECCOMP is applied, I'm not sure if that would just be
> confusing.  There'd still be a need to APPLY_FILTERS to get the
> TIF_SECCOMP flag set to start hooking the system calls.

It seems the filter is only ever needed for the children, right?
And moreover when they exec. So Steve's suggestion to apply
the filters only after the next exec makes sense.

That would solve the problem of both ON_NEXT_SYSCALL and APPLY_FILTER.

I may be missing something obvious though. Do you see a limitation there?

> 
> > As for the enable_on_next_syscall. The documentation says
> > it's useful if you want the filter to only apply to the
> > child. So if fork immediately follows, you will be able to fork
> > but if the child doesn't have the right to exec, it won't be able
> > to do so. Same for the mmap() that involves...
> >
> > So I'm a bit confused about that.
> 
> Here's an example:
>   http://static.dataspill.org/seccomp_filter/v1/seccomp_launcher.c
> 
> You'd use it by calling fork() ... prctl(..., ON_NEXT_SYSCALL) then
> execve since you'll already have fork()d.  It seems to work, but maybe
> I'm missing something :)

Nope, I see.

> 
> > But yeah if that's really needed, it looks to me better to
> > reduce the parsing and cut it that way:
> >
> >        prctl(PR_SET_SECCOMP, 2, syscall_name_or_nr, filter);
> >        prctl(PR_SECCOMP_APPLY_FILTERS, enable_on_next_syscall?)
> >
> > or something...
> 
> Cool - if there are any other flags, it could be something like:
>   prctl(PR_SECCOMP_SET_FLAGS, SECCOMP_ENFORCE|SECCOMP_DELAYED_ENFORCEMENT);
> 
> But only if there are other flags worth considering. I had a few
> others in mind, but now I've completely forgotten :/

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/7] seccomp_filter: Enable ftrace-based system call filtering
  2011-04-28 17:16     ` Steven Rostedt
@ 2011-04-28 17:39       ` Serge E. Hallyn
  2011-04-28 18:01         ` Will Drewry
  0 siblings, 1 reply; 406+ messages in thread
From: Serge E. Hallyn @ 2011-04-28 17:39 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Will Drewry, linux-kernel, kees.cook, eparis, agl, mingo,
	jmorris, Frederic Weisbecker, Ingo Molnar, Andrew Morton,
	Tejun Heo, Michal Marek, Oleg Nesterov, Roland McGrath,
	Peter Zijlstra, Jiri Slaby, David Howells

Quoting Steven Rostedt (rostedt@goodmis.org):
> On Thu, 2011-04-28 at 11:55 -0500, Serge E. Hallyn wrote:
> 
> > ...
> > 
> > >  void __secure_computing(int this_syscall)
> > >  {
> > > -	int mode = current->seccomp.mode;
> > > +	int mode = -1;
> > >  	int * syscall;
> > > -
> > > +	/* Do we need an RCU read lock to access current's state? */
> > 
> > Nope.
> 
> Correct.
> 
> > > - out:
> > > +	rcu_assign_pointer(current->seccomp.state, state);
> > > +	synchronize_rcu();
> > > +	put_seccomp_state(orig_state);  /* for the get */
> > > +
> > > +out:
> > > +	put_seccomp_state(orig_state);  /* for the task */
> > > +	return ret;
> > > +
> > > +free_state:
> > > +	put_seccomp_state(orig_state);  /* for the get */
> > > +	put_seccomp_state(state);  /* drop the dup */
> > >  	return ret;
> > >  }
> > 
> > This looks exactly right.  The only case where put_seccomp_state()
> > might actually lead to freeing the state is where the current's
> > state gets reassigned.  So you need to synchronize_rcu() before
> > that (as you do).  The other cases will only decrement the usage
> > counter, can race with a reader doing (inc; get) but not with a
> > final free, which can only be done here.
> 
> Technically incorrect ;)
> 
> "final free, which can only be done here."
> 
> This is not the only place that a free will happen. But the code is
> correct none-the-less.
> 
> Reader on another CPU ups the orig_state refcount under rcu_readlock,
> but after it ups the refcount it releases the rcu_readlock and continues
> to read this state.
> 
> Current on this CPU calls this function does the synchronize_rcu() and
> calls put on the state. But since the reader still has a ref count on
> it, it does not get freed here.
> 
> When the reader is finally done with the state it calls the put() which
> does the final free on it.
> 
> The code still looks correct, I'm just nitpicking your analysis.

:)  I appreciate the precision.

> > (Rambling above is just me pursuading myself)
> 
> Me rambling too.
> 
> > 
> > > diff --git a/kernel/seccomp_filter.c b/kernel/seccomp_filter.c
> > 
> > Unfortunately your use of filters doesn't seem exactly right.
> > 
> > > +/* seccomp_copy_all_filters - copies all filters from src to dst.
> > > + *
> > > + * @dst: the list_head for seccomp_filters to populate.
> > > + * @src: the list_head for seccomp_filters to copy from.
> > > + * Returns non-zero on failure.
> > > + */
> > > +int seccomp_copy_all_filters(struct list_head *dst,
> > > +			     const struct list_head *src)
> > > +{
> > > +	struct seccomp_filter *filter;
> > > +	int ret = 0;
> > > +	BUG_ON(!dst || !src);
> > > +	if (list_empty(src))
> > > +		goto done;
> > > +	rcu_read_lock();
> > > +	list_for_each_entry(filter, src, list) {
> > > +		struct seccomp_filter *new_filter = copy_seccomp_filter(filter);
> > 
> > copy_seccomp_filter() causes kzalloc to be called.  You can't do that under
> > rcu_read_lock().
> 
> Unless you change the kzalloc to do GFP_ATOMIC. Not sure I'd recommend
> doing that.
> 
> > 
> > I actually thought you were going to be more extreme about the seccomp
> > state than you are:  I thought you were going to tie a filter list to
> > seccomp state.  So adding or removing a filter would have required
> > duping the seccomp state, duping all the filters, making the change in
> > the copy, and then swapping the new state into place.  Slow in the
> > hopefully rare update case, but safe.
> > 
> > You don't have to do that, but then I'm pretty sure you'll need to add
> > reference counts to each filter and use rcu cycles to a reader from
> > having the filter disappear mid-read.
> 
> Or you can preallocate the new filters, call rcu_read_lock(), check if
> the number of old filters is the same or less, if more, call
> rcu_read_unlock, and try allocating more, and then call rcu_read_lock()
> again and repeat. Then just copy the filters to the preallocate ones.
> rcu_read_unlock() and then free any unused allocated filters.
> 
> Maybe a bit messy, but not that bad.

Sounds good.

thanks,
-serge

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 5/7] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-04-28  7:06   ` Ingo Molnar
  2011-04-28 14:56     ` Eric Paris
@ 2011-04-28 17:43     ` Serge E. Hallyn
  1 sibling, 0 replies; 406+ messages in thread
From: Serge E. Hallyn @ 2011-04-28 17:43 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Will Drewry, linux-kernel, kees.cook, eparis, agl, jmorris,
	rostedt, Randy Dunlap, Linus Torvalds, Andrew Morton,
	Tom Zanussi, Frédéric Weisbecker,
	Arnaldo Carvalho de Melo, Peter Zijlstra, Thomas Gleixner

Quoting Ingo Molnar (mingo@elte.hu):
> I've Cc:-ed Linus and Andrew: are you guys opposed to such flexible, dynamic 
> filters conceptually? I think we should really think hard about the actual ABI 
> as this could easily spread to more applications than Chrome/Chromium.

We want to use it for containers, to try and provide some bit of
mitigation for the fact that they are sharing a kernel with the host.

thanks,
-serge

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/7] seccomp_filter: Enable ftrace-based system call filtering
  2011-04-28 17:39       ` Serge E. Hallyn
@ 2011-04-28 18:01         ` Will Drewry
  2011-04-28 18:21           ` Steven Rostedt
  2011-04-28 18:51           ` Serge E. Hallyn
  0 siblings, 2 replies; 406+ messages in thread
From: Will Drewry @ 2011-04-28 18:01 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: Steven Rostedt, linux-kernel, kees.cook, eparis, agl, mingo,
	jmorris, Frederic Weisbecker, Ingo Molnar, Andrew Morton,
	Tejun Heo, Michal Marek, Oleg Nesterov, Roland McGrath,
	Peter Zijlstra, Jiri Slaby, David Howells

On Thu, Apr 28, 2011 at 12:39 PM, Serge E. Hallyn <serge@hallyn.com> wrote:
> Quoting Steven Rostedt (rostedt@goodmis.org):
>> On Thu, 2011-04-28 at 11:55 -0500, Serge E. Hallyn wrote:
>>
>> > ...
>> >
>> > >  void __secure_computing(int this_syscall)
>> > >  {
>> > > - int mode = current->seccomp.mode;
>> > > + int mode = -1;
>> > >   int * syscall;
>> > > -
>> > > + /* Do we need an RCU read lock to access current's state? */
>> >
>> > Nope.
>>
>> Correct.
>>
>> > > - out:
>> > > + rcu_assign_pointer(current->seccomp.state, state);
>> > > + synchronize_rcu();
>> > > + put_seccomp_state(orig_state);  /* for the get */
>> > > +
>> > > +out:
>> > > + put_seccomp_state(orig_state);  /* for the task */
>> > > + return ret;
>> > > +
>> > > +free_state:
>> > > + put_seccomp_state(orig_state);  /* for the get */
>> > > + put_seccomp_state(state);  /* drop the dup */
>> > >   return ret;
>> > >  }
>> >
>> > This looks exactly right.  The only case where put_seccomp_state()
>> > might actually lead to freeing the state is where the current's
>> > state gets reassigned.  So you need to synchronize_rcu() before
>> > that (as you do).  The other cases will only decrement the usage
>> > counter, can race with a reader doing (inc; get) but not with a
>> > final free, which can only be done here.
>>
>> Technically incorrect ;)
>>
>> "final free, which can only be done here."
>>
>> This is not the only place that a free will happen. But the code is
>> correct none-the-less.
>>
>> Reader on another CPU ups the orig_state refcount under rcu_readlock,
>> but after it ups the refcount it releases the rcu_readlock and continues
>> to read this state.
>>
>> Current on this CPU calls this function does the synchronize_rcu() and
>> calls put on the state. But since the reader still has a ref count on
>> it, it does not get freed here.
>>
>> When the reader is finally done with the state it calls the put() which
>> does the final free on it.
>>
>> The code still looks correct, I'm just nitpicking your analysis.
>
> :)  I appreciate the precision.
>
>> > (Rambling above is just me pursuading myself)
>>
>> Me rambling too.
>>
>> >
>> > > diff --git a/kernel/seccomp_filter.c b/kernel/seccomp_filter.c
>> >
>> > Unfortunately your use of filters doesn't seem exactly right.
>> >
>> > > +/* seccomp_copy_all_filters - copies all filters from src to dst.
>> > > + *
>> > > + * @dst: the list_head for seccomp_filters to populate.
>> > > + * @src: the list_head for seccomp_filters to copy from.
>> > > + * Returns non-zero on failure.
>> > > + */
>> > > +int seccomp_copy_all_filters(struct list_head *dst,
>> > > +                      const struct list_head *src)
>> > > +{
>> > > + struct seccomp_filter *filter;
>> > > + int ret = 0;
>> > > + BUG_ON(!dst || !src);
>> > > + if (list_empty(src))
>> > > +         goto done;
>> > > + rcu_read_lock();
>> > > + list_for_each_entry(filter, src, list) {
>> > > +         struct seccomp_filter *new_filter = copy_seccomp_filter(filter);
>> >
>> > copy_seccomp_filter() causes kzalloc to be called.  You can't do that under
>> > rcu_read_lock().
>>
>> Unless you change the kzalloc to do GFP_ATOMIC. Not sure I'd recommend
>> doing that.

Good to know! My question (below) is if I should even be using an RCU
guard at all. I may have been a bit too overzealous.

>> >
>> > I actually thought you were going to be more extreme about the seccomp
>> > state than you are:  I thought you were going to tie a filter list to
>> > seccomp state.  So adding or removing a filter would have required
>> > duping the seccomp state, duping all the filters, making the change in
>> > the copy, and then swapping the new state into place.  Slow in the
>> > hopefully rare update case, but safe.

Hrm, I think I'm confused now!  This is exactly what I *thought* the
code was doing.

At present, seccomp_state can be shared across predecessor/ancestor
relationships using refcounting in fork.c  (get/put). However, the
only way to change a given seccomp_state or its filters is either
through the one-bit on_next_syscall change or through
prctl_set_seccomp.  In prctl_set_seccomp, it does:
state = (orig_state ? seccomp_state_dup(orig_state) :
                              seccomp_state_new());
operates on the new state and then rcu_assign_pointer()s it to the
task.  I didn't intentionally provide any way to drop filters from an
existing state object nor change the filtered syscalls on an in-use
object.  That _dup call should hit the improperly rcu_locked
copy_all_filters returning duplicates of the original filters by
reparsing the filter_string.

Did I accidentally provide a means to mutate a state object or filter
list without dup()ing? :/

>> > You don't have to do that, but then I'm pretty sure you'll need to add
>> > reference counts to each filter and use rcu cycles to a reader from
>> > having the filter disappear mid-read.

Right now, I don't think it is possible for seccomp_copy_all_filters()
to be called with a src list that changes since every change is
guarded by a seccomp_state_dup(). If that's not true, then I violated
my own invariant :/  If that is the case, should I not treat the list
as an RCU list?  There should never be any simultaneous
reader/writers, just a single reader/writer or multiple readers.

>>
>> Or you can preallocate the new filters, call rcu_read_lock(), check if
>> the number of old filters is the same or less, if more, call
>> rcu_read_unlock, and try allocating more, and then call rcu_read_lock()
>> again and repeat. Then just copy the filters to the preallocate ones.
>> rcu_read_unlock() and then free any unused allocated filters.
>>
>> Maybe a bit messy, but not that bad.
>
> Sounds good.

I'd prefer a heavy-weight copy ;)

I think I'm a bit lost -- am I missing something obvious here?  I was
hoping by using a swapped-in-seccomp_state-pointer, locking and
consistency internal to the state objects would be a tad easier -
though expensive.

thanks!
will

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/7] seccomp_filter: Enable ftrace-based system call filtering
  2011-04-28 16:56       ` Steven Rostedt
@ 2011-04-28 18:02         ` Will Drewry
  0 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-04-28 18:02 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: linux-kernel, kees.cook, eparis, agl, mingo, jmorris,
	Frederic Weisbecker, Ingo Molnar, Andrew Morton, Tejun Heo,
	Michal Marek, Oleg Nesterov, Peter Zijlstra, Jiri Slaby,
	David Howells, Serge E. Hallyn

On Thu, Apr 28, 2011 at 11:56 AM, Steven Rostedt <rostedt@goodmis.org> wrote:
> On Thu, 2011-04-28 at 10:30 -0500, Will Drewry wrote:
>
>> >> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
>> >> index 57d4b13..1bee87c 100644
>> >> --- a/kernel/seccomp.c
>> >> +++ b/kernel/seccomp.c
>> >> @@ -8,10 +8,11 @@
>> >>
>> >>  #include <linux/seccomp.h>
>> >>  #include <linux/sched.h>
>> >> +#include <linux/slab.h>
>> >>  #include <linux/compat.h>
>> >>
>> >>  /* #define SECCOMP_DEBUG 1 */
>> >> -#define NR_SECCOMP_MODES 1
>> >> +#define NR_SECCOMP_MODES 2
>> >>
>> >>  /*
>> >>   * Secure computing mode 1 allows only read/write/exit/sigreturn.
>> >> @@ -32,9 +33,11 @@ static int mode1_syscalls_32[] = {
>> >>
>> >>  void __secure_computing(int this_syscall)
>> >>  {
>> >> -     int mode = current->seccomp.mode;
>> >> +     int mode = -1;
>> >>       int * syscall;
>> >> -
>> >> +     /* Do we need an RCU read lock to access current's state? */
>> >
>> > I'm actually confused to why you are using RCU. What are you protecting.
>> > Currently, I see the state is always accessed from current->seccomp. But
>> > current should not be fighting with itself.
>> >
>> > Maybe I'm missing something.
>>
>> I'm sure it's me that's missing something.  I believe the seccomp
>> pointer can be accessed from:
>> - current
>> - via /proc/<pid>/seccomp_filter (read-only)
>>
>> Given those cases, would it make sense to ditch the RCU interface for it?
>
> Looking at this in a bit more detail. I think you can ditch the
> rcu_readlocks where current accesses its own seccomp state. As current
> is the one that duplicates it (and ups the refcount) on fork, and
> current wont free it until after it performs a rcu_synchronization. No
> one else can free current's seccomp state while current has a reference
> to it.
>
> You still need the rcu_readlocks on the readers for the proc system.
> Otherwise the handle can be freed while its still in use. With the
> rcu_readlocks, these readers will always get the refcount before current
> frees it. And then the dec_and_test should work as expected when the
> readers do the put.

Great, I'll do that!

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/7] seccomp_filter: Enable ftrace-based system call filtering
  2011-04-28 18:01         ` Will Drewry
@ 2011-04-28 18:21           ` Steven Rostedt
  2011-04-28 18:34             ` Will Drewry
  2011-04-28 18:51           ` Serge E. Hallyn
  1 sibling, 1 reply; 406+ messages in thread
From: Steven Rostedt @ 2011-04-28 18:21 UTC (permalink / raw)
  To: Will Drewry
  Cc: Serge E. Hallyn, linux-kernel, kees.cook, eparis, agl, mingo,
	jmorris, Frederic Weisbecker, Ingo Molnar, Andrew Morton,
	Tejun Heo, Michal Marek, Oleg Nesterov, Roland McGrath,
	Peter Zijlstra, Jiri Slaby, David Howells

On Thu, 2011-04-28 at 13:01 -0500, Will Drewry wrote:

> Good to know! My question (below) is if I should even be using an RCU
> guard at all. I may have been a bit too overzealous.
> 
> >> >
> >> > I actually thought you were going to be more extreme about the seccomp
> >> > state than you are:  I thought you were going to tie a filter list to
> >> > seccomp state.  So adding or removing a filter would have required
> >> > duping the seccomp state, duping all the filters, making the change in
> >> > the copy, and then swapping the new state into place.  Slow in the
> >> > hopefully rare update case, but safe.
> 
> Hrm, I think I'm confused now!  This is exactly what I *thought* the
> code was doing.

I guess my thought by looking at the code is the call_rcu() to free the
filters in drop_matching_filters().

Also, you have seccomp_drop_all_filters() as a standalone function that
is even exported to modules.

This to me, seems that the filters can disappear at any time. Because
the freeing is done with a call_rcu(), the access to the filters needs a
ref count.

> 
> At present, seccomp_state can be shared across predecessor/ancestor
> relationships using refcounting in fork.c  (get/put). However, the
> only way to change a given seccomp_state or its filters is either
> through the one-bit on_next_syscall change or through
> prctl_set_seccomp.  In prctl_set_seccomp, it does:
> state = (orig_state ? seccomp_state_dup(orig_state) :
>                               seccomp_state_new());
> operates on the new state and then rcu_assign_pointer()s it to the
> task.  I didn't intentionally provide any way to drop filters from an
> existing state object nor change the filtered syscalls on an in-use
> object.  That _dup call should hit the impromperly rcu_locked
> copy_all_filters returning duplicates of the original filters by
> reparsing the filter_string.
> 
> Did I accidentally provide a means to mutate a state object or filter
> list without dup()ing? :/

That seccomp_drop_all_filters() looks like you can.

> 
> >> > You don't have to do that, but then I'm pretty sure you'll need to add
> >> > reference counts to each filter and use rcu cycles to a reader from
> >> > having the filter disappear mid-read.
> 
> Right now, I don't think it is possible for seccomp_copy_all_filters()
> to be called with a src list that changes since every change is
> guarded by a seccomp_state_dup(). If that's not true, then I violated
> my own invariant :/  If that is the case, should I not treat the list
> as an RCU list?  There should never be any simultaneous
> reader/writers, just a single reader/writer or multiple readers.

Again, that seccomp_copy_all_filters() is called free standing (exported
to modules). Which to me means that it can be called by anyone at
anytime. There is no protection of this src list.

> 
> >>
> >> Or you can preallocate the new filters, call rcu_read_lock(), check if
> >> the number of old filters is the same or less, if more, call
> >> rcu_read_unlock, and try allocating more, and then call rcu_read_lock()
> >> again and repeat. Then just copy the filters to the preallocate ones.
> >> rcu_read_unlock() and then free any unused allocated filters.
> >>
> >> Maybe a bit messy, but not that bad.
> >
> > Sounds good.
> 
> I'd prefer a heavy-weight copy ;)
> 
> I think I'm a bit lost -- am I missing something obvious here?  I was
> hoping by using a swapped-in-seccomp_state-pointer, locking and
> consistency internal to the state objects would be a tad easier -
> though expensive.

Perhaps, but those free standing functions (the ones that are exported
to modules) seem like they can destroy state.

Your code would have been correct if you could call kzalloc under
rcu_read_lock() (which you can on some kernel configurations but not
all). The issue is that you need to pull out that allocation from the
rcu_read_lock() because rcu_read_lock assumes you can't preempt, and
that allocation can schedule out. The access to the filters must be done
under rcu_read_lock(), other than that, you're fine.

-- Steve



^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/7] seccomp_filter: Enable ftrace-based system call filtering
  2011-04-28 17:36           ` Frederic Weisbecker
@ 2011-04-28 18:21             ` Will Drewry
  0 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-04-28 18:21 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: linux-kernel, kees.cook, eparis, agl, mingo, jmorris, rostedt,
	Ingo Molnar, Andrew Morton, Tejun Heo, Michal Marek,
	Oleg Nesterov, Roland McGrath, Peter Zijlstra, Jiri Slaby,
	David Howells, Serge E. Hallyn

On Thu, Apr 28, 2011 at 12:36 PM, Frederic Weisbecker
<fweisbec@gmail.com> wrote:
> On Thu, Apr 28, 2011 at 11:48:47AM -0500, Will Drewry wrote:
>> On Thu, Apr 28, 2011 at 11:13 AM, Frederic Weisbecker
>> <fweisbec@gmail.com> wrote:
>> > On Thu, Apr 28, 2011 at 10:29:11AM -0500, Will Drewry wrote:
>> >> On Thu, Apr 28, 2011 at 10:12 AM, Frederic Weisbecker
>> >> <fweisbec@gmail.com> wrote:
>> >> > Instead of having such multiline filter definition with syscall
>> >> > names prepended, it would be nicer to make the parsing simpler.
>> >> >
>> >> > You could have either:
>> >> >
>> >> >        prctl(PR_SET_SECCOMP, mode);
>> >> >        /* Works only if we are in mode 2 */
>> >> >        prctl(PR_SET_SECCOMP_FILTER, syscall_nr, filter);
>> >>
>> >> It'd need to be syscall_name instead of syscall_nr.  Otherwise we're
>> >> right back to where Adam's patch was 2+ years ago :)  Using the event
>> >> names from the syscalls infrastructure means the consumer of the
>> >> interface doesn't need to be confident of the syscall number.
>> >
>> > Is it really a problem?
>>
>> I'd prefer using numbers myself since it gives more flexibility around
>> filtering entries that haven't yet made it into ftrace syscalls hooks.
>> However, I think there's some complexity in ensuring it matches the
>> host kernel.  (It would also allow compat_task support which doesn't
>> seem to be possible now.)
>
> You'd have some problems with compat and syscall naming, I think some
> of them are not the same between compat architectures.
>
> Note that your new prctl() request is turning syscall handler names (in-kernel API)
> into a user ABI. We won't be able to change the name of those kernel handlers
> after that. I'm not sure that's desired.
>
> OTOH, syscall numbers are a user ABI,

Makes sense.  I don't mind going the syscall nr route and using a
library for name resolution in userspace, but other users could just
include the libc provided names.  If it doesn't work, the SIGKILL will
make it apparent pretty quickly :p   I don't think it would be a
problem and it makes the kernel side even simpler again.

>
>
>>
>> > There are libraries that can resolve that. Of course I can't recall their name.
>>
>> asm-generic/unistd.h will provide the mapping, if available. It would
>> mean that any helper libraries would need to be matched to the locally
>> running kernel.
>
> Yeah. But syscall numbers don't move inside a given architecture (could they?)
> Or if so then you'd need a dynamic library.

Works for me.

>
>> >> > or:
>> >> >        /*
>> >> >         * If mode == 2, set the filter to syscall_nr
>> >> >         * Recall this for each syscall that need a filter.
>> >> >         * If a filter was previously set on the targeted syscall,
>> >> >         * it will be overwritten.
>> >> >         */
>> >> >        prctl(PR_SET_SECCOMP, mode, syscall_nr, filter);
>> >> >
>> >> > One can erase a previous filter by setting the new filter "1".
>> >> >
>> >> > Also, instead of having a bitmap of syscall to accept. You could
>> >> > simply set "0" as a filter to those you want to deactivate:
>> >> >
>> >> > prctl(PR_SET_SECCOMP, 2, 1, 0); <- deactivate the syscall_nr 1
>> >> >
>> >> > Hm?
>> >>
>> >> I like the simplicity in not needing to parse anything extra, but it
>> >> does add the need for extra state - either a bit or a new field - to
>> >> represent "enabled/enforcing".
>> >
>> > And by the way I'm really puzzled about these. I don't understand
>> > well why we need this.
>>
>> Right now, if you are in seccomp mode !=0 , your system calls will be
>> hooked (if TIF_SECCOMP is set too). The current patch begins filtering
>> immediately.  And once filtering is enabled, the process is not
>> allowed to expand its access to the kernel system calls - only shrink
>> it.  An "enabled" bit would be needed to tell it that even though it
>> is running in mode=2, it should/shouldn't kill the process for system
>> call filter violations and it should allow filters to be added, not
>> just dropped.  It's why the current patch does it in one step: set the
>> filters and the mode.  While an enabled bit could be hidden behind
>> whether TIF_SECCOMP is applied, I'm not sure if that would just be
>> confusing.  There'd still be a need to APPLY_FILTERS to get the
>> TIF_SECCOMP flag set to start hooking the system calls.
>
> It seems the filter is only ever needed for the childs, right?
> And moreover when they exec. So Steve's suggestion to apply
> the filters only after the next exec makes sense.
>
> That would solve the problem of both ON_NEXT_SYSCALL and APPLY_FILTER.
>
> I may be missing something obvious though. Do you see a limitation there?
>
>>
>> > As for the enable_on_next_syscall. The documentation says
>> > it's useful if you want the filter to only apply to the
>> > child. So if fork immediately follows, you will be able to fork
>> > but if the child doesn't have the right to exec, it won't be able
>> > to do so. Same for the mmap() that involves...
>> >
>> > So I'm a bit confused about that.
>>
>> Here's an example:
>>   http://static.dataspill.org/seccomp_filter/v1/seccomp_launcher.c
>>
>> You'd use it by calling fork() ... prctl(..., ON_NEXT_SYSCALL) then
>> execve since you'll already have fork()d.  It seems to work, but maybe
>> I'm missing something :)
>
> Nope, I see.
>
>>
>> > But yeah if that's really needed, it looks to me better to
>> > reduce the parsing and cut it that way:
>> >
>> >        prctl(PR_SET_SECCOMP, 2, syscall_name_or_nr, filter);
>> >        prctl(PR_SECCOMP_APPLY_FILTERS, enable_on_next_syscall?)
>> >
>> > or something...
>>
>> Cool - if there are any other flags, it could be something like:
>>   prctl(PR_SECCOMP_SET_FLAGS, SECCOMP_ENFORCE|SECCOMP_DELAYED_ENFORCEMENT);
>>
>> But only if there are other flags worth considering. I had a few
>> others in mind, but now I've completely forgotten :/
>

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 5/7] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-04-28 15:46   ` Randy Dunlap
@ 2011-04-28 18:23     ` Will Drewry
  0 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-04-28 18:23 UTC (permalink / raw)
  To: Randy Dunlap
  Cc: linux-kernel, kees.cook, eparis, agl, mingo, jmorris, rostedt

On Thu, Apr 28, 2011 at 10:46 AM, Randy Dunlap <rdunlap@xenotime.net> wrote:
> On Wed, 27 Apr 2011 22:08:49 -0500 Will Drewry wrote:
>
>> Adds a text file covering what CONFIG_SECCOMP_FILTER is, how it is
>> implemented presently, and what it may be used for.  In addition,
>> the limitations and caveats of the proposed implementation are
>> included.
>>
>> Signed-off-by: Will Drewry <wad@chromium.org>
>> ---
>>  Documentation/trace/seccomp_filter.txt |   75 ++++++++++++++++++++++++++++++++
>>  1 files changed, 75 insertions(+), 0 deletions(-)
>>  create mode 100644 Documentation/trace/seccomp_filter.txt
>>
>> diff --git a/Documentation/trace/seccomp_filter.txt b/Documentation/trace/seccomp_filter.txt
>> new file mode 100644
>> index 0000000..6a0fd33
>> --- /dev/null
>> +++ b/Documentation/trace/seccomp_filter.txt
>> @@ -0,0 +1,75 @@
>> +             Seccomp filtering
>> +             =================
>> +
>> +Introduction
>> +------------
>> +
>> +A large number of system calls are exposed to every userland process
>> +with many of them going unused for the entire lifetime of the
>> +application.  As system calls change and mature, bugs are found and
>> +quashed.  A certain subset of userland applications benefit by having
>> +a reduce set of available system calls.  The reduced set reduces the
>
>     reduced
>
>> +total kernel surface exposed to the application.  System call filtering
>> +is meant for use with those applications.
>> +
>> +The implementation currently leverages both the existing seccomp
>> +infrastructure and the kernel tracing infrastructure.  By centralizing
>> +hooks for attack surface reduction in seccomp, it is possible to assure
>> +attention to security that is less relevant in normal ftrace scenarios,
>> +such as time of check, time of use attacks.  However, ftrace provides a
>> +rich, human-friendly environment for specifying system calls by name and
>> +expected arguments.  (As such, this requires FTRACE_SYSCALLS.)
>> +
>> +
>> +What it isn't
>> +-------------
>> +
>> +System call filtering isn't a sandbox.  It provides a clearly defined
>> +mechanism for minimizing the exposed kernel surface.  Beyond that, policy for
>> +logical behavior and information flow should be managed with an LSM of your
>> +choosing.
>> +
>> +
>> +Usage
>> +-----
>> +
>> +An additional seccomp mode is exposed through mode '2'.  This mode
>> +depends on CONFIG_SECCOMP_FILTER which in turn depends on
>> +CONFIG_FTRACE_SYSCALLS.
>> +
>> +A collection of filters may be supplied via prctl, and the current set of
>> +filters is exposed in /proc/<pid>/seccomp_filter.
>> +
>> +For instance,
>> +  const char filters[] =
>> +    "sys_read: (fd == 1) || (fd == 2)\n"
>> +    "sys_write: (fd == 0)\n"
>> +    "sys_exit: 1\n"
>> +    "sys_exit_group: 1\n"
>> +    "on_next_syscall: 1";
>> +  prctl(PR_SET_SECCOMP, 2, filters);
>> +
>> +This will setup system call filters for read, write, and exit where reading can
>> +be done only from fds 1 and 2 and writing to fd 0.  The "on_next_syscall" directive tells
>> +seccomp to not enforce the ruleset until after the next system call is run.  This allows
>> +for launchers to apply system call filters to a binary before executing it.
>> +
>> +Once enabled, the access may only be reduced.  For example, a set of filters may be:
>> +
>> +  sys_read: 1
>> +  sys_write: 1
>> +  sys_mmap: 1
>> +  sys_prctl: 1
>> +
>> +Then it may call the following to drop mmap access:
>> +  prctl(PR_SET_SECCOMP, 2, "sys_mmap: 0");
>> +
>> +
>> +Caveats
>> +-------
>> +
>> +The system call names come from ftrace events.  At present, many system
>> +calls are not hooked - such as x86's ptregs wrapped system calls.
>> +
>> +In addition compat_task()s will not be supported until a sys32s begin
>> +being hooked.
>
> Last sentence is hard to read IMO:
> a. what are compat_task()s?
> b. what is a sys32s begin?
> c. awkward wording, maybe change to:   until a sys32s begin has been hooked.

I'll clean it up and try again.  I believe the other thread discussing
the interface will change this last sentence anyway, so once it
settles, I'll update this patch to reflect the new reality.

thanks!

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/7] seccomp_filter: Enable ftrace-based system call filtering
  2011-04-28 18:21           ` Steven Rostedt
@ 2011-04-28 18:34             ` Will Drewry
  2011-04-28 18:54               ` Serge E. Hallyn
  2011-04-28 19:06               ` [PATCH 3/7] seccomp_filter: " Steven Rostedt
  0 siblings, 2 replies; 406+ messages in thread
From: Will Drewry @ 2011-04-28 18:34 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Serge E. Hallyn, linux-kernel, kees.cook, eparis, agl, mingo,
	jmorris, Frederic Weisbecker, Ingo Molnar, Andrew Morton,
	Tejun Heo, Michal Marek, Oleg Nesterov, Roland McGrath,
	Peter Zijlstra, Jiri Slaby, David Howells

On Thu, Apr 28, 2011 at 1:21 PM, Steven Rostedt <rostedt@goodmis.org> wrote:
> On Thu, 2011-04-28 at 13:01 -0500, Will Drewry wrote:
>
>> Good to know! My question (below) is if I should even be using an RCU
>> guard at all. I may have been a bit too overzealous.
>>
>> >> >
>> >> > I actually thought you were going to be more extreme about the seccomp
>> >> > state than you are:  I thought you were going to tie a filter list to
>> >> > seccomp state.  So adding or removing a filter would have required
>> >> > duping the seccomp state, duping all the filters, making the change in
>> >> > the copy, and then swapping the new state into place.  Slow in the
>> >> > hopefully rare update case, but safe.
>>
>> Hrm, I think I'm confused now!  This is exactly what I *thought* the
>> code was doing.
>
> I guess my thought by looking at the code is the call_rcu() to free the
> filters in drop_matching_filters().
>
> Also, you have seccomp_drop_all_filters() as a standalone function that
> is even exported to modules.
>
> This to me, seems that the filters can disappear at any time. Because
> the freeing is done with a call_rcu() the access to the filters needs a
> ref count.
>
>>
>> At present, seccomp_state can be shared across predecessor/ancestor
>> relationships using refcounting in fork.c  (get/put). However, the
>> only way to change a given seccomp_state or its filters is either
>> through the one-bit on_next_syscall change or through
>> prctl_set_seccomp.  In prctl_set_seccomp, it does:
>> state = (orig_state ? seccomp_state_dup(orig_state) :
>>                               seccomp_state_new());
>> operates on the new state and then rcu_assign_pointer()s it to the
>> task.  I didn't intentionally provide any way to drop filters from an
>> existing state object nor change the filtered syscalls on an in-use
>> object.  That _dup call should hit the improperly rcu_locked
>> copy_all_filters returning duplicates of the original filters by
>> reparsing the filter_string.
>>
>> Did I accidentally provide a means to mutate a state object or filter
>> list without dup()ing? :/
>
> That seccomp_drop_all_filters() looks like you can.
>
>>
>> >> > You don't have to do that, but then I'm pretty sure you'll need to add
>> >> > reference counts to each filter and use rcu cycles to keep a reader from
>> >> > having the filter disappear mid-read.
>>
>> Right now, I don't think it is possible for seccomp_copy_all_filters()
>> to be called with a src list that changes since every change is
>> guarded by a seccomp_state_dup(). If that's not true, then I violated
>> my own invariant :/  If that is the case, should I not treat the list
>> as an RCU list?  There should never be any simultaneous
>> reader/writers, just a single reader/writer or multiple readers.
>
> Again, that seccomp_copy_all_filters() is called free standing (exported
> to modules). Which to me means that it can be called by anyone at
> anytime. There is no protection of this src list.
>
>>
>> >>
>> >> Or you can preallocate the new filters, call rcu_read_lock(), check if
>> >> the number of old filters is the same or less, if more, call
>> >> rcu_read_unlock, and try allocating more, and then call rcu_read_lock()
>> >> again and repeat. Then just copy the filters to the preallocate ones.
>> >> rcu_read_unlock() and then free any unused allocated filters.
>> >>
>> >> Maybe a bit messy, but not that bad.
>> >
>> > Sounds good.
>>
>> I'd prefer a heavy-weight copy ;)
>>
>> I think I'm a bit lost -- am I missing something obvious here?  I was
>> hoping by using a swapped-in-seccomp_state-pointer, locking and
>> consistency internal to the state objects would be a tad easier -
>> though expensive.
>
> Perhaps, but those free standing functions (the ones that are exported
> to modules) seem like they can destroy state.

My intent was to make them available for use by seccomp.c during state
teardown/dup.  I don't think there's benefit to exposing them outside
of that.  Would dropping the export, and adding a local seccomp.h
with the shared functions in them resolve that more cleanly?

> Your code would have been correct if you could call kzalloc under
> rcu_read_lock() (which you can on some kernel configurations but not
> all). The issue is that you need to pull out that allocation from the
> rcu_read_lock() because rcu_read_lock assumes you can't preempt, and
> that allocation can schedule out. The access to the filters must be done
> under rcu_read_lock(), other than that, you're fine.

That makes sense.  I think I'd prefer to not share those functions
rather than guard the list just in case a future consumer of the
interface comes along.  Would that make sense to you?  Since I don't
see any other users right now other than seccomp.c, it might make
sense to tackle the impact when an actual need arises.

I'll go whichever way pointed on this, though.
thanks again,
will

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 5/7] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-04-28 14:56     ` Eric Paris
@ 2011-04-28 18:37       ` Will Drewry
  2011-04-29 13:18         ` Frederic Weisbecker
  0 siblings, 1 reply; 406+ messages in thread
From: Will Drewry @ 2011-04-28 18:37 UTC (permalink / raw)
  To: Eric Paris
  Cc: Ingo Molnar, linux-kernel, kees.cook, agl, jmorris, rostedt,
	Randy Dunlap, Linus Torvalds, Andrew Morton, Tom Zanussi,
	Frédéric Weisbecker, Arnaldo Carvalho de Melo,
	Peter Zijlstra, Thomas Gleixner

On Thu, Apr 28, 2011 at 9:56 AM, Eric Paris <eparis@redhat.com> wrote:
> On Thu, 2011-04-28 at 09:06 +0200, Ingo Molnar wrote:
>> * Will Drewry <wad@chromium.org> wrote:
>>
>> > +A collection of filters may be supplied via prctl, and the current set of
>> > +filters is exposed in /proc/<pid>/seccomp_filter.
>> > +
>> > +For instance,
>> > +  const char filters[] =
>> > +    "sys_read: (fd == 1) || (fd == 2)\n"
>> > +    "sys_write: (fd == 0)\n"
>> > +    "sys_exit: 1\n"
>> > +    "sys_exit_group: 1\n"
>> > +    "on_next_syscall: 1";
>> > +  prctl(PR_SET_SECCOMP, 2, filters);
>> > +
>> > +This will setup system call filters for read, write, and exit where reading can
>> > +be done only from fds 1 and 2 and writing to fd 0.  The "on_next_syscall" directive tells
>> > +seccomp to not enforce the ruleset until after the next system call is run.  This allows
>> > +for launchers to apply system call filters to a binary before executing it.
>> > +
>> > +Once enabled, the access may only be reduced.  For example, a set of filters may be:
>> > +
>> > +  sys_read: 1
>> > +  sys_write: 1
>> > +  sys_mmap: 1
>> > +  sys_prctl: 1
>> > +
>> > +Then it may call the following to drop mmap access:
>> > +  prctl(PR_SET_SECCOMP, 2, "sys_mmap: 0");
>>
>> Ok, color me thoroughly impressed
>
> Me too!
>
>> I've Cc:-ed Linus and Andrew: are you guys opposed to such flexible, dynamic
>> filters conceptually? I think we should really think hard about the actual ABI
>> as this could easily spread to more applications than Chrome/Chromium.

Would it make sense to start, as Frederic has pointed out, by using
the existing ABI - system call numbers - and not system call names?
We could leave name resolution to userspace as it is for all other
system call consumers now.  It might leave the interface for this
support looking more like:
  prctl(PR_SET_SECCOMP, 2, _NR_mmap, "fd == 1");
  prctl(PR_SET_SECCOMP_FILTER_APPLY, now|on_exec);

which may be less of a dramatic ABI change to start with.

> I'll definitely port QEMU to use this new interface rather than my more
> rigid flexible (haha "rigid flexible") seccomp.  I'll see if I run into
> any issues with this ABI in that porting...

Great - also let me know if you have a preference on the interface
Frederic proposed, as it might reduce the parsing footprint and
overall kernel-side complexity but add a little bit more burden on the
userspace side.

>> Btw., i also think that such an approach is actually the sane(r) design to
>> implement security modules: using such filters is far more flexible than the
>> typical LSM approach of privileged user-space uploading various nasty objects
>> into kernel space and implementing silly (and limited and intrusive) hooks
>> there, like SElinux and the other security modules do.
>
> Then you are wrong.  There's no question that this interface can provide
> great extensions to the current discretionary functionality provided by
> legacy security controls but if you actually want to mediate what tasks
> can do to other tasks or can do to arbitrary objects on the system this
> doesn't cut it.  Every system call that takes or uses a structure as an
> argument or that uses copy_from_user (for something other than just
> unparsed data) is uncontrollable.

I think it'd take a fair amount of additional work to turn pure system
call filtering into a robust policy engine (like systrace).  It seems
to me that right now, it would just be a great addition to the
existing LSM model by providing infrastructure the LSMs can use with
their higher level logic.  (Plumbing in all the bits to understand the
system call arguments completely and avoid time-of-check-time-of-use
attacks would be a sizable undertaking - and that's without making
sure the existing LSMs could live happily on top of it .)

> This approach is great and with careful coding of userspace apps can be
> made very useful in constraining those apps, but a replacement for
> mandatory access control it is not.
>
>> This approach also has the ability to become recursive (gets inherited by child
>> tasks, which could add their own filters) and unprivileged - unlike LSMs.
>
> LSMs have that ability.  There's nothing to prevent a module loading
> service to allow unpriv applications to further constrain themselves.
> It's just the difference between DAC and MAC.  You are clearly a DAC guy,
> and there is no question this change is great in that mindset,  but you
> don't seem to understand either the flexibility of the LSM or the
> purpose of some of the modules implemented on top of the LSM.
>
>> I like this *a lot* more than any security sandboxing approach i've seen
>> before.
>
> I like this *a lot*.  It will be a HUGE addition to the security
> sandboxing approaches I've seen before.

Thanks!
will

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/7] seccomp_filter: Enable ftrace-based system call filtering
  2011-04-28 18:01         ` Will Drewry
  2011-04-28 18:21           ` Steven Rostedt
@ 2011-04-28 18:51           ` Serge E. Hallyn
  1 sibling, 0 replies; 406+ messages in thread
From: Serge E. Hallyn @ 2011-04-28 18:51 UTC (permalink / raw)
  To: Will Drewry
  Cc: Steven Rostedt, linux-kernel, kees.cook, eparis, agl, mingo,
	jmorris, Frederic Weisbecker, Ingo Molnar, Andrew Morton,
	Tejun Heo, Michal Marek, Oleg Nesterov, Roland McGrath,
	Peter Zijlstra, Jiri Slaby, David Howells

Quoting Will Drewry (wad@chromium.org):
> On Thu, Apr 28, 2011 at 12:39 PM, Serge E. Hallyn <serge@hallyn.com> wrote:
> > Quoting Steven Rostedt (rostedt@goodmis.org):
> >> On Thu, 2011-04-28 at 11:55 -0500, Serge E. Hallyn wrote:
> >> > I actually thought you were going to be more extreme about the seccomp
> >> > state than you are:  I thought you were going to tie a filter list to
> >> > seccomp state.  So adding or removing a filter would have required
> >> > duping the seccomp state, duping all the filters, making the change in
> >> > the copy, and then swapping the new state into place.  Slow in the
> >> > hopefully rare update case, but safe.
> 
> Hrm, I think I'm confused now!  This is exactly what I *thought* the
> code was doing.
> 
> At present, seccomp_state can be shared across predecessor/ancestor
> relationships using refcounting in fork.c  (get/put). However, the
> only way to change a given seccomp_state or its filters is either
> through the one-bit on_next_syscall change or through
> prctl_set_seccomp.  In prctl_set_seccomp, it does:
> state = (orig_state ? seccomp_state_dup(orig_state) :
>                               seccomp_state_new());
> operates on the new state and then rcu_assign_pointer()s it to the
> task.  I didn't intentionally provide any way to drop filters from an
> existing state object nor change the filtered syscalls on an in-use
> object.  That _dup call should hit the improperly rcu_locked
> copy_all_filters returning duplicates of the original filters by
> reparsing the filter_string.
> 
> Did I accidentally provide a means to mutate a state object or filter
> list without dup()ing? :/

Ah - probably not, I probably misread your intent for the helpers.

So in that case you don't need to rcu-protect the filters at all
when copying.  You just need to inc the seccomp struct refcount
under rcu lock, to make sure it all stays put.  Drop rcu lock, take
your time copying, then when you dec the refcount the whole seccomp
struct either gets freed or not.

Excellent.

-serge

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/7] seccomp_filter: Enable ftrace-based system call filtering
  2011-04-28 18:34             ` Will Drewry
@ 2011-04-28 18:54               ` Serge E. Hallyn
  2011-04-28 19:07                 ` Steven Rostedt
  2011-04-28 19:06               ` [PATCH 3/7] seccomp_filter: " Steven Rostedt
  1 sibling, 1 reply; 406+ messages in thread
From: Serge E. Hallyn @ 2011-04-28 18:54 UTC (permalink / raw)
  To: Will Drewry
  Cc: Steven Rostedt, linux-kernel, kees.cook, eparis, agl, mingo,
	jmorris, Frederic Weisbecker, Ingo Molnar, Andrew Morton,
	Tejun Heo, Michal Marek, Oleg Nesterov, Roland McGrath,
	Peter Zijlstra, Jiri Slaby, David Howells

Quoting Will Drewry (wad@chromium.org):
> My intent was to make them available for use by seccomp.c during state
> teardown/dup.  I don't think there's benefit to exposing them outside
> of that.  Would dropping the export, and adding a local seccomp.h
> with the shared functions in them resolve that more cleanly?

And add a clear comment explaining :)

> > Your code would have been correct if you could call kzalloc under
> > rcu_read_lock() (which you can on some kernel configurations but not
> > all). The issue is that you need to pull out that allocation from the
> > rcu_read_lock() because rcu_read_lock assumes you can't preempt, and
> > that allocation can schedule out. The access to the filters must be done
> > under rcu_read_lock(), other than that, you're fine.
> 
> That makes sense.  I think I'd prefer to not share those functions
> rather than guard the list just in case a future consumer of the
> interface comes along.  Would that make sense to you?  Since I don't
> see any other users right now other than seccomp.c, it might make
> sense to tackle the impact when an actual need arises.
> 
> I'll go whichever way pointed on this, though.

Complicating the locking for nonexistent users doesn't seem like the
right way.

-serge

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/7] seccomp_filter: Enable ftrace-based system call filtering
  2011-04-28 18:34             ` Will Drewry
  2011-04-28 18:54               ` Serge E. Hallyn
@ 2011-04-28 19:06               ` Steven Rostedt
  1 sibling, 0 replies; 406+ messages in thread
From: Steven Rostedt @ 2011-04-28 19:06 UTC (permalink / raw)
  To: Will Drewry
  Cc: Serge E. Hallyn, linux-kernel, kees.cook, eparis, agl, mingo,
	jmorris, Frederic Weisbecker, Ingo Molnar, Andrew Morton,
	Tejun Heo, Michal Marek, Oleg Nesterov, Roland McGrath,
	Peter Zijlstra, Jiri Slaby, David Howells

On Thu, 2011-04-28 at 13:34 -0500, Will Drewry wrote:

> > Perhaps, but those free standing functions (the ones that are exported
> > to modules) seem like they can destroy state.
> 
> My intent was to make them available for use by seccomp.c during state
> teardown/dup.  I don't think there's benefit to exposing them outside
> of that.  Would dropping the export, and adding a local seccomp.h
> with the shared functions in them resolve that more cleanly?

Yes, having a local seccomp.h in the same directory as seccomp.c is a
way of saying "these are internal functions, don't use them directly".
Adding a function to EXPORT_SYMBOL*() is saying "here you go, have a
field day with it!".


> 
> > Your code would have been correct if you could call kzalloc under
> > rcu_read_lock() (which you can on some kernel configurations but not
> > all). The issue is that you need to pull out that allocation from the
> > rcu_read_lock() because rcu_read_lock assumes you can't preempt, and
> > that allocation can schedule out. The access to the filters must be done
> > under rcu_read_lock(), other than that, you're fine.
> 
> That makes sense.  I think I'd prefer to not share those functions
> rather than guard the list just in case a future consumer of the
> interface comes along.  Would that make sense to you?  Since I don't
> see any other users right now other than seccomp.c, it might make
> sense to tackle the impact when an actual need arises.
> 
> I'll go whichever way pointed on this, though.
> thanks again,

I'm fine either way. If you make them local internal functions, then you
don't need the locking if they are safe in their current context.

-- Steve



^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/7] seccomp_filter: Enable ftrace-based system call filtering
  2011-04-28 18:54               ` Serge E. Hallyn
@ 2011-04-28 19:07                 ` Steven Rostedt
  2011-05-12  3:02                     ` Will Drewry
  0 siblings, 1 reply; 406+ messages in thread
From: Steven Rostedt @ 2011-04-28 19:07 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: Will Drewry, linux-kernel, kees.cook, eparis, agl, mingo,
	jmorris, Frederic Weisbecker, Ingo Molnar, Andrew Morton,
	Tejun Heo, Michal Marek, Oleg Nesterov, Roland McGrath,
	Peter Zijlstra, Jiri Slaby, David Howells

On Thu, 2011-04-28 at 13:54 -0500, Serge E. Hallyn wrote:
> Quoting Will Drewry (wad@chromium.org):
> > My intent was to make them available for use by seccomp.c during state
> > teardown/dup.  I don't think there's benefit to exposing them outside
> > of that.  Would dropping the export, and adding a local seccomp.h
> > with the shared functions in them resolve that more cleanly?
> 
> And add a clear comment explaining :)

Yes that always helps.

> 
> > > Your code would have been correct if you could call kzalloc under
> > > rcu_read_lock() (which you can on some kernel configurations but not
> > > all). The issue is that you need to pull out that allocation from the
> > > rcu_read_lock() because rcu_read_lock assumes you can't preempt, and
> > > that allocation can schedule out. The access to the filters must be done
> > > under rcu_read_lock(), other than that, you're fine.
> > 
> > That makes sense.  I think I'd prefer to not share those functions
> > rather than guard the list just in case a future consumer of the
> > interface comes along.  Would that make sense to you?  Since I don't
> > see any other users right now other than seccomp.c, it might make
> > sense to tackle the impact when an actual need arises.
> > 
> > I'll go whichever way pointed on this, though.
> 
> Complicating the locking for nonexistent users doesn't seem like the
> right way.

I agree.

-- Steve



^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 4/7] seccomp_filter: add process state reporting
  2011-04-28  3:24     ` Will Drewry
  2011-04-28  3:40       ` Al Viro
@ 2011-04-28 22:54       ` James Morris
  2011-05-02 10:08         ` Will Drewry
  1 sibling, 1 reply; 406+ messages in thread
From: James Morris @ 2011-04-28 22:54 UTC (permalink / raw)
  To: Will Drewry
  Cc: KOSAKI Motohiro, linux-kernel, kees.cook, eparis, agl, mingo,
	rostedt, Andrew Morton, Alexey Dobriyan, David Howells, Al Viro,
	David Rientjes, Stephen Wilson

On Wed, 27 Apr 2011, Will Drewry wrote:

> > Can't you make individual seccomp specific file?
> 
> Definitely.  Would it make sense to have /proc/<pid>/seccomp and
> /proc/<pid>/seccomp_filter?

Do you need the separate seccomp file vs. just checking what's in 
seccomp_filter ?


-- 
James Morris
<jmorris@namei.org>

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 5/7] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-04-28 18:37       ` Will Drewry
@ 2011-04-29 13:18         ` Frederic Weisbecker
  2011-04-29 16:13           ` Will Drewry
  0 siblings, 1 reply; 406+ messages in thread
From: Frederic Weisbecker @ 2011-04-29 13:18 UTC (permalink / raw)
  To: Will Drewry
  Cc: Eric Paris, Ingo Molnar, linux-kernel, kees.cook, agl, jmorris,
	rostedt, Randy Dunlap, Linus Torvalds, Andrew Morton,
	Tom Zanussi, Arnaldo Carvalho de Melo, Peter Zijlstra,
	Thomas Gleixner

On Thu, Apr 28, 2011 at 01:37:33PM -0500, Will Drewry wrote:
> On Thu, Apr 28, 2011 at 9:56 AM, Eric Paris <eparis@redhat.com> wrote:
> > On Thu, 2011-04-28 at 09:06 +0200, Ingo Molnar wrote:
> >> * Will Drewry <wad@chromium.org> wrote:
> >>
> >> > +A collection of filters may be supplied via prctl, and the current set of
> >> > +filters is exposed in /proc/<pid>/seccomp_filter.
> >> > +
> >> > +For instance,
> >> > +  const char filters[] =
> >> > +    "sys_read: (fd == 1) || (fd == 2)\n"
> >> > +    "sys_write: (fd == 0)\n"
> >> > +    "sys_exit: 1\n"
> >> > +    "sys_exit_group: 1\n"
> >> > +    "on_next_syscall: 1";
> >> > +  prctl(PR_SET_SECCOMP, 2, filters);
> >> > +
> >> > +This will setup system call filters for read, write, and exit where reading can
> >> > +be done only from fds 1 and 2 and writing to fd 0.  The "on_next_syscall" directive tells
> >> > +seccomp to not enforce the ruleset until after the next system call is run.  This allows
> >> > +for launchers to apply system call filters to a binary before executing it.
> >> > +
> >> > +Once enabled, the access may only be reduced.  For example, a set of filters may be:
> >> > +
> >> > +  sys_read: 1
> >> > +  sys_write: 1
> >> > +  sys_mmap: 1
> >> > +  sys_prctl: 1
> >> > +
> >> > +Then it may call the following to drop mmap access:
> >> > +  prctl(PR_SET_SECCOMP, 2, "sys_mmap: 0");
> >>
> >> Ok, color me thoroughly impressed
> >
> > Me too!
> >
> >> I've Cc:-ed Linus and Andrew: are you guys opposed to such flexible, dynamic
> >> filters conceptually? I think we should really think hard about the actual ABI
> >> as this could easily spread to more applications than Chrome/Chromium.
> 
> Would it make sense to start, as Frederic has pointed out, by using
> the existing ABI - system call numbers - and not system call names?
> We could leave name resolution to userspace as it is for all other
> system call consumers now.  It might leave the interface for this
> support looking more like:
>   prctl(PR_SET_SECCOMP, 2, _NR_mmap, "fd == 1");
>   prctl(PR_SET_SECCOMP_FILTER_APPLY, now|on_exec);

PR_SET_SECCOMP_FILTER_APPLY seems only useful if you think there
are other cases than enable_on_exec that would be useful for these
filters.

We can think about a default enable on exec behaviour as Steve pointed
out.

But I have no idea if other cases may be desirable to apply these
filters.

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 5/7] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-04-29 13:18         ` Frederic Weisbecker
@ 2011-04-29 16:13           ` Will Drewry
  2011-05-03  1:29             ` Frederic Weisbecker
  0 siblings, 1 reply; 406+ messages in thread
From: Will Drewry @ 2011-04-29 16:13 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: Eric Paris, Ingo Molnar, linux-kernel, kees.cook, agl, jmorris,
	rostedt, Randy Dunlap, Linus Torvalds, Andrew Morton,
	Tom Zanussi, Arnaldo Carvalho de Melo, Peter Zijlstra,
	Thomas Gleixner

On Fri, Apr 29, 2011 at 8:18 AM, Frederic Weisbecker <fweisbec@gmail.com> wrote:
> On Thu, Apr 28, 2011 at 01:37:33PM -0500, Will Drewry wrote:
>> On Thu, Apr 28, 2011 at 9:56 AM, Eric Paris <eparis@redhat.com> wrote:
>> > On Thu, 2011-04-28 at 09:06 +0200, Ingo Molnar wrote:
>> >> * Will Drewry <wad@chromium.org> wrote:
>> >>
>> >> > +A collection of filters may be supplied via prctl, and the current set of
>> >> > +filters is exposed in /proc/<pid>/seccomp_filter.
>> >> > +
>> >> > +For instance,
>> >> > +  const char filters[] =
>> >> > +    "sys_read: (fd == 1) || (fd == 2)\n"
>> >> > +    "sys_write: (fd == 0)\n"
>> >> > +    "sys_exit: 1\n"
>> >> > +    "sys_exit_group: 1\n"
>> >> > +    "on_next_syscall: 1";
>> >> > +  prctl(PR_SET_SECCOMP, 2, filters);
>> >> > +
>> >> > +This will setup system call filters for read, write, and exit where reading can
>> >> > +be done only from fds 1 and 2 and writing to fd 0.  The "on_next_syscall" directive tells
>> >> > +seccomp to not enforce the ruleset until after the next system call is run.  This allows
>> >> > +for launchers to apply system call filters to a binary before executing it.
>> >> > +
>> >> > +Once enabled, the access may only be reduced.  For example, a set of filters may be:
>> >> > +
>> >> > +  sys_read: 1
>> >> > +  sys_write: 1
>> >> > +  sys_mmap: 1
>> >> > +  sys_prctl: 1
>> >> > +
>> >> > +Then it may call the following to drop mmap access:
>> >> > +  prctl(PR_SET_SECCOMP, 2, "sys_mmap: 0");
>> >>
>> >> Ok, color me thoroughly impressed
>> >
>> > Me too!
>> >
>> >> I've Cc:-ed Linus and Andrew: are you guys opposed to such flexible, dynamic
>> >> filters conceptually? I think we should really think hard about the actual ABI
>> >> as this could easily spread to more applications than Chrome/Chromium.
>>
>> Would it make sense to start, as Frederic has pointed out, by using
>> the existing ABI - system call numbers - and not system call names?
>> We could leave name resolution to userspace as it is for all other
>> system call consumers now.  It might leave the interface for this
>> support looking more like:
>>   prctl(PR_SET_SECCOMP, 2, _NR_mmap, "fd == 1");
>>   prctl(PR_SET_SECCOMP_FILTER_APPLY, now|on_exec);
>
> PR_SET_SECCOMP_FILTER_APPLY seems only useful if you think there
> are other cases than enable_on_exec that would be useful for these
> filters.
>
> We can think about a default enable on exec behaviour as Steve pointed
> out.
>
> But I have no idea if other cases may be desirable to apply these
> filters.

I nearly have all of the changes in, but I'm still updating my tests.
In general, I think having both on_exec and now is reasonable
because you can write a much tighter filter set if it is embedded in
the application.  E.g., it may load all its shared libraries, which
you allow, then lock itself down before touching untrusted content.
That said, if the default behavior is enable_on_exec, then you'd only
call PR_SET_SECCOMP_FILTER_APPLY when you want to apply _now_.  I like
that.

That said, I have a general interface question :)  Right now I have:
prctl(PR_SET_SECCOMP, 2, SECCOMP_FILTER_ADD, syscall_nr, filter_string);
prctl(PR_SET_SECCOMP, 2, SECCOMP_FILTER_DROP, syscall_nr,
filter_string_or_NULL);
prctl(PR_SET_SECCOMP, 2, SECCOMP_FILTER_APPLY, apply_flags);
  (I will change this to default to apply_on_exec and let FILTER_APPLY
make it apply _now_ exclusively. :)

This can easily be mapped to:
prctl(PR_SET_SECCOMP
       PR_SET_SECCOMP_FILTER_ADD
       PR_SET_SECCOMP_FILTER_DROP
       PR_SET_SECCOMP_FILTER_APPLY
if that'd be preferable (to keep it all in the prctl.h world).

Following along the suggestion of reducing custom parsing, it seemed
to make a lot of sense to make add and drop actions very explicit.
There is no guesswork so a system call filtered process will only be
able to perform DROP operations (if prctl is allowed) to reduce the
allowed system calls.  This also allows more fine grained flexibility
in addition to the in-kernel complexity reduction.  E.g.,
Process starts with
  __NR_read, "fd == 1"
  __NR_read, "fd == 2"
later it can call:
  prctl(PR_SET_SECCOMP, 2, SECCOMP_FILTER_DROP, __NR_read, "fd == 2");
to drop one of the filters without disabling "fd == 1" reading.  (Or
it could pass in NULL to drop all filters).

FWIW, I am also updating the Kconfig to be dependent on EXPERIMENTAL, as
it might make sense to get some use prior to considering it finalized
:)  I'm not sure if that is appropriate though.

Thanks! I'll try to post the v2s today once they're working properly :)
will

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 4/7] seccomp_filter: add process state reporting
  2011-04-28 22:54       ` James Morris
@ 2011-05-02 10:08         ` Will Drewry
  0 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-05-02 10:08 UTC (permalink / raw)
  To: James Morris
  Cc: KOSAKI Motohiro, linux-kernel, kees.cook, eparis, agl, mingo,
	rostedt, Andrew Morton, Alexey Dobriyan, David Howells, Al Viro,
	David Rientjes, Stephen Wilson, Frederic Weisbecker

On Thu, Apr 28, 2011 at 3:54 PM, James Morris <jmorris@namei.org> wrote:
> On Wed, 27 Apr 2011, Will Drewry wrote:
>
>> > Can't you make individual seccomp specific file?
>>
>> Definitely.  Would it make sense to have /proc/<pid>/seccomp and
>> /proc/<pid>/seccomp_filter?
>
> Do you need the separate seccomp file vs. just checking what's in
> seccomp_filter ?

Initially, I would've said yes, because I had modeled it such that any
entries in /proc/<pid>/seccomp_filter could be fed right back in as
filters.  However, as I've reworked it based on Frederic's and others'
feedback, I don't need to keep the separation - all the relevant info
can just be in seccomp_filter and no secondary file will be useful.

Thanks for pointing it out!  I'm tracking down what is, no doubt, a
dumb bug (still), but once I sort it, I'll repost the series with this
change included.

Cheers!
will

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 5/7] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-04-29 16:13           ` Will Drewry
@ 2011-05-03  1:29             ` Frederic Weisbecker
  2011-05-03  1:47               ` Frederic Weisbecker
  0 siblings, 1 reply; 406+ messages in thread
From: Frederic Weisbecker @ 2011-05-03  1:29 UTC (permalink / raw)
  To: Will Drewry
  Cc: Eric Paris, Ingo Molnar, linux-kernel, kees.cook, agl, jmorris,
	rostedt, Randy Dunlap, Linus Torvalds, Andrew Morton,
	Tom Zanussi, Arnaldo Carvalho de Melo, Peter Zijlstra,
	Thomas Gleixner

On Fri, Apr 29, 2011 at 11:13:44AM -0500, Will Drewry wrote:
> On Fri, Apr 29, 2011 at 8:18 AM, Frederic Weisbecker <fweisbec@gmail.com> wrote:
> > PR_SET_SECCOMP_FILTER_APPLY seems only useful if you think there
> > are other cases than enable_on_exec that would be useful for these
> > filters.
> >
> > We can think about a default enable on exec behaviour as Steve pointed
> > out.
> >
> > But I have no idea if other cases may be desirable to apply these
> > filters.
> 
> I nearly have all of the changes in, but I'm still updating my tests.
> In general, I think having both on_exec and now is reasonable is
> because you can write a much tighter filter set if it is embedded in
> the application.  E.g., it may load all its shared libraries, which
> you allow, then lock itself down before touching untrusted content.

Well, that makes sense.

> That said, if the default behavior is enable_on_exec, then you'd only
> call PR_SET_SECCOMP_FILTER_APPLY when you want to apply _now_.  I like
> that.

It could be the default behaviour, which could be overridden with
PR_SET_SECCOMP_FILTER_APPLY. However I'm wondering about that enable_on_exec.

Say you want to accept only stdin/stdout read/write, and you blocked
mmap, open, etc... How can ld load the app and mmap all its shared libraries?
The filters are going to be applied once the interpreter is launched. This
makes me wonder now about the general usability of this and also about
the relevance in a default enable on exec behaviour here.

> 
> That said, I have a general interface question :)  Right now I have:
> prctl(PR_SET_SECCOMP, 2, SECCOMP_FILTER_ADD, syscall_nr, filter_string);
> prctl(PR_SET_SECCOMP, 2, SECCOMP_FILTER_DROP, syscall_nr,
> filter_string_or_NULL);
> prctl(PR_SET_SECCOMP, 2, SECCOMP_FILTER_APPLY, apply_flags);
>   (I will change this to default to apply_on_exec and let FILTER_APPLY
> make it apply _now_ exclusively. :)
> 
> This can easily be mapped to:
> prctl(PR_SET_SECCOMP
>        PR_SET_SECOMP_FILTER_ADD
>        PR_SET_SECOMP_FILTER_DROP
>        PR_SET_SECOMP_FILTER_APPLY
> if that'd be preferable (to keep it all in the prctl.h world).
> 
> Following along the suggestion of reducing custom parsing, it seemed
> to make a lot of sense to make add and drop actions very explicit.
> There is no guesswork so a system call filtered process will only be
> able to perform DROP operations (if prctl is allowed) to reduce the
> allowed system calls.  This also allows more fine grained flexibility
> in addition to the in-kernel complexity reduction.  E.g.,
> Process starts with
>   __NR_read, "fd == 1"
>   __NR_read, "fd == 2"
> later it can call:
>   prctl(PR_SET_SECCOMP, 2, SECCOMP_FILTER_DROP, __NR_read, "fd == 2");
> to drop one of the filters without disabling "fd == 1" reading.  (Or
> it could pass in NULL to drop all filters).

Hm, but then you don't let the children be able to restrict further
what you allowed before.

Say I have foo(int a, int b), and I apply these filters:

	__NR_foo, "a == 1";
	__NR_foo, "a == 2";

This is basically "a == 1 || a == 2".

Now I apply the filters and I fork. How can the child
(or current task after the filter is applied) restrict
further by only allowing "b == 2", such that with the
inherited parent filters we have:

	"(a == 1 || a == 2) && b == 2"
	
So what you propose seems to me too limited. I'd rather have this:

SECCOMP_FILTER_SET = remove previous filter entirely and set a new one
SECCOMP_FILTER_GET = get the string of the current filter

The rule would be that you can only set a filter that is intersected
with the one that was previously applied.

It means that if you set filter A and you apply it. If you want to set
filter B thereafter, it must be:

	A && B

OTOH, as long as you haven't applied A, you can override it as you wish.
Like you can have "A || B" instead. Or you can remove it with "1". Of course
if a previous filter was applied before A, then your new filter must be
concatenated: "previous && (A || B)".

Right? And note in this scheme you can reproduce your DROP trick. If
"A || B" is the current filter applied, then you can restrict B by
doing: "(A || B) && A".

So the role of SECCOMP_FILTER_GET is to get the string that matches
the current applied filter.

The effect of this is infinite of course. If you apply A, then apply
B then you need A && B. If later you want to apply C, then you need
A && B && C, etc...

Does that look sane?

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 5/7] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-05-03  1:29             ` Frederic Weisbecker
@ 2011-05-03  1:47               ` Frederic Weisbecker
  2011-05-04  9:15                 ` Will Drewry
  2011-05-04 12:16                 ` Steven Rostedt
  0 siblings, 2 replies; 406+ messages in thread
From: Frederic Weisbecker @ 2011-05-03  1:47 UTC (permalink / raw)
  To: Will Drewry
  Cc: Eric Paris, Ingo Molnar, linux-kernel, kees.cook, agl, jmorris,
	rostedt, Randy Dunlap, Linus Torvalds, Andrew Morton,
	Tom Zanussi, Arnaldo Carvalho de Melo, Peter Zijlstra,
	Thomas Gleixner

2011/5/3 Frederic Weisbecker <fweisbec@gmail.com>:
> On Fri, Apr 29, 2011 at 11:13:44AM -0500, Will Drewry wrote:
>> That said, I have a general interface question :)  Right now I have:
>> prctl(PR_SET_SECCOMP, 2, SECCOMP_FILTER_ADD, syscall_nr, filter_string);
>> prctl(PR_SET_SECCOMP, 2, SECCOMP_FILTER_DROP, syscall_nr,
>> filter_string_or_NULL);
>> prctl(PR_SET_SECCOMP, 2, SECCOMP_FILTER_APPLY, apply_flags);
>>   (I will change this to default to apply_on_exec and let FILTER_APPLY
>> make it apply _now_ exclusively. :)
>>
>> This can easily be mapped to:
>> prctl(PR_SET_SECCOMP
>>        PR_SET_SECOMP_FILTER_ADD
>>        PR_SET_SECOMP_FILTER_DROP
>>        PR_SET_SECOMP_FILTER_APPLY
>> if that'd be preferable (to keep it all in the prctl.h world).
>>
>> Following along the suggestion of reducing custom parsing, it seemed
>> to make a lot of sense to make add and drop actions very explicit.
>> There is no guesswork so a system call filtered process will only be
>> able to perform DROP operations (if prctl is allowed) to reduce the
>> allowed system calls.  This also allows more fine grained flexibility
>> in addition to the in-kernel complexity reduction.  E.g.,
>> Process starts with
>>   __NR_read, "fd == 1"
>>   __NR_read, "fd == 2"
>> later it can call:
>>   prctl(PR_SET_SECCOMP, 2, SECCOMP_FILTER_DROP, __NR_read, "fd == 2");
>> to drop one of the filters without disabling "fd == 1" reading.  (Or
>> it could pass in NULL to drop all filters).
>
> Hm, but then you don't let the childs be able to restrict further
> what you allowed before.
>
> Say I have foo(int a, int b), and I apply these filters:
>
>        __NR_foo, "a == 1";
>        __NR_foo, "a == 2";
>
> This is basically "a == 1 || a == 2".
>
> Now I apply the filters and I fork. How can the child
> (or current task after the filter is applied) restrict
> further by only allowing "b == 2", such that with the
> inherited parent filters we have:
>
>        "(a == 1 || a == 2) && b == 2"
>
> So what you propose seems to me too limited. I'd rather have this:
>
> SECCOMP_FILTER_SET = remove previous filter entirely and set a new one
> SECCOMP_FILTER_GET = get the string of the current filter
>
> The rule would be that you can only set a filter that is intersected
> with the one that was previously applied.
>
> It means that if you set filter A and you apply it. If you want to set
> filter B thereafter, it must be:
>
>        A && B
>
> OTOH, as long as you haven't applied A, you can override it as you wish.
> Like you can have "A || B" instead. Or you can remove it with "1". Of course
> if a previous filter was applied before A, then your new filter must be
> concatenated: "previous && (A || B)".
>
> Right? And note in this scheme you can reproduce your DROP trick. If
> "A || B" is the current filter applied, then you can restrict B by
> doing: "(A || B) && A".
>
> So the role of SECCOMP_FILTER_GET is to get the string that matches
> the current applied filter.
>
> The effect of this is infinite of course. If you apply A, then apply
> B then you need A && B. If later you want to apply C, then you need
> A && B && C, etc...
>
> Does that look sane?
>

Even better: applying a filter would always automatically be an
intersection of the previous one.

If you do:

SECCOMP_FILTER_SET, __NR_foo, "a == 1 || a == 2"
SECCOMP_FILTER_APPLY
SECCOMP_FILTER_SET, __NR_foo, "b == 2"
SECCOMP_FILTER_APPLY
SECCOMP_FILTER_SET, __NR_foo, "c == 3"
SECCOMP_FILTER_APPLY

The end result is:

"(a == 1 || a == 2) && b == 2  && c == 3"

So that we don't push onto the kernel the burden of comparing the applied
expression with a new one that may or may not be enclosed in parentheses,
and other tricky cases like that. We simply append to the working one.

Ah and OTOH this:

SECCOMP_FILTER_SET, __NR_foo, "a == 1 || a == 2"
SECCOMP_FILTER_APPLY
SECCOMP_FILTER_SET, __NR_foo, "b == 2"
SECCOMP_FILTER_SET, __NR_foo, "c == 3"

has the following end result:

"(a == 1 || a == 2) && c == 3"

As long as you don't apply the filter, the temporary part is
overridden, but we still keep
the applied part.

Still sane? (or completely nuts?)

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/7] seccomp_filter: Enable ftrace-based system call filtering
  2011-04-28  3:08 ` [PATCH 3/7] seccomp_filter: Enable ftrace-based system call filtering Will Drewry
                     ` (4 preceding siblings ...)
  2011-04-28 16:55   ` Serge E. Hallyn
@ 2011-05-03  8:39   ` Avi Kivity
  5 siblings, 0 replies; 406+ messages in thread
From: Avi Kivity @ 2011-05-03  8:39 UTC (permalink / raw)
  To: Will Drewry
  Cc: linux-kernel, kees.cook, eparis, agl, mingo, jmorris, rostedt,
	Frederic Weisbecker, Ingo Molnar, Andrew Morton, Tejun Heo,
	Michal Marek, Oleg Nesterov, Peter Zijlstra, Jiri Slaby,
	David Howells, Serge E. Hallyn

On 04/28/2011 06:08 AM, Will Drewry wrote:
> This change adds a new seccomp mode based on the work by
> agl@chromium.org. This mode comes with a bitmask of NR_syscalls size and
> an optional linked list of seccomp_filter objects. When in mode 2, all
> system calls are first checked against the bitmask to determine if they
> are allowed or denied.  If allowed, the list of filters is checked for
> the given syscall number. If all filter predicates for the system call
> match or the system call was allowed without restriction, the process
> continues. Otherwise, it is killed and a KERN_INFO notification is
> posted.
>
> The filter language itself is provided by the ftrace filter engine.
> Related patches tweak to the perf filter trace and free allow the calls
> to be shared. Filters inherit their understanding of types and arguments
> for each system call from the CONFIG_FTRACE_SYSCALLS subsystem which
> predefines this information in syscall_metadata associated enter_event
> (and exit_event) structures.
>
> The result is that a process may reduce its available interfaces to
> the kernel through prctl() without knowing the appropriate system call
> number a priori and with the flexibility of filtering based on
> register-stored arguments.  (String checks suffer from TOCTOU issues and
> should be left to LSMs to provide policy for! Don't get greedy :)

This is potentially very useful for qemu/kvm.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 5/7] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-05-03  1:47               ` Frederic Weisbecker
@ 2011-05-04  9:15                 ` Will Drewry
  2011-05-04  9:29                   ` Will Drewry
  2011-05-04 17:52                   ` Frederic Weisbecker
  2011-05-04 12:16                 ` Steven Rostedt
  1 sibling, 2 replies; 406+ messages in thread
From: Will Drewry @ 2011-05-04  9:15 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: Eric Paris, Ingo Molnar, linux-kernel, kees.cook, agl, jmorris,
	rostedt, Randy Dunlap, Linus Torvalds, Andrew Morton,
	Tom Zanussi, Arnaldo Carvalho de Melo, Peter Zijlstra,
	Thomas Gleixner

On Mon, May 2, 2011 at 6:47 PM, Frederic Weisbecker <fweisbec@gmail.com> wrote:
> 2011/5/3 Frederic Weisbecker <fweisbec@gmail.com>:
>> On Fri, Apr 29, 2011 at 11:13:44AM -0500, Will Drewry wrote:
>>> That said, I have a general interface question :)  Right now I have:
>>> prctl(PR_SET_SECCOMP, 2, SECCOMP_FILTER_ADD, syscall_nr, filter_string);
>>> prctl(PR_SET_SECCOMP, 2, SECCOMP_FILTER_DROP, syscall_nr,
>>> filter_string_or_NULL);
>>> prctl(PR_SET_SECCOMP, 2, SECCOMP_FILTER_APPLY, apply_flags);
>>>   (I will change this to default to apply_on_exec and let FILTER_APPLY
>>> make it apply _now_ exclusively. :)
>>>
>>> This can easily be mapped to:
>>> prctl(PR_SET_SECCOMP
>>>        PR_SET_SECOMP_FILTER_ADD
>>>        PR_SET_SECOMP_FILTER_DROP
>>>        PR_SET_SECOMP_FILTER_APPLY
>>> if that'd be preferable (to keep it all in the prctl.h world).
>>>
>>> Following along the suggestion of reducing custom parsing, it seemed
>>> to make a lot of sense to make add and drop actions very explicit.
>>> There is no guesswork so a system call filtered process will only be
>>> able to perform DROP operations (if prctl is allowed) to reduce the
>>> allowed system calls.  This also allows more fine grained flexibility
>>> in addition to the in-kernel complexity reduction.  E.g.,
>>> Process starts with
>>>   __NR_read, "fd == 1"
>>>   __NR_read, "fd == 2"
>>> later it can call:
>>>   prctl(PR_SET_SECCOMP, 2, SECCOMP_FILTER_DROP, __NR_read, "fd == 2");
>>> to drop one of the filters without disabling "fd == 1" reading.  (Or
>>> it could pass in NULL to drop all filters).
>>
>> Hm, but then you don't let the childs be able to restrict further
>> what you allowed before.
>>
>> Say I have foo(int a, int b), and I apply these filters:
>>
>>        __NR_foo, "a == 1";
>>        __NR_foo, "a == 2";
>>
>> This is basically "a == 1 || a == 2".
>>
>> Now I apply the filters and I fork. How can the child
>> (or current task after the filter is applied) restrict
>> further by only allowing "b == 2", such that with the
>> inherited parent filters we have:
>>
>>        "(a == 1 || a == 2) && b == 2"
>>
>> So what you propose seems to me too limited. I'd rather have this:
>>
>> SECCOMP_FILTER_SET = remove previous filter entirely and set a new one
>> SECCOMP_FILTER_GET = get the string of the current filter
>>
>> The rule would be that you can only set a filter that is intersected
>> with the one that was previously applied.
>>
>> It means that if you set filter A and you apply it. If you want to set
>> filter B thereafter, it must be:
>>
>>        A && B
>>
>> OTOH, as long as you haven't applied A, you can override it as you wish.
>> Like you can have "A || B" instead. Or you can remove it with "1". Of course
>> if a previous filter was applied before A, then your new filter must be
>> concatenated: "previous && (A || B)".
>>
>> Right? And note in this scheme you can reproduce your DROP trick. If
>> "A || B" is the current filter applied, then you can restrict B by
>> doing: "(A || B) && A".
>>
>> So the role of SECCOMP_FILTER_GET is to get the string that matches
>> the current applied filter.
>>
>> The effect of this is infinite of course. If you apply A, then apply
>> B then you need A && B. If later you want to apply C, then you need
>> A && B && C, etc...
>>
>> Does that look sane?
>>
>
> Even better: applying a filter would always automatically be an
> intersection of the previous one.
>
> If you do:
>
> SECCOMP_FILTER_SET, __NR_foo, "a == 1 || a == 2"
> SECCOMP_FILTER_APPLY
> SECCOMP_FILTER_SET, __NR_foo, "b == 2"
> SECCOMP_FILTER_APPLY
> SECCOMP_FILTER_SET, __NR_foo, "c == 3"
> SECCOMP_FILTER_APPLY
>
> The end result is:
>
> "(a == 1 || a == 2) && b == 2  && c == 3"
>
> So that we don't push the burden in the kernel to compare the applied
> expression with a new one that may or may not be embraced by parenthesis
> and other trickies like that. We simply append to the working one.
>
> Ah and OTOH this:
>
> SECCOMP_FILTER_SET, __NR_foo, "a == 1 || a == 2"
> SECCOMP_FILTER_APPLY
> SECCOMP_FILTER_SET, __NR_foo, "b == 2"
> SECCOMP_FILTER_SET, __NR_foo, "c == 3"
>
> has the following end result:
>
> "(a == 1 || a == 2) && c == 3"
>
> As long as you don't apply the filter, the temporary part is
> overriden, but still we keep
> the applied part.
>
> Still sane? (or completely nuts?)

Okay - so I *think* I'm following.  I really like the use of SET and
GET to allow for further constraint based on additional argument
restrictions instead of purely reducing the filters available.  The
only part I'm stumbling on is using APPLY on a per-filter basis.  In
my current implementation, I consider APPLY to be the global enable
bit.  Whatever filters are set become set in stone and only &&s are
handled.  I'm not sure I understand why it would make sense to do a
per-syscall-filter apply call.  It's certainly doable, but it will
mean that we may be logically storing something like:

__NR_foo: (a == 1 || a == 2), applied
__NR_foo: b == 2, not applied
__NR_foo: c == 3, not applied

after

SECCOMP_FILTER_SET, __NR_foo, "a == 1 || a == 2"
SECCOMP_FILTER_APPLY
SECCOMP_FILTER_SET, __NR_foo, "b == 2"
SECCOMP_FILTER_SET, __NR_foo, "c == 3"

In that case, would a call to sys_foo even be tested against the
non-applied constraints of b==2 or c==3? Or would the call to set "c
== 3" replace the "b == 2" entry?  I'm not sure I see that the benefit
exceeds the ambiguity that it might introduce.  However, if the default
behavior is to always extend with &&, then a consumer of the interface
could just do:

SECCOMP_FILTER_SET, __NR_prctl, "option == 2"
SECCOMP_FILTER_SET, __NR_foo, "a == 1 || a == 2"
SECCOMP_FILTER_SET, __NR_foo, "b == 2"
SECCOMP_FILTER_APPLY

This would yield a final filter for foo of "(a == 1 || a == 2) && b ==
2".  The call to APPLY would initiate the enforcement of the syscall
filtering and enforce that no new filters may be added for syscalls
that aren't already constrained.  So you could still call

SECCOMP_FILTER_SET, __NR_foo, "0"

which is logically "((a == 1 || a == 2) && b == 2) && 0" and would be
interpreted as just a DROP. But you could not do,

SECCOMP_FILTER_SET, __NR_foo, "0"
SECCOMP_FILTER_SET, __NR_foo, "1"
or
SECCOMP_FILTER_SET, __NR_read, "1"

(since the implicit filter for all syscalls after an APPLY call should
be "0" and additions would just be "0 && whatever").

Am I missing something?  If that makes sense, then we may even be able
to reduce the extra directives by one and get a resulting interface
that looks something like:

/* Appends (&&) a new filter string for a syscall to the current
filter value. "0" clears the filter. */
prctl(PR_SET_SECCOMP_FILTER, syscall_nr, filter_string);
/* Returns the current explicit filter string for a syscall */
prctl(PR_GET_SECCOMP_FILTER, syscall_nr, buf, buflen);
/* Transition to a secure computing mode:
 * 1 - enables traditional seccomp behavior
 * 2 - enables seccomp filter enforcement and changes the implicit
filter for syscalls from "1" to "0"
 */
prctl(PR_SET_SECCOMP, mode);

I'll set aside the v2 of the patch that uses ADD, DROP, and APPLY and
work up a version with this interface.  Do you (or anyone else :) feel
strongly about per-syscall APPLY?  I like the above version, but I'm
certainly willing to explore the other path.  As is, I'll go through
my usecases (and tests once I have a new cut of the patch) and see how
it feels.  At first blush, this appears more succinct _and_ more
expressive than the prior versions!

~~

As to the use of apply_on_exec, even if you whitelist: mmap, fstat64,
brk, uname, open, read, close, set_thread_area, mprotect, munmap, and
access _just_ to allow a process to be exec'd, it is still a
significant reduction in kernel attack surface.  Pairing that with a
LSM, delayed chroot, etc could fill in the gaps with respect to a
greater sandboxing solution.  I'd certainly take that tradeoff for
running binaries that I don't control the source for :)  That said,
LD_PRELOAD, ptrace injection, and other tricks could allow for the
injection of a very targeted filterset, but I don't think that
invalidates the on-exec case given the brittleness of relying exclusively
on such tactics.  Does that seem reasonable?


Thanks!
will

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 5/7] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-05-04  9:15                 ` Will Drewry
@ 2011-05-04  9:29                   ` Will Drewry
  2011-05-04 17:52                   ` Frederic Weisbecker
  1 sibling, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-05-04  9:29 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: Eric Paris, Ingo Molnar, linux-kernel, kees.cook, agl, jmorris,
	rostedt, Randy Dunlap, Linus Torvalds, Andrew Morton,
	Tom Zanussi, Arnaldo Carvalho de Melo, Peter Zijlstra,
	Thomas Gleixner

On Wed, May 4, 2011 at 2:15 AM, Will Drewry <wad@chromium.org> wrote:
> On Mon, May 2, 2011 at 6:47 PM, Frederic Weisbecker <fweisbec@gmail.com> wrote:
>> 2011/5/3 Frederic Weisbecker <fweisbec@gmail.com>:
>>> On Fri, Apr 29, 2011 at 11:13:44AM -0500, Will Drewry wrote:
>>>> That said, I have a general interface question :)  Right now I have:
>>>> prctl(PR_SET_SECCOMP, 2, SECCOMP_FILTER_ADD, syscall_nr, filter_string);
>>>> prctl(PR_SET_SECCOMP, 2, SECCOMP_FILTER_DROP, syscall_nr,
>>>> filter_string_or_NULL);
>>>> prctl(PR_SET_SECCOMP, 2, SECCOMP_FILTER_APPLY, apply_flags);
>>>>   (I will change this to default to apply_on_exec and let FILTER_APPLY
>>>> make it apply _now_ exclusively. :)
>>>>
>>>> This can easily be mapped to:
>>>> prctl(PR_SET_SECCOMP
>>>>        PR_SET_SECOMP_FILTER_ADD
>>>>        PR_SET_SECOMP_FILTER_DROP
>>>>        PR_SET_SECOMP_FILTER_APPLY
>>>> if that'd be preferable (to keep it all in the prctl.h world).
>>>>
>>>> Following along the suggestion of reducing custom parsing, it seemed
>>>> to make a lot of sense to make add and drop actions very explicit.
>>>> There is no guesswork so a system call filtered process will only be
>>>> able to perform DROP operations (if prctl is allowed) to reduce the
>>>> allowed system calls.  This also allows more fine grained flexibility
>>>> in addition to the in-kernel complexity reduction.  E.g.,
>>>> Process starts with
>>>>   __NR_read, "fd == 1"
>>>>   __NR_read, "fd == 2"
>>>> later it can call:
>>>>   prctl(PR_SET_SECCOMP, 2, SECCOMP_FILTER_DROP, __NR_read, "fd == 2");
>>>> to drop one of the filters without disabling "fd == 1" reading.  (Or
>>>> it could pass in NULL to drop all filters).
>>>
>>> Hm, but then you don't let the childs be able to restrict further
>>> what you allowed before.
>>>
>>> Say I have foo(int a, int b), and I apply these filters:
>>>
>>>        __NR_foo, "a == 1";
>>>        __NR_foo, "a == 2";
>>>
>>> This is basically "a == 1 || a == 2".
>>>
>>> Now I apply the filters and I fork. How can the child
>>> (or current task after the filter is applied) restrict
>>> further by only allowing "b == 2", such that with the
>>> inherited parent filters we have:
>>>
>>>        "(a == 1 || a == 2) && b == 2"
>>>
>>> So what you propose seems to me too limited. I'd rather have this:
>>>
>>> SECCOMP_FILTER_SET = remove previous filter entirely and set a new one
>>> SECCOMP_FILTER_GET = get the string of the current filter
>>>
>>> The rule would be that you can only set a filter that is intersected
>>> with the one that was previously applied.
>>>
>>> It means that if you set filter A and you apply it. If you want to set
>>> filter B thereafter, it must be:
>>>
>>>        A && B
>>>
>>> OTOH, as long as you haven't applied A, you can override it as you wish.
>>> Like you can have "A || B" instead. Or you can remove it with "1". Of course
>>> if a previous filter was applied before A, then your new filter must be
>>> concatenated: "previous && (A || B)".
>>>
>>> Right? And note in this scheme you can reproduce your DROP trick. If
>>> "A || B" is the current filter applied, then you can restrict B by
>>> doing: "(A || B) && A".
>>>
>>> So the role of SECCOMP_FILTER_GET is to get the string that matches
>>> the current applied filter.
>>>
>>> The effect of this is infinite of course. If you apply A, then apply
>>> B then you need A && B. If later you want to apply C, then you need
>>> A && B && C, etc...
>>>
>>> Does that look sane?
>>>
>>
>> Even better: applying a filter would always automatically be an
>> intersection of the previous one.
>>
>> If you do:
>>
>> SECCOMP_FILTER_SET, __NR_foo, "a == 1 || a == 2"
>> SECCOMP_FILTER_APPLY
>> SECCOMP_FILTER_SET, __NR_foo, "b == 2"
>> SECCOMP_FILTER_APPLY
>> SECCOMP_FILTER_SET, __NR_foo, "c == 3"
>> SECCOMP_FILTER_APPLY
>>
>> The end result is:
>>
>> "(a == 1 || a == 2) && b == 2  && c == 3"
>>
>> So that we don't push the burden in the kernel to compare the applied
>> expression with a new one that may or may not be embraced by parenthesis
>> and other trickies like that. We simply append to the working one.
>>
>> Ah and OTOH this:
>>
>> SECCOMP_FILTER_SET, __NR_foo, "a == 1 || a == 2"
>> SECCOMP_FILTER_APPLY
>> SECCOMP_FILTER_SET, __NR_foo, "b == 2"
>> SECCOMP_FILTER_SET, __NR_foo, "c == 3"
>>
>> has the following end result:
>>
>> "(a == 1 || a == 2) && c == 3"
>>
>> As long as you don't apply the filter, the temporary part is
>> overriden, but still we keep
>> the applied part.
>>
>> Still sane? (or completely nuts?)
>
> Okay - so I *think* I'm following.  I really like the use of SET and
> GET to allow for further constraint based on additional argument
> restrictions instead of purely reducing the filters available.  The
> only part I'm stumbling on is using APPLY on a per-filter basis.  In
> my current implementation, I consider APPLY to be the global enable
> bit.  Whatever filters are set become set in stone and only &&s are
> handled.  I'm not sure I understand why it would make sense to do a
> per-syscall-filter apply call.  It's certainly doable, but it will
> mean that we may be logically storing something like:
>
> __NR_foo: (a == 1 || a == 2), applied
> __NR_foo: b == 2, not applied
> __NR_foo: c == 3, not applied
>
> after
>
> SECCOMP_FILTER_SET, __NR_foo, "a == 1 || a == 2"
> SECCOMP_FILTER_APPLY
> SECCOMP_FILTER_SET, __NR_foo, "b == 2"
> SECCOMP_FILTER_SET, __NR_foo, "c == 3"
>
> In that case, would a call to sys_foo even be tested against the
> non-applied constraints of b==2 or c==3? Or would the call to set "c
> == 3" replace the "b == 2" entry.  I'm not sure I see that the benefit
> exceeds the ambiguity that might introduce.  However, if the default
> behavior it to always extend with &&, then a consumer of the interface
> could just do:
>
> SECCOMP_FILTER_SET, __NR_prctl, "option == 2"
> SECCOMP_FILTER_SET, __NR_foo, "a == 1 || a == 2"
> SECCOMP_FILTER_SET, __NR_foo, "b == 2"
> SECCOMP_FILTER_APPLY
>
> This would yield a final filter for foo of "(a == 1 || a == 2) && b ==
> 2".  The call to APPLY would initiate the enforcement of the syscall
> filtering and enforce that no new filters may be added for syscalls
> that aren't already constrained.  So you could still call
>
> SECCOMP_FILTER_SET, __NR_foo, "0"
>
> which is logically "((a == 1 || a == 2) && b == 2) && 0" and would be
> interpreted as just a DROP. But you could not do,
>
> SECCOMP_FILTER_SET, __NR_foo, "0"
> SECCOMP_FILTER_SET, __NR_foo, "1"
> or
> SECCOMP_FILTER_SET, __NR_read, "1"
>
> (since the implicit filter for all syscalls after an APPLY call should
> be "0" and additions would just be "0 && whatever").
>
> Am I missing something?  If that makes sense, then we may even be able
> to reduce the extra directives by one and get a resulting interface
> that looks something like:
>
> /* Appends (&&) a new filter string for a syscall to the current
> filter value. "0" clears the filter. */
> prctl(PR_SET_SECCOMP_FILTER, syscall_nr, filter_string);
> /* Returns the current explicit filter string for a syscall */
> prctl(PR_GET_SECCOMP_FILTER, syscall_nr, buf, buflen);
> /* Transition to a secure computing mode:
>  * 1 - enables traditional seccomp behavior
>  * 2 - enables seccomp filter enforcement and changes the implicit
> filter for syscalls from "1" to "0"
>  */
> prctl(PR_SET_SECCOMP, mode);
>
> I'll set aside the v2 of the patch that uses ADD, DROP, and APPLY and
> work up a version with this interface.  Do you (or anyone else :) feel
> strongly about per-syscall APPLY?  I like the above version, but I'm
> certainly willing to explore the other path.  As is, I'll go through
> my usecases (and tests once I have a new cut of the patch) and see how
> it feels.  At first blush, this appears more succinct _and_ more
> expressive than the prior versions!
>
> ~~
>
> As to the use of apply_on_exec, even if you whitelist: mmap, fstat64,
> brk, uname, open, read, close, set_thread_area, mprotect, munmap, and
> access _just_ to allow a process to be exec'd, it is still a
> significant reduction in kernel attack surface.  Pairing that with a
> LSM, delayed chroot, etc could fill in the gaps with respect to a
> greater sandboxing solution.  I'd certainly take that tradeoff for
> running binaries that I don't control the source for :)  That said,
> LD_PRELOAD, ptrace injection, and other tricks could allow for the
> injection of a very targeted filterset, but I don't think that
> invalidates the on-exec case given the brittleness relying exclusively
> on such tactics.  Does that seem reasonable?

With a little more thinking :), I don't think there's an obvious hook
for an "on-exec" bit in the interface proposed above.  I think that might
be a worthwhile tradeoff given how much cleaner it is.   It'd still be
possible to write a launcher to provide a reduced kernel surface, it'd
just also need a filter for the exec syscall.

cheers!
will

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 5/7] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-05-03  1:47               ` Frederic Weisbecker
  2011-05-04  9:15                 ` Will Drewry
@ 2011-05-04 12:16                 ` Steven Rostedt
  2011-05-04 15:54                   ` Eric Paris
  1 sibling, 1 reply; 406+ messages in thread
From: Steven Rostedt @ 2011-05-04 12:16 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: Will Drewry, Eric Paris, Ingo Molnar, linux-kernel, kees.cook,
	agl, jmorris, Randy Dunlap, Linus Torvalds, Andrew Morton,
	Tom Zanussi, Arnaldo Carvalho de Melo, Peter Zijlstra,
	Thomas Gleixner

On Tue, 2011-05-03 at 03:47 +0200, Frederic Weisbecker wrote:
> 2011/5/3 Frederic Weisbecker <fweisbec@gmail.com>:

> Even better: applying a filter would always automatically be an
> intersection of the previous one.
> 
> If you do:
> 
> SECCOMP_FILTER_SET, __NR_foo, "a == 1 || a == 2"
> SECCOMP_FILTER_APPLY
> SECCOMP_FILTER_SET, __NR_foo, "b == 2"
> SECCOMP_FILTER_APPLY
> SECCOMP_FILTER_SET, __NR_foo, "c == 3"
> SECCOMP_FILTER_APPLY
> 
> The end result is:
> 
> "(a == 1 || a == 2) && b == 2  && c == 3"
> 

I'm a little confused. Why do we have both a FILTER_SET and a
FILTER_APPLY? Maybe this was discussed earlier in the thread and I
missed it or simply forgot.

Why not just apply on the set call?

-- Steve



^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 5/7] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-05-04 12:16                 ` Steven Rostedt
@ 2011-05-04 15:54                   ` Eric Paris
  2011-05-04 16:06                     ` Steven Rostedt
  0 siblings, 1 reply; 406+ messages in thread
From: Eric Paris @ 2011-05-04 15:54 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Frederic Weisbecker, Will Drewry, Ingo Molnar, linux-kernel,
	kees.cook, agl, jmorris, Randy Dunlap, Linus Torvalds,
	Andrew Morton, Tom Zanussi, Arnaldo Carvalho de Melo,
	Peter Zijlstra, Thomas Gleixner

On Wed, 2011-05-04 at 08:16 -0400, Steven Rostedt wrote:
> On Tue, 2011-05-03 at 03:47 +0200, Frederic Weisbecker wrote:
> > 2011/5/3 Frederic Weisbecker <fweisbec@gmail.com>:
> 
> > Even better: applying a filter would always automatically be an
> > intersection of the previous one.
> > 
> > If you do:
> > 
> > SECCOMP_FILTER_SET, __NR_foo, "a == 1 || a == 2"
> > SECCOMP_FILTER_APPLY
> > SECCOMP_FILTER_SET, __NR_foo, "b == 2"
> > SECCOMP_FILTER_APPLY
> > SECCOMP_FILTER_SET, __NR_foo, "c == 3"
> > SECCOMP_FILTER_APPLY
> > 
> > The end result is:
> > 
> > "(a == 1 || a == 2) && b == 2  && c == 3"
> > 
> 
> I'm a little confused. Why do we have both a FILTER_SET and a
> FILTER_APPLY? Maybe this was discussed earlier in the thread and I
> missed it or simply forgot.
> 
> Why not just apply on the set call?

As this is a deny by default interface which only allows you to further
restrict, you couldn't add more than 1 syscall if you didn't have an
explicit 'apply' action.

SECCOMP_FILTER_SET, __NR_foo, "a=0"
SECCOMP_FILTER_SET, __NR_read, "1" == EPERM

Maybe apply on set is fine after the first apply, but we definitely need
some way to do more than 1 set before the rules are applied....

-Eric


^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 5/7] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-05-04 15:54                   ` Eric Paris
@ 2011-05-04 16:06                     ` Steven Rostedt
  2011-05-04 16:22                       ` Eric Paris
  0 siblings, 1 reply; 406+ messages in thread
From: Steven Rostedt @ 2011-05-04 16:06 UTC (permalink / raw)
  To: Eric Paris
  Cc: Frederic Weisbecker, Will Drewry, Ingo Molnar, linux-kernel,
	kees.cook, agl, jmorris, Randy Dunlap, Linus Torvalds,
	Andrew Morton, Tom Zanussi, Arnaldo Carvalho de Melo,
	Peter Zijlstra, Thomas Gleixner

On Wed, 2011-05-04 at 11:54 -0400, Eric Paris wrote:

> As this is a deny by default interface which only allows you to further
> restrict you couldn't add more than 1 syscall if you didn't have an
> explict 'apply' action.
> 
> SECCOMP_FILTER_SET, __NR_fo, "a=0"
> SECCOMP_FILTER_SET, __NR_read, "1" == EPERM
> 
> Maybe apply on set is fine after the first apply, but we definitely need
> some way to do more than 1 set before the rules are applied....

So we could have SET be 'or' and APPLY be 'and'.

SECCOMP_FILTER_SET, __NR_foo, "a=0"
SECCOMP_FILTER_SET, __NR_read, "1" == EPERM
SECCOMP_FILTER_APPLY

SECCOMP_FILTER_SET, __NR_foo, "b=0"
SECCOMP_FILTER_APPLY

Will end up being:

(foo: a == 0 || read: "1" == EPERM) && (foo: b == 0)

The second set/apply now removes the read option, and foo only works if
a is 0 and b is 0.

This would also work for children, as they can only restrict (with
'and') and can not add more control.

-- Steve



^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 5/7] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-05-04 16:06                     ` Steven Rostedt
@ 2011-05-04 16:22                       ` Eric Paris
  2011-05-04 16:39                         ` Steven Rostedt
  2011-05-04 17:03                         ` Frederic Weisbecker
  0 siblings, 2 replies; 406+ messages in thread
From: Eric Paris @ 2011-05-04 16:22 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Frederic Weisbecker, Will Drewry, Ingo Molnar, linux-kernel,
	kees.cook, agl, jmorris, Randy Dunlap, Linus Torvalds,
	Andrew Morton, Tom Zanussi, Arnaldo Carvalho de Melo,
	Peter Zijlstra, Thomas Gleixner

On Wed, 2011-05-04 at 12:06 -0400, Steven Rostedt wrote:
> On Wed, 2011-05-04 at 11:54 -0400, Eric Paris wrote:
> 
> > As this is a deny by default interface which only allows you to further
> > restrict you couldn't add more than 1 syscall if you didn't have an
> > explict 'apply' action.
> > 
> > SECCOMP_FILTER_SET, __NR_fo, "a=0"
> > SECCOMP_FILTER_SET, __NR_read, "1" == EPERM
> > 
> > Maybe apply on set is fine after the first apply, but we definitely need
> > some way to do more than 1 set before the rules are applied....
> 
> So we could have SET be 'or' and APPLY be 'and'.
> 
> SECCOMP_FILTER_SET, __NR_foo, "a=0"
> SECCOMP_FILTER_SET, __NR_read, "1" == EPERM

When I said "== EPERM" I meant that the given prctl call would return
EPERM.  I'm going to pretend that you didn't type it.

> SECCOPM_FILTER_APPLY
> 
> SECCOMP_FILTER_SET, __NR_foo, "b=0"
> SECCOPM_FILTER_APPLY
> 
> Will end up being:
> 
> (foo: a == 0 || read: "1") && (foo: b == 0)
> 
> The second set/apply now removes the read option, and foo only works if
> a is 0 and b is 0.
> 
> This would also work for children, as they can only restrict (with
> 'and') and can not add more control.

I think we pretty much agree although I'm pretty sure that we will have 1
filter per syscall.  So the rules would really be (in your syntax)

Rule1: (foo: a == 0 && b == 0)
OR
Rule2: (read: "1")

Although logically the same, it's not just one huge rule.  I don't see
any need for any operation other than an &&.  Before the first "set" you
can add new syscalls.  After the first set you can only && onto existing
syscalls.  So the following set of operations:

SECCOMP_FILTER_SET, __NR_foo, "a=0"
SECCOMP_FILTER_SET, __NR_read, "1"
SECCOMP_FILTER_APPLY
 
SECCOMP_FILTER_SET, __NR_foo, "b=0"
SECCOMP_FILTER_APPLY

SECCOMP_FILTER_SET, __NR_write, "1"
SECCOMP_FILTER_APPLY

Would return EPERM for the __NR_write entry since it was a new syscall
after a set.  I think we agree on all this.

I do have a question on some syntax proposed a while back.  Given:
SECCOMP_FILTER_SET, __NR_foo, "a=0"
SECCOMP_FILTER_SET, __NR_foo, "b=0"
SECCOMP_FILTER_APPLY

I would think to keep the interface consistent that should result in
foo: (a=0) && (b=0)

But I think the proposal was that we should instead have just
foo: (b=0)

What's the logic behind having a second call overwrite uncommitted
changes?  I sorta feel like if I put it in there, I must have wanted it
in there   :)

-Eric


^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 5/7] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-05-04 16:22                       ` Eric Paris
@ 2011-05-04 16:39                         ` Steven Rostedt
  2011-05-04 18:02                           ` Eric Paris
  2011-05-04 17:03                         ` Frederic Weisbecker
  1 sibling, 1 reply; 406+ messages in thread
From: Steven Rostedt @ 2011-05-04 16:39 UTC (permalink / raw)
  To: Eric Paris
  Cc: Frederic Weisbecker, Will Drewry, Ingo Molnar, linux-kernel,
	kees.cook, agl, jmorris, Randy Dunlap, Linus Torvalds,
	Andrew Morton, Tom Zanussi, Arnaldo Carvalho de Melo,
	Peter Zijlstra, Thomas Gleixner

On Wed, 2011-05-04 at 12:22 -0400, Eric Paris wrote:

> > SECCOMP_FILTER_SET, __NR_foo, "a=0"
> > SECCOMP_FILTER_SET, __NR_read, "1" == EPERM
> 
> When I said "== EPERM" I meant that the given prctl call would return
> EPERM.  I'm going to pretend that you didn't type it.

I was confused about that, but I didn't type that. You did. I just cut
and pasted it ;)

> 
> > SECCOPM_FILTER_APPLY
> > 
> > SECCOMP_FILTER_SET, __NR_foo, "b=0"
> > SECCOPM_FILTER_APPLY
> > 
> > Will end up being:
> > 
> > (foo: a == 0 || read: "1") && (foo: b == 0)
> > 
> > The second set/apply now removes the read option, and foo only works if
> > a is 0 and b is 0.
> > 
> > This would also work for children, as they can only restrict (with
> > 'and') and can not add more control.
> 
> I think we pretty much agree although I'm pretty that we will have 1
> filter per syscall.  So the rules would really be (in your syntax)
> 
> Rule1: (foo: a == 0 && b == 0)
> OR
> Rule2: (read: "1")
> 
> Although logically the same, it's not just one huge rule.  I don't see
> any need for any operation other than an &&.  Before the first "set" you
> can add new syscalls.  After the first set you can only && onto existing
> syscalls.  So the following set of operations:
> 
> SECCOMP_FILTER_SET, __NR_foo, "a=0"
> SECCOMP_FILTER_SET, __NR_read, "1"
> SECCOPM_FILTER_APPLY
>  
> SECCOMP_FILTER_SET, __NR_foo, "b=0"
> SECCOMP_FILTER_APPLY
> 
> SECCOMP_FILTER_SET, __NR_write, "1"
> SECCOMP_FILTER_APPLY
> 
> Would return EPERM for the __NR_write entry since it was a new syscall
> after a set.  I think we agree on all this.

Do you mean "after an apply"? As the second line above is a new syscall
after the first set.


> 
> I do have a question on some syntax proposed a while back.  Given:
> SECCOMP_FILTER_SET, __NR_foo, "a=0"
> SECCOMP_FILTER_SET, __NR_foo, "b=0"
> SECCOMP_FILTER_APPLY
> 
> I would think to keep the interface consistent that should result in
> foo: (a=0) && (b=0)

I agree.

> 
> But I think the proposal was that we should instead have just
> foo: (b=0)

Yeah, that's what it looked like Frederic showed. I'd rather have the
first instance.

Perhaps we could have a "unset"? that would only work on things that
haven't been applied yet.

> 
> What's the logic behind having a second call overwrite uncommitted
> changes?  I sorta feel like if I put it in there, I must have wanted it
> in there   :)

Perhaps for making the user code simpler?

SET a=1
SET b=2

[ some nasty if logic ]

   UNSET b=2

APPLY


Thus a default setting can be made and then we can selectively remove
settings before we do the apply based on information about the process
that we will exec. We can start out with "limit the hell out of it" and
then selectively remove things. I think this is a simple way to
understand what is being done. Kind of like iptables, where you can set
up default rules, but then selectively override them.

One thing I know about security, the easier it is to set up, the more
secure it becomes.

-- Steve



^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 5/7] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-05-04 16:22                       ` Eric Paris
  2011-05-04 16:39                         ` Steven Rostedt
@ 2011-05-04 17:03                         ` Frederic Weisbecker
  2011-05-04 17:55                           ` Eric Paris
  1 sibling, 1 reply; 406+ messages in thread
From: Frederic Weisbecker @ 2011-05-04 17:03 UTC (permalink / raw)
  To: Eric Paris
  Cc: Steven Rostedt, Will Drewry, Ingo Molnar, linux-kernel,
	kees.cook, agl, jmorris, Randy Dunlap, Linus Torvalds,
	Andrew Morton, Tom Zanussi, Arnaldo Carvalho de Melo,
	Peter Zijlstra, Thomas Gleixner

On Wed, May 04, 2011 at 12:22:40PM -0400, Eric Paris wrote:
> On Wed, 2011-05-04 at 12:06 -0400, Steven Rostedt wrote:
> > On Wed, 2011-05-04 at 11:54 -0400, Eric Paris wrote:
> > 
> > > As this is a deny by default interface which only allows you to further
> > > restrict you couldn't add more than 1 syscall if you didn't have an
> > > explict 'apply' action.
> > > 
> > > SECCOMP_FILTER_SET, __NR_fo, "a=0"
> > > SECCOMP_FILTER_SET, __NR_read, "1" == EPERM
> > > 
> > > Maybe apply on set is fine after the first apply, but we definitely need
> > > some way to do more than 1 set before the rules are applied....
> > 
> > So we could have SET be 'or' and APPLY be 'and'.
> > 
> > SECCOMP_FILTER_SET, __NR_foo, "a=0"
> > SECCOMP_FILTER_SET, __NR_read, "1" == EPERM
> 
> When I said "== EPERM" I meant that the given prctl call would return
> EPERM.  I'm going to pretend that you didn't type it.
> 
> > SECCOPM_FILTER_APPLY
> > 
> > SECCOMP_FILTER_SET, __NR_foo, "b=0"
> > SECCOPM_FILTER_APPLY
> > 
> > Will end up being:
> > 
> > (foo: a == 0 || read: "1") && (foo: b == 0)
> > 
> > The second set/apply now removes the read option, and foo only works if
> > a is 0 and b is 0.
> > 
> > This would also work for children, as they can only restrict (with
> > 'and') and can not add more control.
> 
> I think we pretty much agree although I'm pretty that we will have 1
> filter per syscall.  So the rules would really be (in your syntax)
> 
> Rule1: (foo: a == 0 && b == 0)
> OR
> Rule2: (read: "1")
> 
> Although logically the same, it's not just one huge rule.  I don't see
> any need for any operation other than an &&.  Before the first "set" you
> can add new syscalls.  After the first set you can only && onto existing
> syscalls.  So the following set of operations:
> 
> SECCOMP_FILTER_SET, __NR_foo, "a=0"
> SECCOMP_FILTER_SET, __NR_read, "1"
> SECCOPM_FILTER_APPLY
>  
> SECCOMP_FILTER_SET, __NR_foo, "b=0"
> SECCOMP_FILTER_APPLY
> 
> SECCOMP_FILTER_SET, __NR_write, "1"
> SECCOMP_FILTER_APPLY
> 
> Would return EPERM for the __NR_write entry since it was a new syscall
> after a set.  I think we agree on all this.

No, why?

The default filter for a syscall, if none have been given for it, is "0".

Thus, if you write "1" later, the entire filter is going to be:

	"0 && 1"

Which is fine, we are not overriding already applied permissions there.

So where is the need to return -EPERM in such a specific case? Is it
worth the corner case to check in the kernel, and to handle in userspace?
And for what reason?

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 5/7] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-05-04  9:15                 ` Will Drewry
  2011-05-04  9:29                   ` Will Drewry
@ 2011-05-04 17:52                   ` Frederic Weisbecker
  2011-05-04 18:23                     ` Steven Rostedt
  1 sibling, 1 reply; 406+ messages in thread
From: Frederic Weisbecker @ 2011-05-04 17:52 UTC (permalink / raw)
  To: Will Drewry
  Cc: Eric Paris, Ingo Molnar, linux-kernel, kees.cook, agl, jmorris,
	rostedt, Randy Dunlap, Linus Torvalds, Andrew Morton,
	Tom Zanussi, Arnaldo Carvalho de Melo, Peter Zijlstra,
	Thomas Gleixner

On Wed, May 04, 2011 at 02:15:47AM -0700, Will Drewry wrote:
> Okay - so I *think* I'm following.  I really like the use of SET and
> GET to allow for further constraint based on additional argument
> restrictions instead of purely reducing the filters available.  The
> only part I'm stumbling on is using APPLY on a per-filter basis.  In
> my current implementation, I consider APPLY to be the global enable
> bit.  Whatever filters are set become set in stone and only &&s are
> handled.  I'm not sure I understand why it would make sense to do a
> per-syscall-filter apply call.

Nope it's still a global apply..

>  It's certainly doable, but it will
> mean that we may be logically storing something like:
> 
> __NR_foo: (a == 1 || a == 2), applied
> __NR_foo: b == 2, not applied
> __NR_foo: c == 3, not applied
> 
> after
> 
> SECCOMP_FILTER_SET, __NR_foo, "a == 1 || a == 2"
> SECCOMP_FILTER_APPLY
> SECCOMP_FILTER_SET, __NR_foo, "b == 2"
> SECCOMP_FILTER_SET, __NR_foo, "c == 3"

No, the c == 3 would override b == 2.

> In that case, would a call to sys_foo even be tested against the
> non-applied constraints of b==2 or c==3?

No, not as long as it's not applied.

> Or would the call to set "c
> == 3" replace the "b == 2" entry.  I'm not sure I see that the benefit
> exceeds the ambiguity that might introduce.

The rationale behind it is that as long as you haven't applied your filter,
you should be able to override it.

And the simplest way to override it is to do it completely. Remove what
was there before (that wasn't applied).

You could opt for a FILTER_DROP.
Let's take that example:

SECCOMP_FILTER_SET, __NR_foo, "b == 2"
SECCOMP_FILTER_SET, __NR_foo, "c == 3"

Following your logic, we should have "b == 2 && c == 3" after that.
How are you going to remove only the c == 3 sequence?

You would need SECCOMP_FILTER_DROP, __NR_foo, "c == 3"

That said, using a string with a filter expression as an id looks a
bit odd to me. That doesn't work anymore with "((c == 3))", or "c== 3",
unless you compare against the resulting tree but that's complicated.

I mean, that works, most of the time you keep your expression
and pass it as is. But the expression should be identified by its
meaning, not by some random language layout.

That also forces you to scatter your filter representation:

	SECCOMP_FILTER_SET, __NR_foo, "b == 2"
	SECCOMP_FILTER_SET, __NR_foo, "c == 3"

For me this shouldn't be different than

	SECCOMP_FILTER_SET, __NR_foo, "b == 2 && c == 3"

Still nobody will be able to remove a part of the above expression.

So, I'm more for having something that removes everything not applied
than just a part of the non-applied filter.

Two possibilities I see:

	SECCOMP_FILTER_SET, __NR_foo, "b == 2"
	SECCOMP_FILTER_SET, __NR_foo, "c == 3"

	// And result is "c == 3"

Or:

	SECCOMP_FILTER_SET, __NR_foo, "b == 2"
	SECCOMP_FILTER_SET, __NR_foo, "c == 3"

	// And result is "b == 2 && c == 3"

	SECCOMP_FILTER_RESET, __NR_foo //rewind to filter "0"

	SECCOMP_FILTER_SET, __NR_foo, "c == 4"

	// And result is "c == 4"

How does that look?
	


> However, if the default
> behavior it to always extend with &&, then a consumer of the interface
> could just do:
> 
> SECCOMP_FILTER_SET, __NR_prctl, "option == 2"
> SECCOMP_FILTER_SET, __NR_foo, "a == 1 || a == 2"
> SECCOMP_FILTER_SET, __NR_foo, "b == 2"
> SECCOMP_FILTER_APPLY
> 
> This would yield a final filter for foo of "(a == 1 || a == 2) && b ==
> 2".  The call to APPLY would initiate the enforcement of the syscall
> filtering and enforce that no new filters may be added for syscalls
> that aren't already constrained.  So you could still call
> 
> SECCOMP_FILTER_SET, __NR_foo, "0"
> 
> which is logically "((a == 1 || a == 2) && b == 2) && 0" and would be
> interpreted as just a DROP. But you could not do,
> 
> SECCOMP_FILTER_SET, __NR_foo, "0"
> SECCOMP_FILTER_SET, __NR_foo, "1"
> or
> SECCOMP_FILTER_SET, __NR_read, "1"

Why?

"((a == 1 || a == 2) && b == 2) && 0 && 1"

or

"((a == 1 || a == 2) && b == 2) && 1"

does still follow our previous constraints.

> 
> (since the implicit filter for all syscalls after an APPLY call should
> be "0" and additions would just be "0 && whatever").

If the default filter is 0 after apply, setting whatever afterward is
not an issue. You'll have "0 && whatever" which still means 0, that's fine.

> 
> Am I missing something?  If that makes sense, then we may even be able
> to reduce the extra directives by one and get a resulting interface
> that looks something like:
> 
> /* Appends (&&) a new filter string for a syscall to the current
> filter value. "0" clears the filter. */
> prctl(PR_SET_SECCOMP_FILTER, syscall_nr, filter_string);
> /* Returns the current explicit filter string for a syscall */
> prctl(PR_GET_SECCOMP_FILTER, syscall_nr, buf, buflen);

So GET is eventually optional here, as a new filter is automatically
appended to the previously applied one.

But if a process needs some information about current filter, this
makes sense.

> /* Transition to a secure computing mode:
>  * 1 - enables traditional seccomp behavior
>  * 2 - enables seccomp filter enforcement and changes the implicit
> filter for syscalls from "1" to "0"
>  */
> prctl(PR_SET_SECCOMP, mode);
> 
> I'll set aside the v2 of the patch that uses ADD, DROP, and APPLY and
> work up a version with this interface.  Do you (or anyone else :) feel
> strongly about per-syscall APPLY?

I'm still not sure what you meant by per syscall APPLY :)

> ~~
> 
> As to the use of apply_on_exec, even if you whitelist: mmap, fstat64,
> brk, uname, open, read, close, set_thread_area, mprotect, munmap, and
> access _just_ to allow a process to be exec'd, it is still a
> significant reduction in kernel attack surface.  Pairing that with a
> LSM, delayed chroot, etc could fill in the gaps with respect to a
> greater sandboxing solution.  I'd certainly take that tradeoff for
> running binaries that I don't control the source for :)  That said,
> LD_PRELOAD, ptrace injection, and other tricks could allow for the
> injection of a very targeted filterset, but I don't think that
> invalidates the on-exec case given the brittleness relying exclusively
> on such tactics.  Does that seem reasonable?

Right. But then, does a default enable_on_exec behaviour really match your
needs? Given your description, it seems that applying it on the next
fork would work too, as a random example.

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 5/7] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-05-04 17:03                         ` Frederic Weisbecker
@ 2011-05-04 17:55                           ` Eric Paris
  0 siblings, 0 replies; 406+ messages in thread
From: Eric Paris @ 2011-05-04 17:55 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: Steven Rostedt, Will Drewry, Ingo Molnar, linux-kernel,
	kees.cook, agl, jmorris, Randy Dunlap, Linus Torvalds,
	Andrew Morton, Tom Zanussi, Arnaldo Carvalho de Melo,
	Peter Zijlstra, Thomas Gleixner

On Wed, 2011-05-04 at 19:03 +0200, Frederic Weisbecker wrote:
> On Wed, May 04, 2011 at 12:22:40PM -0400, Eric Paris wrote:
> > So the following set of operations:
> > 
> > SECCOMP_FILTER_SET, __NR_foo, "a=0"
> > SECCOMP_FILTER_SET, __NR_read, "1"
> > SECCOMP_FILTER_APPLY
> >  
> > SECCOMP_FILTER_SET, __NR_foo, "b=0"
> > SECCOMP_FILTER_APPLY
> > 
> > SECCOMP_FILTER_SET, __NR_write, "1"
> > SECCOMP_FILTER_APPLY
> > 
> > Would return EPERM for the __NR_write entry since it was a new syscall
> > after a set.  I think we agree on all this.
> 
> No, why?
> 
> The default filter for a syscall, if none have been given for it, is "0".
> 
> Thus, if you write "1" later, the entire filter is going to be:
> 
> 	"0 && 1"
> 
> Which is fine, we are not overriding already applied permissions there.
> 
> So where is the need to return -EPERM in such a specific case? Is it
> worth the corner case to check in the kernel, and to handle in userspace?
> And for what reason?

I assumed without looking at the code (always a bad idea) that he wasn't
going to explicitly create a rule with "0" and was going to implicitly
deny anything without a rule.  If there is an explicit "0" rule then you
are right, i don't see a need to deny the set operation in the kernel.
But if it is implicit in the non-existence of a filter then it should be
easy to tell userspace it isn't allowed any more.

-Eric



^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 5/7] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-05-04 16:39                         ` Steven Rostedt
@ 2011-05-04 18:02                           ` Eric Paris
  0 siblings, 0 replies; 406+ messages in thread
From: Eric Paris @ 2011-05-04 18:02 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Frederic Weisbecker, Will Drewry, Ingo Molnar, linux-kernel,
	kees.cook, agl, jmorris, Randy Dunlap, Linus Torvalds,
	Andrew Morton, Tom Zanussi, Arnaldo Carvalho de Melo,
	Peter Zijlstra, Thomas Gleixner

On Wed, 2011-05-04 at 12:39 -0400, Steven Rostedt wrote:
> On Wed, 2011-05-04 at 12:22 -0400, Eric Paris wrote:

[rewriting history]
> > Although logically the same, it's not just one huge rule.  I don't see
> > any need for any operation other than an &&.  Before the first "apply" you
> > can add new syscalls.  After the first apply you can only && onto existing
> > syscalls.  So the following set of operations:
> > 
> > SECCOMP_FILTER_SET, __NR_foo, "a=0"
> > SECCOMP_FILTER_SET, __NR_read, "1"
> > SECCOMP_FILTER_APPLY
> >  
> > SECCOMP_FILTER_SET, __NR_foo, "b=0"
> > SECCOMP_FILTER_APPLY
> > 
> > SECCOMP_FILTER_SET, __NR_write, "1"
> > SECCOMP_FILTER_APPLY
> > 
> > Would return EPERM for the __NR_write entry since it was a new syscall
> > after an apply.  I think we agree on all this.
> 
> Do you mean "after an apply"? As the second line above is a new syscall
> after the first set.

Clearly.  Especially since given my revisionist history it's what I
said!

> > 
> > I do have a question on some syntax proposed a while back.  Given:
> > SECCOMP_FILTER_SET, __NR_foo, "a=0"
> > SECCOMP_FILTER_SET, __NR_foo, "b=0"
> > SECCOMP_FILTER_APPLY
> > 
> > I would think to keep the interface consistent that should result in
> > foo: (a=0) && (b=0)
> 
> I agree.
> 
> > 
> > But I think the proposal was that we should instead have just
> > foo: (b=0)
> 
> Yeah, that's what it looked like Frederic showed. I rather have the
> first instance.
> 
> Perhaps we could have a "unset"? that would only work on things that
> haven't been applied yet.
> 
> > 
> > What's the logic behind having a second call overwrite uncommitted
> > changes?  I sorta feel like if I put it in there, I must have wanted it
> > in there   :)
> 
> Perhaps for making the user code simpler?
> 
> SET a=1
> SET b=2
> 
> [ some nasty if logic ]
> 
>    UNSET b=2
> 
> APPLY
> 
> 
> Thus a default setting can be made and then we can selectively remove
> settings before we do the apply based on information about the process
> that we will exec. We can start out with "limit the hell out of it" and
> then selectively remove things. I think this is a simple way to
> understand what is being done. Kind of like iptables, where you can set
> up default rules, but then selectively override them.
> 
> One thing I know about security, the easier it is to set up, the more
> secure it becomes.

I'm ok with an explicit unset/remove/delete for rules since the last
apply if anyone thinks it will be useful.  But I don't like an implicit
'overwrite.'

I'm starting to debate whether the combination between rules should be an
implicit && or an implicit ||.  After an 'apply' I believe the next block
definitely needs an &&.  I feel like the code in userspace becomes
simpler with || but maybe it would be more confusing for the human coder
to have to know the distinctions.  The example in my head, which I think
will be common, involves handling multiple fd's for read.  We could
either do:

int handle_read_fd(int fd)
{
	char buf[4096];
	snprintf(buf, 4096, "a0=%d", fd)
	return prctl(SECCOMP_FILTER_SET, __NR_read, buf);
}

main()
{
....
	fd1 = open("readfile1", O_RDONLY);
	if (fd1 < 0) return
	if (handle_read_fd(fd1)) return
....
	fd2 = open("readfile2, O_RDONLY);
	if (fd2 < 0) return
	if (handle_read_fd(fd2)) return
....
	prctl(SECCOMP_FILTER_APPLY);
}

Or, case 2, with the implicit && like we talked about: to handle an
arbitrary number of read fds you need (pseudocode):
int handle_read_fds(int *fds, int len)
{
	char buf[4096];
	size_t saved;
	while(len--) {
		saved = snprintf(buf, 4096, "a0=%d ||");
		buf += saved;
	}
	return prctl(SECCOMP_FILTER_SET, __NR_read, buf)
}
main()
{
	int fd[2];
....
	fd[0] = open("readfile1", O_RDONLY);
	if (fd[0] < 0) return
....
	fd[1] = open("readfile2, O_RDONLY);
	if (fd[1] < 0) return
....
	handle_read_fds(fd, 2)
	prctl(SECCOMP_FILTER_APPLY);
}

The latter example requires userspace to hold state for what I think will
be the common case of complex rules, whereas in the former that is done
in the kernel.  It's a lot easier to code the first one, but it might be
harder on the coder to decide "now, is that an && or an ||?"  Like I
said, I'm ok with just declaring everything &&, especially if people
think complex filters are likely to be anything more than rule1 || rule2
|| rule3 || rule4 where ruleX is an independent clause, but I figured
I'd throw it out there....

No matter what, after the APPLY I think that:
SECCOMP_FILTER_SET, __NR_read, "0" should result in:
(rule1 || rule2 || rule3 || rule4) && 0

-Eric


^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 5/7] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-05-04 17:52                   ` Frederic Weisbecker
@ 2011-05-04 18:23                     ` Steven Rostedt
  2011-05-04 18:30                       ` Frederic Weisbecker
  0 siblings, 1 reply; 406+ messages in thread
From: Steven Rostedt @ 2011-05-04 18:23 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: Will Drewry, Eric Paris, Ingo Molnar, linux-kernel, kees.cook,
	agl, jmorris, Randy Dunlap, Linus Torvalds, Andrew Morton,
	Tom Zanussi, Arnaldo Carvalho de Melo, Peter Zijlstra,
	Thomas Gleixner

On Wed, 2011-05-04 at 19:52 +0200, Frederic Weisbecker wrote:

> >  It's certainly doable, but it will
> > mean that we may be logically storing something like:
> > 
> > __NR_foo: (a == 1 || a == 2), applied
> > __NR_foo: b == 2, not applied
> > __NR_foo: c == 3, not applied
> > 
> > after
> > 
> > SECCOMP_FILTER_SET, __NR_foo, "a == 1 || a == 2"
> > SECCOMP_FILTER_APPLY
> > SECCOMP_FILTER_SET, __NR_foo, "b == 2"
> > SECCOMP_FILTER_SET, __NR_foo, "c == 3"
> 
> No, the c == 3 would override b == 2.

I honestly hate the "override" mode. I like that SETs are or'd among
each other and an APPLY is a "commit"; meaning that you can only limit
it further, but can not extend it (an explicit &&)

> 
> > In that case, would a call to sys_foo even be tested against the
> > non-applied constraints of b==2 or c==3?
> 
> No, not as long as it's not applied.
> 
> > Or would the call to set "c
> > == 3" replace the "b == 2" entry.  I'm not sure I see that the benefit
> > exceeds the ambiguity that might introduce.
> 
> The rationale behind it is that as long as you haven't applied your filter,
> you should be able to override it.

We need a "UNSET" (I like that better than DROP).

> 
> And the simplest way to override it is to do it completely. Remove what
> was there before (that wasn't applied).
> 
> You could opt for a FILTER_DROP.
> Let's take that example:
> 
> SECCOMP_FILTER_SET, __NR_foo, "b == 2"
> SECCOMP_FILTER_SET, __NR_foo, "c == 3"
> 
> Following your logic, we should have "b == 2 && c == 3" after that.
> How are you going to remove only the c == 3 sequence?
> 
> You would need SECCOMP_FILTER_DROP, __NR_foo, "c == 3"
> 
> That said, using a string with a filter expression as an id looks a
> bit odd to me. That doesn't work anymore with "((c == 3))", or "c== 3",
> unless you compare against the resulting tree but that's complicated.

Nah, it should be easy. Actually, in "set" mode (before the apply) we
should keep a linked list of "trees" of sets. Then we just find the "tree"
that matches the UNSET and remove it.

> 
> I mean, that works, most of the time you keep your expression
> and pass it as is. But the expression should be identified by its
> meaning, not by some random language layout.
> 
> That also forces you to scatter your filter representation:
> 
> 	SECCOMP_FILTER_SET, __NR_foo, "b == 2"
> 	SECCOMP_FILTER_SET, __NR_foo, "c == 3"
> 
> For me this shouldn't be different than
> 
> 	SECCOMP_FILTER_SET, __NR_foo, "b == 2 && c == 3"
> 
> Still nobody will be able to remove a part of the above expression.

Correct, as the sets will be just linked lists of sets. Once we apply it,
it goes into the main tree.

Thus, to find the UNSET set, we would evaluate the tree of that unset,
and search for it. If it is found, remove it, otherwise return EINVAL or
something.

> 
> So, I'm more for having something that removes everything not applied
> than just a part of the non-applied filter.
> 
> Two possibilities I see:
> 
> 	SECCOMP_FILTER_SET, __NR_foo, "b == 2"
> 	SECCOMP_FILTER_SET, __NR_foo, "c == 3"
> 
> 	// And result is "c == 3"
> 
> Or:
> 
> 	SECCOMP_FILTER_SET, __NR_foo, "b == 2"
> 	SECCOMP_FILTER_SET, __NR_foo, "c == 3"
> 
> 	// And result is "b == 2 && c == 3"
> 
> 	SECCOMP_FILTER_RESET, __NR_foo //rewind to filter "0"
> 
> 	SECCOMP_FILTER_SET, __NR_foo, "c == 4"
> 
> 	// And result is "c == 4"
> 
> How does that look?

The thing is, I don't understand where we would use (or want) an
override. I could understand a UNSET, which I described in another
reply. Also, I like the fact that sets can be self contained because
then the user can add wrapper functions for them.

add_filter("foo: c == 4");
add_filter("bar: b < 3");
apply_filter();

That won't work with an override, unless we did the work in userspace to
keep the string, and then we need to worry about "string matches" as you
stated if we want to implement a remove_filter("foo: c==4"). (see it
would fail, because this version is missing spaces)

-- Steve



^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 5/7] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-05-04 18:23                     ` Steven Rostedt
@ 2011-05-04 18:30                       ` Frederic Weisbecker
  2011-05-04 18:46                         ` Steven Rostedt
  0 siblings, 1 reply; 406+ messages in thread
From: Frederic Weisbecker @ 2011-05-04 18:30 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Will Drewry, Eric Paris, Ingo Molnar, linux-kernel, kees.cook,
	agl, jmorris, Randy Dunlap, Linus Torvalds, Andrew Morton,
	Tom Zanussi, Arnaldo Carvalho de Melo, Peter Zijlstra,
	Thomas Gleixner

On Wed, May 04, 2011 at 02:23:02PM -0400, Steven Rostedt wrote:
> On Wed, 2011-05-04 at 19:52 +0200, Frederic Weisbecker wrote:
> 
> > >  It's certainly doable, but it will
> > > mean that we may be logically storing something like:
> > > 
> > > __NR_foo: (a == 1 || a == 2), applied
> > > __NR_foo: b == 2, not applied
> > > __NR_foo: c == 3, not applied
> > > 
> > > after
> > > 
> > > SECCOMP_FILTER_SET, __NR_foo, "a == 1 || a == 2"
> > > SECCOMP_FILTER_APPLY
> > > SECCOMP_FILTER_SET, __NR_foo, "b == 2"
> > > SECCOMP_FILTER_SET, __NR_foo, "c == 3"
> > 
> > No, the c == 3 would override b == 2.
> 
> I honestly hate the "override" mode. I like that SETs are or'd among
> each other and an APPLY is a "commit"; meaning that you can only limit
> it further, but can not extend it (an explicit &&)

I'm confused with what you just said...

Somehow I could understand it that way:

SECCOMP_FILTER_SET, __NR_foo, "a == 1"
SECCOMP_FILTER_SET, __NR_foo, "a == 2"
SECCOMP_FILTER_APPLY
SECCOMP_FILTER_SET, __NR_foo, "b == 1"

Would produce:

"(a == 1 || a == 2) && b == 1"

That makes a pretty confusing behaviour for users I think.

> 
> > 
> > > In that case, would a call to sys_foo even be tested against the
> > > non-applied constraints of b==2 or c==3?
> > 
> > No, not as long as it's not applied.
> > 
> > > Or would the call to set "c
> > > == 3" replace the "b == 2" entry.  I'm not sure I see that the benefit
> > > exceeds the ambiguity that might introduce.
> > 
> > The rationale behind it is that as long as you haven't applied your filter,
> > you should be able to override it.
> 
> We need a "UNSET" (I like that better than DROP).

What about a complete erase (RESET) of the temporary filter? Like I explained below
from my previous mail.

> 
> > 
> > And the simplest way to override it is to do it completely. Remove what
> > was there before (that wasn't applied).
> > 
> > You could opt for a FILTER_DROP.
> > Let's take that example:
> > 
> > SECCOMP_FILTER_SET, __NR_foo, "b == 2"
> > SECCOMP_FILTER_SET, __NR_foo, "c == 3"
> > 
> > Following your logic, we should have "b == 2 && c == 3" after that.
> > How are you going to remove only the c == 3 sequence?
> > 
> > You would need SECCOMP_FILTER_DROP, __NR_foo, "c == 3"
> > 
> > That said, using a string with a filter expression as an id looks a
> > bit odd to me. That doesn't work anymore with "((c == 3))", or "c== 3",
> > unless you compare against the resulting tree but that's complicated.
> 
> Nah, it should be easy. Actually, in "set" mode (before the apply) we
> should keep a link list of "trees" of sets. Then we just find the "tree"
> that matches the UNSET and remove it.
> 
> > 
> > I mean, that works, most of the time you keep your expression
> > and pass it as is. But the expression should be identified by its
> > meaning, not by some random language layout.
> > 
> > That also forces you to scatter your filter representation:
> > 
> > 	SECCOMP_FILTER_SET, __NR_foo, "b == 2"
> > 	SECCOMP_FILTER_SET, __NR_foo, "c == 3"
> > 
> > For me this shouldn't be different than
> > 
> > 	SECCOMP_FILTER_SET, __NR_foo, "b == 2 && c == 3"
> > 
> > Still nobody will be able to remove a part of the above expression.
> 
> Correct, as the sets will be just link lists of sets. Once we apply it,
> it goes into the main tree.
> 
> Thus, to find the UNSET set, we would evaluate the tree of that unset,
> and search for it. If it is found, remove it, otherwise return EINVAL or
> something.
> 
> > 
> > So, I'm more for having something that removes everything not applied
> > than just a part of the non-applied filter.
> > 
> > Two possibilities I see:
> > 
> > 	SECCOMP_FILTER_SET, __NR_foo, "b == 2"
> > 	SECCOMP_FILTER_SET, __NR_foo, "c == 3"
> > 
> > 	// And result is "c == 3"
> > 
> > Or:
> > 
> > 	SECCOMP_FILTER_SET, __NR_foo, "b == 2"
> > 	SECCOMP_FILTER_SET, __NR_foo, "c == 3"
> > 
> > 	// And result is "b == 2 && c == 3"
> > 
> > 	SECCOMP_FILTER_RESET, __NR_foo //rewind to filter "0"
> > 
> > 	SECCOMP_FILTER_SET, __NR_foo, "c == 4"
> > 
> > 	// And result is "c == 4"
> > 
> > How does that look?
> 
> The thing is, I don't understand where we would use (or want) an
> override. I could understand a UNSET, which I described in another
> reply. Also, I like the fact that sets can be self contained because
> then the user can add wrapper functions for them.
> 
> add_filter("foo: c == 4");
> add_filter("bar: b < 3");
> apply_filter();
> 
> That wont work with an override, unless we did the work in userspace to
> keep string, and then we need to worry about "string matches" as you
> stated if we want to implement a remove_filter("foo: c==4"). (see it
> would fail, because this version is missing spaces)
> 
> -- Steve
> 
> 

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 5/7] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-05-04 18:30                       ` Frederic Weisbecker
@ 2011-05-04 18:46                         ` Steven Rostedt
  2011-05-05  9:21                           ` Will Drewry
  0 siblings, 1 reply; 406+ messages in thread
From: Steven Rostedt @ 2011-05-04 18:46 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: Will Drewry, Eric Paris, Ingo Molnar, linux-kernel, kees.cook,
	agl, jmorris, Randy Dunlap, Linus Torvalds, Andrew Morton,
	Tom Zanussi, Arnaldo Carvalho de Melo, Peter Zijlstra,
	Thomas Gleixner

On Wed, 2011-05-04 at 20:30 +0200, Frederic Weisbecker wrote:
> On Wed, May 04, 2011 at 02:23:02PM -0400, Steven Rostedt wrote:
> > On Wed, 2011-05-04 at 19:52 +0200, Frederic Weisbecker wrote:
> > 
> > > >  It's certainly doable, but it will
> > > > mean that we may be logically storing something like:
> > > > 
> > > > __NR_foo: (a == 1 || a == 2), applied
> > > > __NR_foo: b == 2, not applied
> > > > __NR_foo: c == 3, not applied
> > > > 
> > > > after
> > > > 
> > > > SECCOMP_FILTER_SET, __NR_foo, "a == 1 || a == 2"
> > > > SECCOMP_FILTER_APPLY
> > > > SECCOMP_FILTER_SET, __NR_foo, "b == 2"
> > > > SECCOMP_FILTER_SET, __NR_foo, "c == 3"
> > > 
> > > No, the c == 3 would override b == 2.
> > 
> > I honestly hate the "override" mode. I like that SETs are or'd among
> > each other and an APPLY is a "commit"; meaning that you can only limit
> > it further, but can not extend it (an explicit &&)
> 
> I'm confused with what you just said...
> 
> Somehow I could understand it that way:
> 
> SECCOMP_FILTER_SET, __NR_foo, "a == 1"
> SECCOMP_FILTER_SET, __NR_foo, "a == 2"
> SECCOMP_FILTER_APPLY
> SECCOMP_FILTER_SET, __NR_foo, "b == 1"
> 
> Would produce:
> 
> "(a == 1 || a == 2) && b == 1"

No, it would produce:

 (a == 1 || a == 2)

The b == 1 will not be added until the next apply.

> 
> That makes a pretty confusing behaviour for users I think.


Not really, if it is documented well. Or we can call it:

SECCOMP_FILTER_SET_OR and SECCOMP_FILTER_APPLY_AND

to remove the ambiguity. The reason is actually quite simple. Before you
do an apply, you can modify it to whatever you want. But once you do an
apply, you just limited yourself. An apply can not be reversed.

> 
> > 
> > > 
> > > > In that case, would a call to sys_foo even be tested against the
> > > > non-applied constraints of b==2 or c==3?
> > > 
> > > No, not as long as it's not applied.
> > > 
> > > > Or would the call to set "c
> > > > == 3" replace the "b == 2" entry.  I'm not sure I see that the benefit
> > > > exceeds the ambiguity that might introduce.
> > > 
> > > The rationale behind it is that as long as you haven't applied your filter,
> > > you should be able to override it.
> > 
> > We need a "UNSET" (I like that better than DROP).
> 
> What about a complete erase (RESET) of the temporary filter? Like I explained below
> from my previous mail.

What is a temporary filter? And a RESET could be there too to just
remove all sets that are pending.

I was thinking that we add "SETS" which the kernel can verify are
correct and let the user know at the time of the command if it is valid.
But these sets are not actually implemented until an APPLY is hit.

-- Steve



^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 5/7] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-05-04 18:46                         ` Steven Rostedt
@ 2011-05-05  9:21                           ` Will Drewry
  2011-05-05 13:14                             ` Serge E. Hallyn
                                               ` (2 more replies)
  0 siblings, 3 replies; 406+ messages in thread
From: Will Drewry @ 2011-05-05  9:21 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Frederic Weisbecker, Eric Paris, Ingo Molnar, linux-kernel,
	kees.cook, agl, jmorris, Randy Dunlap, Linus Torvalds,
	Andrew Morton, Tom Zanussi, Arnaldo Carvalho de Melo,
	Peter Zijlstra, Thomas Gleixner

On Wed, May 4, 2011 at 11:46 AM, Steven Rostedt <rostedt@goodmis.org> wrote:
> On Wed, 2011-05-04 at 20:30 +0200, Frederic Weisbecker wrote:
>> On Wed, May 04, 2011 at 02:23:02PM -0400, Steven Rostedt wrote:
>> > On Wed, 2011-05-04 at 19:52 +0200, Frederic Weisbecker wrote:
>> >
>> > > >  It's certainly doable, but it will
>> > > > mean that we may be logically storing something like:
>> > > >
>> > > > __NR_foo: (a == 1 || a == 2), applied
>> > > > __NR_foo: b == 2, not applied
>> > > > __NR_foo: c == 3, not applied
>> > > >
>> > > > after
>> > > >
>> > > > SECCOMP_FILTER_SET, __NR_foo, "a == 1 || a == 2"
>> > > > SECCOMP_FILTER_APPLY
>> > > > SECCOMP_FILTER_SET, __NR_foo, "b == 2"
>> > > > SECCOMP_FILTER_SET, __NR_foo, "c == 3"
>> > >
>> > > No, the c == 3 would override b == 2.
>> >
>> > I honestly hate the "override" mode. I like that SETs are or'd among
>> > each other and an APPLY is a "commit"; meaning that you can only limit
>> > it further, but can not extend it (an explicit &&)
>>
>> I'm confused with what you just said...
>>
>> Somehow I could understand it that way:
>>
>> SECCOMP_FILTER_SET, __NR_foo, "a == 1"
>> SECCOMP_FILTER_SET, __NR_foo, "a == 2"
>> SECCOMP_FILTER_APPLY
>> SECCOMP_FILTER_SET, __NR_foo, "b == 1"
>>
>> Would produce:
>>
>> "(a == 1 || a == 2) && b == 1"
>
> No, it would produce:
>
>  (a == 1 || a == 2)
>
> The b == 1 will not be added until the next apply.
>
>>
>> That makes a pretty confusing behaviour for users I think.
>
>
> Not really, if it is documented well. Or we can call it:
>
> SECCOMP_FILTER_SET_OR and SECCOMP_FILTER_APPLY_AND
>
> to remove the ambiguity. The reason is actually quite simple. Before you
> do an apply, you can modify it to whatever you want. But once you do an
> apply, you just limited yourself. An apply can not be reversed.
>
>>
>> >
>> > >
>> > > > In that case, would a call to sys_foo even be tested against the
>> > > > non-applied constraints of b==2 or c==3?
>> > >
>> > > No, not as long as it's not applied.
>> > >
>> > > > Or would the call to set "c
>> > > > == 3" replace the "b == 2" entry.  I'm not sure I see that the benefit
>> > > > exceeds the ambiguity that might introduce.
>> > >
>> > > The rationale behind it is that as long as you haven't applied your filter,
>> > > you should be able to override it.
>> >
>> > We need a "UNSET" (I like that better than DROP).
>>
>> What about a complete erase (RESET) of the temporary filter? Like I explained below
>> from my previous mail.
>
> What is a temporary filter? And a RESET could be there too to just
> remove all sets that are pending.
>
> I was thinking that we add "SETS" which the kernel can verify are
> correct and let the user know at the time of the command if it is valid.
> But these sets are not actually implemented until an APPLY is hit.
>

The past few mails have proposed quite a few interesting variants, but
I wonder if Eric's code samples show something useful about the
not-yet-applied filters in addition to the behavioral differences
between changing implicit operations (&& versus ||).

In particular, if the userspace code wants to stage some filters and
apply them all at once, when ready, I'm not sure that it makes sense
to me to put that complexity in the kernel itself.  For instance,
Eric's second sample showed a call that took an array of ints and
coalesced them into "fd == %d || ...".  That simple example shows that
we could easily get by with a pretty minimal kernel-supported
interface as long as the richer behavior could live userspace side --
even if just in a simple helper library.  It'd be pretty easy to
implement a userspace library that exposed add_filter(syscall_nr,
filter) and apply_filters() such that it could manage building the
final filter string for a given syscall and pushing it to prctl on
apply.

I think that could also help simplify the primitives.  For instance,
if every separate SET call on a system call resulted in an &&
operation, then the behavior could be consistent prior to enforcement
of the filtering and after.  E.g.,
  SET, __NR_read, "fd == 1"
  SET, __NR_read, "len < 4097"
would result in an evaluated "fd == 1 && len < 4097".  It would do so
after a single APPLY call too:
  SET, __NR_read, "1"
  APPLY
  SET, __NR_read, "fd == 1"
  SET, __NR_read, "len < 4097"
Results in: "1 && fd == 1 && len < 4097", and SET, nr, "0" would
nullify the syscall filter in total.  It seems like that would be
enough to build the SET-SET-...-APPLY, SET-SET-...-SET-APPLY logic
into a userspace library so that all temporary unapplied state doesn't
have to be explicitly managed by the kernel.

While I completely agree with the comment around ease-of-use as being
key to security, I also find that the more the state diagram explodes,
the harder it is to feel confident that a solution is actually secure.
 To try to achieve both objectives, I'd like to limit the kernel
interface to the bare minimum of primitives and build any API
fanciness into userspace.

Does it seem that the tradeoff isn't worth it, or are there some
specific behaviors that aren't addressed using that model?

While writing that, another option occurred to me that touches on the
other proposals but makes the behaviors much more explicit.
A prctl prototype could be provided:
 prctl(<SET|GET>, <AND|OR>, <syscall_nr>, <filter string>)
e.g.,
 prctl(PR_SET_SECCOMP_FILTER, PR_SECCOMP_FILTER_OR, __NR_read, "fd == 2");

The explicit prctl argument list would allow the filter strings to be
self-referential and allow the userspace app to decide what behaviors
are allowed and when. If we followed that route, all implicit filters
would be "0" and the initial call to get things started might be:
   #define SET 33
   #define OR 0
   #define AND 1
   SET, OR, __NR_prctl, "option == 33 && (arg1 == 0 || arg1 == 1)"
   prctl(PR_SET_SECCOMP, 2);

So now the "locked down" binary can call prctl to set an OR or AND
filter for any syscall.  A subsequent call could change that:
  SET, OR, __NR_read, "fd == 2"  /* => "0 || fd == 2" */
  SET, AND, __NR_prctl, "(arg2 != 63 || arg1 != 0)"  /* __NR_read == 63 */

This would OR in a __NR_read filter, then disallow a future call to
prctl to OR in more NR_read filters, but for other syscalls ANDing and
ORing is still possible until you pass in something like:

  SET, AND, __NR_prctl, "arg1 == 1"

which would lock down all future prctl calls to only ANDing filters
in.  (The numbers in the examples could then be properly managed in a
userspace library to ensure platform correctness.)

While this would reduce the primitives a bit further, I'm not sure if
this would be the right approach either, but it would open the door to
pushing even more down to userspace very explicitly and further
removing magic policy logic from the kernel-side.  Is this vaguely
interesting or just another layer of confusing-ness?


I'll follow Eric's lead and try out a few different interfaces
proposed earlier and the ones I laid out above and see if it seems to
come out any clearer (for me at least).  I'd love to know if anyone
else thinks we can get away with fewer primitives and put more of the
complex/delayed logic in userspace exclusively.

thanks!
will

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 5/7] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-05-05  9:21                           ` Will Drewry
@ 2011-05-05 13:14                             ` Serge E. Hallyn
  2011-05-12  3:20                               ` Will Drewry
  2011-05-06 11:53                             ` Steven Rostedt
  2011-05-06 16:30                             ` [PATCH 5/7] " Eric Paris
  2 siblings, 1 reply; 406+ messages in thread
From: Serge E. Hallyn @ 2011-05-05 13:14 UTC (permalink / raw)
  To: Will Drewry
  Cc: Steven Rostedt, Frederic Weisbecker, Eric Paris, Ingo Molnar,
	linux-kernel, kees.cook, agl, jmorris, Randy Dunlap,
	Linus Torvalds, Andrew Morton, Tom Zanussi,
	Arnaldo Carvalho de Melo, Peter Zijlstra, Thomas Gleixner

Quoting Will Drewry (wad@chromium.org):
> In particular, if the userspace code wants to stage some filters and
> apply them all at once, when ready, I'm not sure that it makes sense
> to me to put that complexity in the kernel itself.  For instance,

Hi Will,

just one note - in my original comment I wasn't actually suggesting
disabling setting of filters through a writeable file - I was only
suggesting restricting writing to one's own filters file.

All the better if it is possible to get a nice prctl-only
interface, but if it ends up limiting rule expressiveness (or taking
years to define an interface) then perhaps we should stick with
prctl for setting seccomp mode, and a more expressive file interface
for defining filters.

-serge

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 5/7] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-05-05  9:21                           ` Will Drewry
  2011-05-05 13:14                             ` Serge E. Hallyn
@ 2011-05-06 11:53                             ` Steven Rostedt
  2011-05-06 13:35                               ` Eric Paris
  2011-05-07  1:58                               ` Will Drewry
  2011-05-06 16:30                             ` [PATCH 5/7] " Eric Paris
  2 siblings, 2 replies; 406+ messages in thread
From: Steven Rostedt @ 2011-05-06 11:53 UTC (permalink / raw)
  To: Will Drewry
  Cc: Frederic Weisbecker, Eric Paris, Ingo Molnar, linux-kernel,
	kees.cook, agl, jmorris, Randy Dunlap, Linus Torvalds,
	Andrew Morton, Tom Zanussi, Arnaldo Carvalho de Melo,
	Peter Zijlstra, Thomas Gleixner

On Thu, 2011-05-05 at 02:21 -0700, Will Drewry wrote:

> In particular, if the userspace code wants to stage some filters and
> apply them all at once, when ready, I'm not sure that it makes sense
> to me to put that complexity in the kernel itself.  For instance,
> Eric's second sample showed a call that took an array of ints and
> coalesced them into "fd == %d || ...".  That simple example shows that
> we could easily get by with a pretty minimal kernel-supported
> interface as long as the richer behavior could live userspace side --
> even if just in a simple helper library.  It'd be pretty easy to
> implement a userspace library that exposed add_filter(syscall_nr,
> filter) and apply_filters() such that it could manage building the
> final filter string for a given syscall and pushing it to prctl on
> apply.

I'm fine with a single kernel call and the "temporary filter" be done in
userspace. Making the kernel code less complex is better :)

> 
> I think that could also help simplify the primitives.  For instance,
> if any separate SET called on a system call resulting in an &&
> operation, then the behavior could be consistent prior to enforcement
> of the filtering and after.  E.g.,
>   SET, __NR_read, "fd == 1"
>   SET, __NR_read, "len < 4097"
> would result in an evaluated "fd == 1 && len < 4097".  It would do so
> after a single APPLY call too:
>   SET, __NR_read, "1"
>   APPLY
>   SET, __NR_read, "fd == 1"
>   SET, __NR_read, "len < 4097"
> Results in: "1 && fd == 1 && len < 4097", and SET, nr, "0" would
> nullify the syscall filter in total.

Only that that was not applied? We can't let tasks nullify their
restrictions once they have been applied. This keeps the kernel code
simpler.

>   It seems like that would be
> enough to build the SET-SET-...-APPLY, SET-SET-...-SET-APPLY logic
> into a userspace library so that all temporary unapplied state doesn't
> have to be explicitly managed by the kernel.

Thus, the SETs are done in the userspace library that does not need to
interact with the kernel (besides perhaps allocating memory). Then the
apply would send all the filters to the kernel which would restrict the
task (or the task on exec) further.

> 
> While I completely agree with the comment around ease-of-use as being
> key to security, I also find that the more the state diagram explodes,
> the harder it is to feel confident that a solution is actually secure.
>  To try to achieve both objectives, I'd like to limit the kernel
> interface to the bare minimum of primitives and build any API
> fanciness into userspace.

Fair enough.

> 
> Does it seem that the tradeoff isn't worth it, or are there some
> specific behaviors that aren't addressed using that model?
> 
> While writing that, another option occurred to me that touches on the
> other proposals but makes the behaviors much more explicit.
> A prctl prototype could be provided:
>  prctl(<SET|GET>, <AND|OR>, <syscall_nr>, <filter string>)
> e.g.,
>  prctl(PR_SET_SECCOMP_FILTER, PR_SECCOMP_FILTER_OR, __NR_read, "fd == 2");
> 
> The explicit prctl argument list would allow the filter strings to be
> self-referential and allow the userspace app to decide what behaviors
> are allowed and when. If we followed that route, all implicit filters
> would be "0" and the initial call to get things started might be:
>    #define SET 33
>    #define OR 0
>    #define AND 1
>    SET, OR, __NR_prctl, "option == 33 && (arg1 == 0 || arg1 == 1)"
>    prctl(PR_SET_SECCOMP, 2);
> 
> So now the "locked down" binary can call prctl to set an OR or AND
> filter for any syscall.  A subsequent call could change that:
>   SET, OR, __NR_read, "fd == 2"  /* => "0 || fd == 2" */
>   SET, AND, __NR_prctl, "(arg2 != 63 || arg1 != 0)"  /* __NR_read == 63 */
> 
> This would OR in a __NR_read filter, then disallow a future call to
> prctl to OR in more NR_read filters, but for other syscalls ANDing and
> ORing is still possible until you pass in something like:
> 
>   SET, AND, __NR_prctl, "arg1 == 1"
> 
> which would lock down all future prctl calls to only ANDing filters
> in.  (The numbers in the examples could then be properly managed in a
> userspace library to ensure platform correctness.)

I don't know about this. It seems to be starting to get too complex, and
thus error prone. Is there any reason we should allow an OR to the task?
Why would we want to restrict a task where the task could easily
unrestrict itself?

> 
> While this would reduce the primitives a bit further, I'm not sure if
> this would be the right approach either, but it would open the door to
> pushing even more down to userspace very explicitly and further
> removing magic policy logic from the kernel-side.  Is this vaguely
> interesting or just another layer of confusing-ness?

I'm confused, thus I must have hit that layer ;)

-- Steve



^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 5/7] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-05-06 11:53                             ` Steven Rostedt
@ 2011-05-06 13:35                               ` Eric Paris
  2011-05-07  1:58                               ` Will Drewry
  1 sibling, 0 replies; 406+ messages in thread
From: Eric Paris @ 2011-05-06 13:35 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Will Drewry, Frederic Weisbecker, Ingo Molnar, linux-kernel,
	kees.cook, agl, jmorris, Randy Dunlap, Linus Torvalds,
	Andrew Morton, Tom Zanussi, Arnaldo Carvalho de Melo,
	Peter Zijlstra, Thomas Gleixner

On Fri, 2011-05-06 at 07:53 -0400, Steven Rostedt wrote:
> On Thu, 2011-05-05 at 02:21 -0700, Will Drewry wrote:
> 
> > In particular, if the userspace code wants to stage some filters and
> > apply them all at once, when ready, I'm not sure that it makes sense
> > to me to put that complexity in the kernel itself.  For instance,
> > Eric's second sample showed a call that took an array of ints and
> > coalesced them into "fd == %d || ...".  That simple example shows that
> > we could easily get by with a pretty minimal kernel-supported
> > interface as long as the richer behavior could live userspace side --
> > even if just in a simple helper library.  It'd be pretty easy to
> > implement a userspace library that exposed add_filter(syscall_nr,
> > filter) and apply_filters() such that it could manage building the
> > final filter string for a given syscall and pushing it to prctl on
> > apply.
> 
> I'm fine with a single kernel call and the "temporary filter" be done in
> userspace. Making the kernel code less complex is better :)

I'm also starting to think a userspace library is a good idea as well.
Everything in the kernel is an && with nothing but SET and APPLY.  We
provide interfaces to build the strings as we go along including UNSET
and stuff like that if there are users....

-Eric


^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 5/7] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-05-05  9:21                           ` Will Drewry
  2011-05-05 13:14                             ` Serge E. Hallyn
  2011-05-06 11:53                             ` Steven Rostedt
@ 2011-05-06 16:30                             ` Eric Paris
  2011-05-07  2:11                               ` Will Drewry
  2 siblings, 1 reply; 406+ messages in thread
From: Eric Paris @ 2011-05-06 16:30 UTC (permalink / raw)
  To: Will Drewry
  Cc: Steven Rostedt, Frederic Weisbecker, Ingo Molnar, linux-kernel,
	kees.cook, agl, jmorris, Randy Dunlap, Linus Torvalds,
	Andrew Morton, Tom Zanussi, Arnaldo Carvalho de Melo,
	Peter Zijlstra, Thomas Gleixner

[-- Attachment #1: Type: text/plain, Size: 1322 bytes --]

On Thu, 2011-05-05 at 02:21 -0700, Will Drewry wrote:
> On Wed, May 4, 2011 at 11:46 AM, Steven Rostedt <rostedt@goodmis.org> wrote:

> In particular, if the userspace code wants to stage some filters and
> apply them all at once, when ready, I'm not sure that it makes sense
> to me to put that complexity in the kernel itself.  For instance,
> Eric's second sample showed a call that took an array of ints and
> coalesced them into "fd == %d || ...".  That simple example shows that
> we could easily get by with a pretty minimal kernel-supported
> interface as long as the richer behavior could live userspace side --
> even if just in a simple helper library.  It'd be pretty easy to
> implement a userspace library that exposed add_filter(syscall_nr,
> filter) and apply_filters() such that it could manage building the
> final filter string for a given syscall and pushing it to prctl on
> apply.

Had a few minutes this morning so I started writing something that looks
sorta like a userspace library.  It doesn't have unset yet but I don't
think it'll be that hard for me to implement.  You can build some pretty
complex filters easily.  Not sure when I'll get to work on it more, but
it at least shows the idea of doing the complex work in a library and
keeping a simple kernel interface....  Any thoughts?

-Eric

[-- Attachment #2: libseccomp.c --]
[-- Type: text/x-csrc, Size: 6604 bytes --]

#include <assert.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>

#include "libseccomp.h"

/*
 * () around the string is 2
 * '\0' at the end is 1
 * Total extra space needed is 3
 */
#define SECCOMP_STRING_BYTES	3

/* Values for seccomp_filter.type. */
#define SECCOMP_FILTER_TYPE_UNSET	0
#define SECCOMP_FILTER_TYPE_NODE	1
#define SECCOMP_FILTER_TYPE_LEAF	2

/*
 * Values for seccomp_filter.operation.  Apart from the UNSET marker these
 * are the operator spellings, stored with their surrounding spaces.
 */
#define SECCOMP_FILTER_OP_UNSET		0
#define SECCOMP_FILTER_OP_AND		" && "
#define SECCOMP_FILTER_OP_OR		" || "
#define SECCOMP_FILTER_OP_EQUAL		" == "
#define SECCOMP_FILTER_OP_NOT_EQUAL	" != "
#define SECCOMP_FILTER_OP_LESS		" < "
#define SECCOMP_FILTER_OP_LESS_EQUAL	" <= "
#define SECCOMP_FILTER_OP_GREATER	" > "
#define SECCOMP_FILTER_OP_GREATER_EQUAL	" >= "
#define SECCOMP_FILTER_OP_ONE		" 1 "
#define SECCOMP_FILTER_OP_ZERO		" 0 "

/* One element of a filter expression tree. */
typedef struct seccomp_filter {
	char type;			/* one of SECCOMP_FILTER_TYPE_* */
	struct seccomp_filter *parent;	/* enclosing element, if any */
	struct seccomp_filter *left;	/* first operand sub-tree */
	struct seccomp_filter *right;	/* second operand sub-tree */
	const char *operation;		/* one of SECCOMP_FILTER_OP_* */
	char *arg;			/* text placed before the operator */
	char *value;			/* text placed after the operator */
} seccomp_filter_t;

/*
 * Render the filter tree rooted at sf as a freshly allocated string of the
 * form "(<left><operation><right>)".
 *
 * For a NODE the two sides are the rendered sub-trees; for a LEAF they are
 * copies of sf->arg and sf->value, either of which may be absent.  The
 * operation string is checked against the set allowed for the node type and
 * anything else trips an assert.
 *
 * Returns NULL if any allocation fails.  The caller frees the result.
 */
char *seccomp_filter_to_string(seccomp_filter_t *sf)
{
	char *lhs = NULL;
	char *rhs = NULL;
	char *result = NULL;
	size_t total;
	int written;

	switch (sf->type) {
	case SECCOMP_FILTER_TYPE_NODE:
		lhs = seccomp_filter_to_string(sf->left);
		if (!lhs)
			goto done;
		rhs = seccomp_filter_to_string(sf->right);
		if (!rhs)
			goto done;

		/* Interior nodes may only combine with && or ||. */
		if (strcmp(sf->operation, SECCOMP_FILTER_OP_AND) &&
		    strcmp(sf->operation, SECCOMP_FILTER_OP_OR))
			assert(0);
		break;

	case SECCOMP_FILTER_TYPE_LEAF:
		if (sf->arg) {
			lhs = strdup(sf->arg);
			if (!lhs)
				goto done;
		}
		if (sf->value) {
			rhs = strdup(sf->value);
			if (!rhs)
				goto done;
		}

		/* Leaves carry a comparison or one of the constants. */
		if (strcmp(sf->operation, SECCOMP_FILTER_OP_EQUAL) &&
		    strcmp(sf->operation, SECCOMP_FILTER_OP_NOT_EQUAL) &&
		    strcmp(sf->operation, SECCOMP_FILTER_OP_LESS) &&
		    strcmp(sf->operation, SECCOMP_FILTER_OP_LESS_EQUAL) &&
		    strcmp(sf->operation, SECCOMP_FILTER_OP_GREATER) &&
		    strcmp(sf->operation, SECCOMP_FILTER_OP_GREATER_EQUAL) &&
		    strcmp(sf->operation, SECCOMP_FILTER_OP_ONE) &&
		    strcmp(sf->operation, SECCOMP_FILTER_OP_ZERO))
			assert(0);
		break;

	default:
		assert(0);
		break;
	}

	/* Both sides, the operator, the two parentheses and the NUL. */
	total = strlen(sf->operation) + SECCOMP_STRING_BYTES;
	if (lhs)
		total += strlen(lhs);
	if (rhs)
		total += strlen(rhs);

	result = malloc(total);
	if (!result)
		goto done;

	written = snprintf(result, total, "(%s%s%s)",
			   lhs ? lhs : "", sf->operation, rhs ? rhs : "");

	/* The buffer was sized exactly, so nothing may be left over or cut. */
	assert(written >= 0 && (size_t)written == total - 1);
	(void)written;

done:
	free(lhs);
	free(rhs);
	return result;
}

seccomp_filter_t *__seccomp_filter_alloc(void)
{
	seccomp_filter_t *sf;

	sf = malloc(sizeof(*sf));
	if (!sf)
		return NULL;

	memset(sf, 0, sizeof(*sf));
	return sf;
}

/*
 * Release a filter tree and everything it owns.
 *
 * A NODE frees both sub-trees; a LEAF frees its copied arg and value
 * strings.  Passing NULL is a no-op, mirroring free(), so callers can
 * release partially built trees and failed constructor results without
 * checking first.
 */
void seccomp_filter_free(seccomp_filter_t *sf)
{
	if (!sf)
		return;

	if (sf->type == SECCOMP_FILTER_TYPE_NODE) {
		seccomp_filter_free(sf->left);
		seccomp_filter_free(sf->right);
	} else if (sf->type == SECCOMP_FILTER_TYPE_LEAF) {
		free(sf->arg);
		free(sf->value);
	}
	free(sf);
}

/*
 * Join two filter trees under a new NODE carrying the given operation.
 *
 * Returns NULL, leaving both operands untouched and still owned by the
 * caller, if either operand is NULL or the node cannot be allocated.  On
 * success the new node becomes the parent of both operands.
 */
static seccomp_filter_t *seccomp_combine_filters(const char *operation,
						 seccomp_filter_t *left,
						 seccomp_filter_t *right)
{
	seccomp_filter_t *sf;

	/* The constructors return NULL on failure; do not dereference that. */
	if (!left || !right)
		return NULL;

	sf = __seccomp_filter_alloc();
	if (!sf)
		return NULL;

	sf->type = SECCOMP_FILTER_TYPE_NODE;
	sf->operation = operation;
	sf->left = left;
	sf->right = right;

	left->parent = sf;
	right->parent = sf;

	return sf;
}

/*
 * Build the filter "left && right".
 *
 * Returns the new NODE, or NULL if either operand is NULL or allocation
 * fails; on failure the operands remain the caller's to free.
 */
seccomp_filter_t *seccomp_and_filters(seccomp_filter_t *left, seccomp_filter_t *right)
{
	return seccomp_combine_filters(SECCOMP_FILTER_OP_AND, left, right);
}

/*
 * Build the filter "left || right".
 *
 * Returns the new NODE, or NULL if either operand is NULL or allocation
 * fails; on failure the operands remain the caller's to free.
 */
seccomp_filter_t *seccomp_or_filters(seccomp_filter_t *left, seccomp_filter_t *right)
{
	return seccomp_combine_filters(SECCOMP_FILTER_OP_OR, left, right);
}

/*
 * Compare two optional strings: equal when both are NULL, or when both are
 * present and have identical contents.
 */
static int seccomp_opt_str_same(const char *a, const char *b)
{
	if (!a || !b)
		return !a && !b;
	return strcmp(a, b) == 0;
}

/*
 * Report whether two filter trees are structurally identical.
 *
 * The elements must have the same type and the same operation string.
 * NODEs additionally need matching left and right sub-trees; LEAFs need
 * matching arg and value strings.  Any other type trips an assert.
 *
 * Returns 1 when the trees match and 0 otherwise.
 */
int __seccomp_filter_same(seccomp_filter_t *sf1, seccomp_filter_t *sf2)
{
	if (sf1->type != sf2->type)
		return 0;
	if (strcmp(sf1->operation, sf2->operation) != 0)
		return 0;

	switch (sf1->type) {
	case SECCOMP_FILTER_TYPE_NODE:
		return __seccomp_filter_same(sf1->left, sf2->left) &&
		       __seccomp_filter_same(sf1->right, sf2->right);

	case SECCOMP_FILTER_TYPE_LEAF:
		return seccomp_opt_str_same(sf1->arg, sf2->arg) &&
		       seccomp_opt_str_same(sf1->value, sf2->value);

	default:
		assert(0);
		break;
	}
	return 1;
}

/*
 * Create a LEAF filter "<arg><operation><value>".
 *
 * operation is stored by pointer, not copied.  arg and value are each
 * optional; when given they are duplicated, so the leaf owns its own
 * copies.
 *
 * Returns the new leaf, or NULL if any allocation fails, in which case
 * nothing is leaked.
 */
seccomp_filter_t *__seccomp_arg_value(char *operation, char *arg, char *value)
{
	seccomp_filter_t *leaf = __seccomp_filter_alloc();
	int ok = 1;

	if (!leaf)
		return NULL;

	leaf->type = SECCOMP_FILTER_TYPE_LEAF;
	leaf->operation = operation;

	if (arg) {
		leaf->arg = strdup(arg);
		ok = (leaf->arg != NULL);
	}

	/* Only attempt the second copy if the first did not fail. */
	if (ok && value) {
		leaf->value = strdup(value);
		ok = (leaf->value != NULL);
	}

	if (ok)
		return leaf;

	free(leaf->arg);
	free(leaf->value);
	free(leaf);
	return NULL;
}

/*
 * Create a leaf holding only the constant " 1 " operation, with no arg or
 * value.  Returns NULL if allocation fails.
 */
seccomp_filter_t *seccomp_one(void)
{
	return __seccomp_arg_value(SECCOMP_FILTER_OP_ONE, NULL, NULL);
}

/*
 * Create a leaf holding only the constant " 0 " operation, with no arg or
 * value.  Returns NULL if allocation fails.
 */
seccomp_filter_t *seccomp_zero(void)
{
	return __seccomp_arg_value(SECCOMP_FILTER_OP_ZERO, NULL, NULL);
}

/*
 * Create the leaf filter "arg == value".  Both strings are copied.
 * Returns NULL if allocation fails.
 */
seccomp_filter_t *seccomp_equal_arg_value(char *arg, char *value)
{
	return __seccomp_arg_value(SECCOMP_FILTER_OP_EQUAL, arg, value);
}

/*
 * Create the leaf filter "arg != value".  Both strings are copied.
 * Returns NULL if allocation fails.
 */
seccomp_filter_t *seccomp_not_equal_arg_value(char *arg, char *value)
{
	return __seccomp_arg_value(SECCOMP_FILTER_OP_NOT_EQUAL, arg, value);
}

/*
 * Create the leaf filter "arg < value".  Both strings are copied.
 * Returns NULL if allocation fails.
 */
seccomp_filter_t *seccomp_less_arg_value(char *arg, char *value)
{
	return __seccomp_arg_value(SECCOMP_FILTER_OP_LESS, arg, value);
}

/*
 * Create the leaf filter "arg <= value".  Both strings are copied.
 * Returns NULL if allocation fails.
 */
seccomp_filter_t *seccomp_less_equal_arg_value(char *arg, char *value)
{
	return __seccomp_arg_value(SECCOMP_FILTER_OP_LESS_EQUAL, arg, value);
}

/*
 * Create the leaf filter "arg > value".  Both strings are copied.
 * Returns NULL if allocation fails.
 */
seccomp_filter_t *seccomp_greater_arg_value(char *arg, char *value)
{
	return __seccomp_arg_value(SECCOMP_FILTER_OP_GREATER, arg, value);
}

/*
 * Create the leaf filter "arg >= value".  Both strings are copied.
 * Returns NULL if allocation fails.
 */
seccomp_filter_t *seccomp_greater_equal_arg_value(char *arg, char *value)
{
	return __seccomp_arg_value(SECCOMP_FILTER_OP_GREATER_EQUAL, arg, value);
}

/*
 * Unimplemented stub: both parameters are marked unused, the only logic is
 * the commented-out sketch in the body, and the function always returns 0.
 *
 * NOTE(review): presumably intended to remove the sub-filter `subset` from
 * `tree` once implemented - confirm with the author.  The sketch as written
 * would not compile ("seccompt_filter_t", and -EINVAL without <errno.h>).
 */
int seccomp_filter_unset(seccomp_filter_t *tree __attribute__((unused)),
			 seccomp_filter_t *subset __attribute__((unused)))
{
/*
	seccompt_filter_t *sf;

	if (__seccomp_filter_same(tree, subset)) {
	} else if (tree->type == SECCOMP_FILTER_TYPE_NODE) {
		if (!seccomp_filter_unset(tree->left, subset))
			return 0;
		if (!seccomp_filter_unset(tree->right, subset))
			return 0;
	} else {
		return -EINVAL;
	}
 */
	return 0;
}

[-- Attachment #3: libseccomp.h --]
[-- Type: text/x-chdr, Size: 821 bytes --]

typedef struct seccomp_filter seccomp_filter_t;

char *seccomp_filter_to_string(seccomp_filter_t *sf);
seccomp_filter_t *seccomp_and_filters(seccomp_filter_t *left, seccomp_filter_t *right);
seccomp_filter_t *seccomp_or_filters(seccomp_filter_t *left, seccomp_filter_t *right);
seccomp_filter_t *seccomp_equal_arg_value(char *arg, char *value);
seccomp_filter_t *seccomp_not_equal_arg_value(char *arg, char *value);
seccomp_filter_t *seccomp_less_arg_value(char *arg, char *value);
seccomp_filter_t *seccomp_less_equal_arg_value(char *arg, char *value);
seccomp_filter_t *seccomp_greater_arg_value(char *arg, char *value);
seccomp_filter_t *seccomp_greater_equal_arg_value(char *arg, char *value);
seccomp_filter_t *seccomp_one(void);
seccomp_filter_t *seccomp_zero(void);
void seccomp_filter_free(seccomp_filter_t *sf);

[-- Attachment #4: Makefile --]
[-- Type: text/x-makefile, Size: 281 bytes --]

# Default target: build the library object and the demo program.
all: libseccomp.o test

# Compile the library with warnings treated as errors and debug info.
libseccomp.o: libseccomp.c libseccomp.h Makefile
	gcc -o libseccomp.o -Wall -Wextra -Werror -W -g -c libseccomp.c

# Link the demo program against the library object.
test: test.c libseccomp.h libseccomp.o Makefile
	gcc -o test -Wall -Wextra -Werror -W -g test.c libseccomp.o

# Remove the build outputs.
clean:
	rm -f test libseccomp.o

[-- Attachment #5: test.c --]
[-- Type: text/x-csrc, Size: 829 bytes --]

#include <stdio.h>
#include <stdlib.h>

#include "libseccomp.h"

/*
 * Demo for the filter-building API: builds
 *   ((a0 == 1 && a0 != 0) && (a1 > 100 || a1 <= 1000)) && 1
 * renders it with seccomp_filter_to_string() and prints the result.
 *
 * Returns 0 on success and 1 if any step fails.  Every filter and the
 * rendered string are released on all paths.
 */
int main(void)
{
	seccomp_filter_t *left = NULL, *right = NULL;
	seccomp_filter_t *sf1 = NULL, *sf2 = NULL, *sf = NULL;
	seccomp_filter_t *combined;
	char *string = NULL;
	int ret = 1;

	left = seccomp_equal_arg_value("a0", "1");
	if (!left)
		goto out;

	right = seccomp_not_equal_arg_value("a0", "0");
	if (!right)
		goto out;

	sf1 = seccomp_and_filters(left, right);
	if (!sf1)
		goto out;
	/* sf1 now owns both operands; drop our references. */
	left = NULL;
	right = NULL;

	left = seccomp_greater_arg_value("a1", "100");
	if (!left)
		goto out;

	right = seccomp_less_equal_arg_value("a1", "1000");
	if (!right)
		goto out;

	sf2 = seccomp_or_filters(left, right);
	if (!sf2)
		goto out;
	left = NULL;
	right = NULL;

	sf = seccomp_and_filters(sf1, sf2);
	if (!sf)
		goto out;
	sf1 = NULL;
	sf2 = NULL;

	right = seccomp_one();
	if (!right)
		goto out;

	/*
	 * Keep the old root in sf until the join succeeds, so a failure
	 * does not lose the only pointer to the tree.
	 */
	combined = seccomp_and_filters(sf, right);
	if (!combined)
		goto out;
	sf = combined;
	right = NULL;

	string = seccomp_filter_to_string(sf);
	if (!string)
		goto out;

	printf("%s\n", string);
	ret = 0;

out:
	free(string);
	/* Whatever is still non-NULL here was never handed to a parent. */
	if (left)
		seccomp_filter_free(left);
	if (right)
		seccomp_filter_free(right);
	if (sf1)
		seccomp_filter_free(sf1);
	if (sf2)
		seccomp_filter_free(sf2);
	if (sf)
		seccomp_filter_free(sf);
	return ret;
}

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 5/7] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-05-06 11:53                             ` Steven Rostedt
  2011-05-06 13:35                               ` Eric Paris
@ 2011-05-07  1:58                               ` Will Drewry
  2011-05-12  3:04                                 ` [PATCH 5/5] v2 " Will Drewry
  1 sibling, 1 reply; 406+ messages in thread
From: Will Drewry @ 2011-05-07  1:58 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Frederic Weisbecker, Eric Paris, Ingo Molnar, linux-kernel,
	kees.cook, agl, jmorris, Randy Dunlap, Linus Torvalds,
	Andrew Morton, Tom Zanussi, Arnaldo Carvalho de Melo,
	Peter Zijlstra, Thomas Gleixner

On Fri, May 6, 2011 at 4:53 AM, Steven Rostedt <rostedt@goodmis.org> wrote:
> On Thu, 2011-05-05 at 02:21 -0700, Will Drewry wrote:
>
>> In particular, if the userspace code wants to stage some filters and
>> apply them all at once, when ready, I'm not sure that it makes sense
>> to me to put that complexity in the kernel itself.  For instance,
>> Eric's second sample showed a call that took an array of ints and
>> coalesced them into "fd == %d || ...".  That simple example shows that
>> we could easily get by with a pretty minimal kernel-supported
>> interface as long as the richer behavior could live userspace side --
>> even if just in a simple helper library.  It'd be pretty easy to
>> implement a userspace library that exposed add_filter(syscall_nr,
>> filter) and apply_filters() such that it could manage building the
>> final filter string for a given syscall and pushing it to prctl on
>> apply.
>
> I'm fine with a single kernel call and the "temporary filter" be done in
> userspace. Making the kernel code less complex is better :)
>
>>
>> I think that could also help simplify the primitives.  For instance,
>> if any separate SET called on a system call resulting in an &&
>> operation, then the behavior could be consistent prior to enforcement
>> of the filtering and after.  E.g.,
>>   SET, __NR_read, "fd == 1"
>>   SET, __NR_read, "len < 4097"
>> would result in an evaluated "fd == 1 && len < 4097".  It would do so
>> after a single APPLY call too:
>>   SET, __NR_read, "1"
>>   APPLY
>>   SET, __NR_read, "fd == 1"
>>   SET, __NR_read, "len < 4097"
>> Results in: "1 && fd == 1 && len < 4097", and SET, nr, "0" would
>> nullify the syscall filter in total.
>
> Only that that was not applied? We can't let tasks nullify their
> restrictions once they have been applied. This keeps the kernel code
> simpler.

Ah - so I really need to be more explicit when discussing these
things!  In the "simplification" effort, I was thinking any syscall
with no entry has a "0" rule.  So if you nullify it, it becomes a
complete block and if you can't OR, then you can't add permissions.

>>   It seems like that would be
>> enough to build the SET-SET-...-APPLY, SET-SET-...-SET-APPLY logic
>> into a userspace library so that all temporary unapplied state doesn't
>> have to be explicitly managed by the kernel.
>
> Thus, the SETs are done in the userspace library that does not need to
> interact with the kernel (besides perhaps allocating memory). Then the
> apply would send all the filters to the kernel which would restrict the
> task (or the task on exec) further.

Exactly. Smaller patch and less state per-filter entry (I hope!).


>>
>> While I completely agree with the comment around ease-of-use as being
>> key to security, I also find that the more the state diagram explodes,
>> the harder it is to feel confident that a solution is actually secure.
>>  To try to achieve both objectives, I'd like to limit the kernel
>> interface to the bare minimum of primitives and build any API
>> fanciness into userspace.
>
> Fair enough.
>
>>
>> Does it seem that the tradeoff isn't worth it, or are there some
>> specific behaviors that aren't addressed using that model?
>>
>> While writing that, another option occurred to me that touches on the
>> other proposals but makes the behaviors much more explicit.
>> A prctl prototype could be provided:
>>  prctl(<SET|GET>, <AND|OR>, <syscall_nr>, <filter string>)
>> e.g.,
>>  prctl(PR_SET_SECCOMP_FILTER, PR_SECCOMP_FILTER_OR, __NR_read, "fd == 2");
>>
>> The explicit prctl argument list would allow the filter strings to be
>> self-referential and allow the userspace app to decide what behaviors
>> are allowed and when. If we followed that route, all implicit filters
>> would be "0" and the initial call to get things started might be:
>>    #define SET 33
>>    #define OR 0
>>    #define AND 1
>>    SET, OR, __NR_prctl, "option == 33 && (arg1 == 0 || arg1 == 1)"
>>    prctl(PR_SET_SECCOMP, 2);
>>
>> So now the "locked down" binary can call prctl to set an OR or AND
>> filter for any syscall.  A subsequent call could change that:
>>   SET, OR, __NR_read, "fd == 2"  /* => "0 || fd == 2" */
>>   SET, AND, __NR_prctl, "(arg2 != 63 || arg1 != 0)"  /* __NR_read == 63 */
>>
>> This would OR in a __NR_read filter, then disallow a future call to
>> prctl to OR in more NR_read filters, but for other syscalls ANDing and
>> ORing is still possible until you pass in something like:
>>
>>   SET, AND, __NR_prctl, "arg1 == 1"
>>
>> which would lock down all future prctl calls to only ANDing filters
>> in.  (The numbers in the examples could then be properly managed in a
>> userspace library to ensure platform correctness.)
>
> I don't know about this. It seems to be starting to get too complex, and
> thus error prone. Is there any reason we should allow an OR to the task?
> Why would we want to restrict a task where the task could easily
> unrestrict itself?

No idea!  I can't think of any good examples where you'd want to do
it, just contrived ones.  In general, I think the above approach would
rarely be used since I expect that something like 80% of the places
where this will be used will just be one-time, upfront filter installs
without any surface reduction after the fact.

That said, if there's no reason to support OR after the fact, then the
interface can just _only_ support &&s and leave the installation to
userspace.  It might make the multiple-fd-ORing case less fun in
userspace, but it should work for most cases I think.

>>
>> While this would reduce the primitives a bit further, I'm not sure if
>> this would be the right approach either, but it would open the door to
>> pushing even more down to userspace very explicitly and further
>> removing magic policy logic from the kernel-side.  Is this vaguely
>> interesting or just another layer of confusing-ness?
>
> I'm confused, thus I must have hit that layer ;)

Sounds like it.  I'm always a sucker for self-referential mechanisms.
I've been travelling a bit recently so my code output has been a bit
low, but I'll pull together the most minimal approach that I think
we've been iterating toward and hopefully post something in the not
too distant future.

thanks!

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 5/7] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-05-06 16:30                             ` [PATCH 5/7] " Eric Paris
@ 2011-05-07  2:11                               ` Will Drewry
  0 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-05-07  2:11 UTC (permalink / raw)
  To: Eric Paris
  Cc: Steven Rostedt, Frederic Weisbecker, Ingo Molnar, linux-kernel,
	kees.cook, agl, jmorris, Randy Dunlap, Linus Torvalds,
	Andrew Morton, Tom Zanussi, Arnaldo Carvalho de Melo,
	Peter Zijlstra, Thomas Gleixner

On Fri, May 6, 2011 at 6:35 AM, Eric Paris <eparis@redhat.com> wrote:
> I'm also starting to think a userspace library is a good idea as well.
> Everything in the kernel is an && with nothing but SET and APPLY.  We
> provide interfaces to build the strings as we go along including UNSET
> and stuff like that if there are users....

That sounds ideal to me. I think it'll be worth tagging this config
option as EXPERIMENTAL to begin with so that there's a chance to see
those of us who want it using it more widely and find out if there are
missing pieces or dumb limitations.  Hopefully by starting minimal, it
won't be a huge departure to evolve it if needed.

On Fri, May 6, 2011 at 9:30 AM, Eric Paris <eparis@redhat.com> wrote:

> On Thu, 2011-05-05 at 02:21 -0700, Will Drewry wrote:
>> On Wed, May 4, 2011 at 11:46 AM, Steven Rostedt <rostedt@goodmis.org> wrote:
>
>> In particular, if the userspace code wants to stage some filters and
>> apply them all at once, when ready, I'm not sure that it makes sense
>> to me to put that complexity in the kernel itself.  For instance,
>> Eric's second sample showed a call that took an array of ints and
>> coalesced them into "fd == %d || ...".  That simple example shows that
>> we could easily get by with a pretty minimal kernel-supported
>> interface as long as the richer behavior could live userspace side --
>> even if just in a simple helper library.  It'd be pretty easy to
>> implement a userspace library that exposed add_filter(syscall_nr,
>> filter) and apply_filters() such that it could manage building the
>> final filter string for a given syscall and pushing it to prctl on
>> apply.
>
> Had a few minutes this morning so I started writing something that looks
> sorta like a userspace library.  It dosn't have unset yet but I don't
> think it'll be that hard for me to implement.  You can build some pretty
> complex filters easily.  Not sure when I'll get to work on it more, but
> it at least shows the idea of doing the complex work in a library and
> keeping a simple kernel interface....  Any thoughts?

Cool! I was thrown a little bit by spelling out the operations (eq,
ne, ..), but I can see how it might be useful for being able to
synthesize good filters before passing them off to the kernel.

It sounds like we might be moving towards a good starting point.  I'll
start playing with the interface some (and start on the kernel support
too :).

thanks!

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-04-28 19:07                 ` Steven Rostedt
  2011-05-12  3:02                     ` Will Drewry
@ 2011-05-12  3:02                     ` Will Drewry
  0 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-05-12  3:02 UTC (permalink / raw)
  To: linux-kernel
  Cc: Steven Rostedt, Frederic Weisbecker, Eric Paris, Ingo Molnar,
	kees.cook, agl, jmorris, Peter Zijlstra, Serge E. Hallyn,
	Ingo Molnar, Andrew Morton, Tejun Heo, Michal Marek,
	Oleg Nesterov, Roland McGrath, Jiri Slaby, David Howells,
	Russell King, Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, Thomas Gleixner, H. Peter Anvin,
	x86, Peter Zijlstra, linux-arm-kernel, microblaze-uclinux,
	linux-mips, linuxppc-dev, linux-s390, linux-sh, sparclinux,
	Will Drewry

This change adds a new seccomp mode based on the work by
agl@chromium.org in [1]. This new mode, "filter mode", provides a hash
table of seccomp_filter objects.  When in the new mode (2), all system
calls are checked against the filters - first by system call number,
then by a filter string.  If an entry exists for a given system call and
all filter predicates evaluate to true, then the task may proceed.
Otherwise, the task is killed (as per seccomp_mode == 1).

Filter string parsing and evaluation is handled by the ftrace filter
engine.  Related patches tweak the perf filter trace and free paths to
allow the calls to be shared. Filters inherit their understanding of types and
arguments for each system call from the CONFIG_FTRACE_SYSCALLS subsystem
which already predefines this information in syscall_metadata associated
enter_event (and exit_event) structures. If CONFIG_FTRACE and
CONFIG_FTRACE_SYSCALLS are not compiled in, only "1" filter strings will
be allowed.

The net result is a process may have its system calls filtered using the
ftrace filter engine's inherent understanding of system calls. A
logical ruleset for a process that only needs stdin/stdout may be:
  sys_read: fd == 0
  sys_write: fd == 1 || fd == 2
  sys_exit: 1

The set of filters is specified through the PR_SET_SECCOMP path in prctl().
For example:
  prctl(PR_SET_SECCOMP_FILTER, __NR_read, "fd == 0");
  prctl(PR_SET_SECCOMP_FILTER, __NR_write, "fd == 1 || fd == 2");
  prctl(PR_SET_SECCOMP_FILTER, __NR_exit, "1");
  prctl(PR_SET_SECCOMP, 2, 0);

v2: - changed to use the existing syscall number ABI.
    - prctl changes to minimize parsing in the kernel:
      prctl(PR_SET_SECCOMP, {0 | 1 | 2 }, { 0 | ON_EXEC });
      prctl(PR_SET_SECCOMP_FILTER, __NR_read, "fd == 5");
      prctl(PR_CLEAR_SECCOMP_FILTER, __NR_read);
      prctl(PR_GET_SECCOMP_FILTER, __NR_read, buf, bufsize);
    - defined PR_SECCOMP_MODE_STRICT and ..._FILTER
    - added flags
    - provide a default fail syscall_nr_to_meta in ftrace
    - provides fallback for unhooked system calls
    - use -ENOSYS and ERR_PTR(-ENOSYS) for stubbed functionality
    - added kernel/seccomp.h to share seccomp.c/seccomp_filter.c
    - moved to a hlist and 4 bit hash of linked lists
    - added support to operate without CONFIG_FTRACE_SYSCALLS
    - moved Kconfig support next to SECCOMP
      (should this be done in per-platform patches?)
    - made Kconfig entries dependent on EXPERIMENTAL
    - added macros to avoid ifdefs from kernel/fork.c
    - added compat task/filter matching
    - drop seccomp.h inclusion in sched.h and drop seccomp_t
    - added Filtering to "show" output
    - added on_exec state dup'ing when enabling after a fast-path accept.

Signed-off-by: Will Drewry <wad@chromium.org>
---
 arch/arm/Kconfig        |   10 +
 arch/microblaze/Kconfig |   10 +
 arch/mips/Kconfig       |   10 +
 arch/powerpc/Kconfig    |   10 +
 arch/s390/Kconfig       |   10 +
 arch/sh/Kconfig         |   10 +
 arch/sparc/Kconfig      |   10 +
 arch/x86/Kconfig        |   10 +
 include/linux/prctl.h   |    9 +
 include/linux/sched.h   |    5 +-
 include/linux/seccomp.h |  116 +++++++++-
 include/trace/syscall.h |    7 +
 kernel/Makefile         |    3 +
 kernel/fork.c           |    3 +
 kernel/seccomp.c        |  228 +++++++++++++++++-
 kernel/seccomp.h        |   74 ++++++
 kernel/seccomp_filter.c |  581 +++++++++++++++++++++++++++++++++++++++++++++++
 kernel/sys.c            |   15 +-
 18 files changed, 1100 insertions(+), 21 deletions(-)
 create mode 100644 kernel/seccomp.h
 create mode 100644 kernel/seccomp_filter.c

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 377a7a5..22e1668 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1664,6 +1664,16 @@ config SECCOMP
 	  and the task is only allowed to execute a few safe syscalls
 	  defined by each seccomp mode.
 
+config SECCOMP_FILTER
+	bool "Enable seccomp-based system call filtering"
+	depends on SECCOMP && EXPERIMENTAL
+	help
+	  Per-process, inherited system call filtering using shared code
+	  across seccomp and ftrace_syscalls.  If CONFIG_FTRACE_SYSCALLS
+	  is not available, enhanced filters will not be available.
+
+	  See Documentation/prctl/seccomp_filter.txt for more detail.
+
 config CC_STACKPROTECTOR
 	bool "Enable -fstack-protector buffer overflow detection (EXPERIMENTAL)"
 	depends on EXPERIMENTAL
diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig
index eccdefe..7641ee9 100644
--- a/arch/microblaze/Kconfig
+++ b/arch/microblaze/Kconfig
@@ -129,6 +129,16 @@ config SECCOMP
 
 	  If unsure, say Y. Only embedded should say N here.
 
+config SECCOMP_FILTER
+	bool "Enable seccomp-based system call filtering"
+	depends on SECCOMP && EXPERIMENTAL
+	help
+	  Per-process, inherited system call filtering using shared code
+	  across seccomp and ftrace_syscalls.  If CONFIG_FTRACE_SYSCALLS
+	  is not available, enhanced filters will not be available.
+
+	  See Documentation/prctl/seccomp_filter.txt for more detail.
+
 endmenu
 
 menu "Advanced setup"
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 8e256cc..fe4cbda 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -2245,6 +2245,16 @@ config SECCOMP
 
 	  If unsure, say Y. Only embedded should say N here.
 
+config SECCOMP_FILTER
+	bool "Enable seccomp-based system call filtering"
+	depends on SECCOMP && EXPERIMENTAL
+	help
+	  Per-process, inherited system call filtering using shared code
+	  across seccomp and ftrace_syscalls.  If CONFIG_FTRACE_SYSCALLS
+	  is not available, enhanced filters will not be available.
+
+	  See Documentation/prctl/seccomp_filter.txt for more detail.
+
 config USE_OF
 	bool "Flattened Device Tree support"
 	select OF
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 8f4d50b..83499e4 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -605,6 +605,16 @@ config SECCOMP
 
 	  If unsure, say Y. Only embedded should say N here.
 
+config SECCOMP_FILTER
+	bool "Enable seccomp-based system call filtering"
+	depends on SECCOMP && EXPERIMENTAL
+	help
+	  Per-process, inherited system call filtering using shared code
+	  across seccomp and ftrace_syscalls.  If CONFIG_FTRACE_SYSCALLS
+	  is not available, enhanced filters will not be available.
+
+	  See Documentation/prctl/seccomp_filter.txt for more detail.
+
 endmenu
 
 config ISA_DMA_API
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 2508a6f..2777515 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -614,6 +614,16 @@ config SECCOMP
 
 	  If unsure, say Y.
 
+config SECCOMP_FILTER
+	bool "Enable seccomp-based system call filtering"
+	depends on SECCOMP && EXPERIMENTAL
+	help
+	  Per-process, inherited system call filtering using shared code
+	  across seccomp and ftrace_syscalls.  If CONFIG_FTRACE_SYSCALLS
+	  is not available, enhanced filters will not be available.
+
+	  See Documentation/prctl/seccomp_filter.txt for more detail.
+
 endmenu
 
 menu "Power Management"
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 4b89da2..00c1521 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -676,6 +676,16 @@ config SECCOMP
 
 	  If unsure, say N.
 
+config SECCOMP_FILTER
+	bool "Enable seccomp-based system call filtering"
+	depends on SECCOMP && EXPERIMENTAL
+	help
+	  Per-process, inherited system call filtering using shared code
+	  across seccomp and ftrace_syscalls.  If CONFIG_FTRACE_SYSCALLS
+	  is not available, enhanced filters will not be available.
+
+	  See Documentation/prctl/seccomp_filter.txt for more detail.
+
 config SMP
 	bool "Symmetric multi-processing support"
 	depends on SYS_SUPPORTS_SMP
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index e560d10..5b42255 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -270,6 +270,16 @@ config SECCOMP
 
 	  If unsure, say Y. Only embedded should say N here.
 
+config SECCOMP_FILTER
+	bool "Enable seccomp-based system call filtering"
+	depends on SECCOMP && EXPERIMENTAL
+	help
+	  Per-process, inherited system call filtering using shared code
+	  across seccomp and ftrace_syscalls.  If CONFIG_FTRACE_SYSCALLS
+	  is not available, enhanced filters will not be available.
+
+	  See Documentation/prctl/seccomp_filter.txt for more detail.
+
 config HOTPLUG_CPU
 	bool "Support for hot-pluggable CPUs"
 	depends on SPARC64 && SMP
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index cc6c53a..d6d44d9 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1485,6 +1485,16 @@ config SECCOMP
 
 	  If unsure, say Y. Only embedded should say N here.
 
+config SECCOMP_FILTER
+	bool "Enable seccomp-based system call filtering"
+	depends on SECCOMP && EXPERIMENTAL
+	help
+	  Per-process, inherited system call filtering using shared code
+	  across seccomp and ftrace_syscalls.  If CONFIG_FTRACE_SYSCALLS
+	  is not available, enhanced filters will not be available.
+
+	  See Documentation/prctl/seccomp_filter.txt for more detail.
+
 config CC_STACKPROTECTOR
 	bool "Enable -fstack-protector buffer overflow detection (EXPERIMENTAL)"
 	---help---
diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index a3baeb2..379b391 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -63,6 +63,15 @@
 /* Get/set process seccomp mode */
 #define PR_GET_SECCOMP	21
 #define PR_SET_SECCOMP	22
+# define PR_SECCOMP_MODE_NONE	0
+# define PR_SECCOMP_MODE_STRICT	1
+# define PR_SECCOMP_MODE_FILTER	2
+# define PR_SECCOMP_FLAG_FILTER_ON_EXEC	(1 << 1)
+
+/* Get/set process seccomp filters */
+#define PR_GET_SECCOMP_FILTER	35
+#define PR_SET_SECCOMP_FILTER	36
+#define PR_CLEAR_SECCOMP_FILTER	37
 
 /* Get/set the capability bounding set (as per security/commoncap.c) */
 #define PR_CAPBSET_READ 23
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 18d63ce..27eacf9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -77,7 +77,6 @@ struct sched_param {
 #include <linux/percpu.h>
 #include <linux/topology.h>
 #include <linux/proportions.h>
-#include <linux/seccomp.h>
 #include <linux/rcupdate.h>
 #include <linux/rculist.h>
 #include <linux/rtmutex.h>
@@ -1190,6 +1189,8 @@ enum perf_event_task_context {
 	perf_nr_task_contexts,
 };
 
+struct seccomp_state;
+
 struct task_struct {
 	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
 	void *stack;
@@ -1374,7 +1375,7 @@ struct task_struct {
 	uid_t loginuid;
 	unsigned int sessionid;
 #endif
-	seccomp_t seccomp;
+	struct seccomp_state *seccomp;
 
 /* Thread group tracking */
    	u32 parent_exec_id;
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index 167c333..289c836 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -2,12 +2,34 @@
 #define _LINUX_SECCOMP_H
 
 
+/* Forward declare for proc interface */
+struct seq_file;
+
 #ifdef CONFIG_SECCOMP
 
+#include <linux/errno.h>
+#include <linux/list.h>
 #include <linux/thread_info.h>
+#include <linux/types.h>
 #include <asm/seccomp.h>
 
-typedef struct { int mode; } seccomp_t;
+/**
+ * struct seccomp_state - the state of a seccomp'ed process
+ *
+ * @mode:
+ *     if this is 1, the process is under standard seccomp rules
+ *             is 2, the process is only allowed to make system calls where
+ *                   associated filters evaluate successfully.
+ * @usage: number of references to the current instance.
+ * @flags: a bitmask of behavior altering flags.
+ * @filters: Hash table of filters if using CONFIG_SECCOMP_FILTER.
+ */
+struct seccomp_state {
+	uint16_t mode;
+	atomic_t usage;
+	long flags;
+	struct seccomp_filter_table *filters;
+};
 
 extern void __secure_computing(int);
 static inline void secure_computing(int this_syscall)
@@ -16,27 +38,113 @@ static inline void secure_computing(int this_syscall)
 		__secure_computing(this_syscall);
 }
 
+extern struct seccomp_state *seccomp_state_new(void);
+extern struct seccomp_state *seccomp_state_dup(const struct seccomp_state *);
+extern struct seccomp_state *get_seccomp_state(struct seccomp_state *);
+extern void put_seccomp_state(struct seccomp_state *);
+
+extern long prctl_set_seccomp(unsigned long, unsigned long);
 extern long prctl_get_seccomp(void);
-extern long prctl_set_seccomp(unsigned long);
+
+extern long prctl_set_seccomp_filter(unsigned long, char __user *);
+extern long prctl_get_seccomp_filter(unsigned long, char __user *,
+				     unsigned long);
+extern long prctl_clear_seccomp_filter(unsigned long);
+
+#define inherit_tsk_seccomp_state(_child, _orig) \
+	_child->seccomp = get_seccomp_state(_orig->seccomp);
+#define put_tsk_seccomp_state(_tsk) put_seccomp_state(_tsk->seccomp)
 
 #else /* CONFIG_SECCOMP */
 
 #include <linux/errno.h>
 
-typedef struct { } seccomp_t;
+struct seccomp_state { };
 
 #define secure_computing(x) do { } while (0)
+#define inherit_tsk_seccomp_state(_child, _orig) do { } while (0)
+#define put_tsk_seccomp_state(_tsk) do { } while (0)
 
 static inline long prctl_get_seccomp(void)
 {
 	return -EINVAL;
 }
 
-static inline long prctl_set_seccomp(unsigned long arg2)
+static inline long prctl_set_seccomp(unsigned long a2, unsigned long a3)
 {
 	return -EINVAL;
 }
 
+static inline long prctl_set_seccomp_filter(unsigned long a2, char __user *a3)
+{
+	return -ENOSYS;
+}
+
+static inline long prctl_clear_seccomp_filter(unsigned long a2)
+{
+	return -ENOSYS;
+}
+
+static inline long prctl_get_seccomp_filter(unsigned long a2, char __user *a3,
+					    unsigned long a4)
+{
+	return -ENOSYS;
+}
+
+static inline struct seccomp_state *seccomp_state_new(void)
+{
+	return NULL;
+}
+
+static inline struct seccomp_state *seccomp_state_dup(
+					const struct seccomp_state *state)
+{
+	return NULL;
+}
+
+static inline struct seccomp_state *get_seccomp_state(
+					struct seccomp_state *state)
+{
+	return NULL;
+}
+
+static inline void put_seccomp_state(struct seccomp_state *state)
+{
+}
+
 #endif /* CONFIG_SECCOMP */
 
+#ifdef CONFIG_SECCOMP_FILTER
+
+extern int seccomp_show_filters(struct seccomp_state *, struct seq_file *);
+extern long seccomp_set_filter(int, char *);
+extern long seccomp_clear_filter(int);
+extern long seccomp_get_filter(int, char *, unsigned long);
+
+#else  /* CONFIG_SECCOMP_FILTER */
+
+static inline int seccomp_show_filters(struct seccomp_state *state,
+				       struct seq_file *m)
+{
+	return -ENOSYS;
+}
+
+static inline long seccomp_set_filter(int syscall_nr, char *filter)
+{
+	return -ENOSYS;
+}
+
+static inline long seccomp_clear_filter(int syscall_nr)
+{
+	return -ENOSYS;
+}
+
+static inline long seccomp_get_filter(int syscall_nr,
+				      char *buf, unsigned long available)
+{
+	return -ENOSYS;
+}
+
+#endif  /* CONFIG_SECCOMP_FILTER */
+
 #endif /* _LINUX_SECCOMP_H */
diff --git a/include/trace/syscall.h b/include/trace/syscall.h
index 242ae04..e061ad0 100644
--- a/include/trace/syscall.h
+++ b/include/trace/syscall.h
@@ -35,6 +35,8 @@ struct syscall_metadata {
 extern unsigned long arch_syscall_addr(int nr);
 extern int init_syscall_trace(struct ftrace_event_call *call);
 
+extern struct syscall_metadata *syscall_nr_to_meta(int);
+
 extern int reg_event_syscall_enter(struct ftrace_event_call *call);
 extern void unreg_event_syscall_enter(struct ftrace_event_call *call);
 extern int reg_event_syscall_exit(struct ftrace_event_call *call);
@@ -49,6 +51,11 @@ enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags,
 				      struct trace_event *event);
 enum print_line_t print_syscall_exit(struct trace_iterator *iter, int flags,
 				     struct trace_event *event);
+#else
+static inline struct syscall_metadata *syscall_nr_to_meta(int nr)
+{
+	return NULL;
+}
 #endif
 
 #ifdef CONFIG_PERF_EVENTS
diff --git a/kernel/Makefile b/kernel/Makefile
index 85cbfb3..84e7dfb 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -81,6 +81,9 @@ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
 obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
+ifeq ($(CONFIG_SECCOMP_FILTER),y)
+obj-$(CONFIG_SECCOMP) += seccomp_filter.o
+endif
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
 obj-$(CONFIG_TREE_RCU) += rcutree.o
 obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
diff --git a/kernel/fork.c b/kernel/fork.c
index e7548de..46987d4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -34,6 +34,7 @@
 #include <linux/cgroup.h>
 #include <linux/security.h>
 #include <linux/hugetlb.h>
+#include <linux/seccomp.h>
 #include <linux/swap.h>
 #include <linux/syscalls.h>
 #include <linux/jiffies.h>
@@ -169,6 +170,7 @@ void free_task(struct task_struct *tsk)
 	free_thread_info(tsk->stack);
 	rt_mutex_debug_task_free(tsk);
 	ftrace_graph_exit_task(tsk);
+	put_tsk_seccomp_state(tsk);
 	free_task_struct(tsk);
 }
 EXPORT_SYMBOL(free_task);
@@ -280,6 +282,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 	if (err)
 		goto out;
 
+	inherit_tsk_seccomp_state(tsk, orig);
 	setup_thread_stack(tsk, orig);
 	clear_user_return_notifier(tsk);
 	clear_tsk_need_resched(tsk);
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 57d4b13..502ba04 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -6,12 +6,15 @@
  * This defines a simple but solid secure-computing mode.
  */
 
+#include <linux/err.h>
+#include <linux/prctl.h>
 #include <linux/seccomp.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/compat.h>
+#include <linux/unistd.h>
 
-/* #define SECCOMP_DEBUG 1 */
-#define NR_SECCOMP_MODES 1
+#include "seccomp.h"
 
 /*
  * Secure computing mode 1 allows only read/write/exit/sigreturn.
@@ -32,11 +35,13 @@ static int mode1_syscalls_32[] = {
 
 void __secure_computing(int this_syscall)
 {
-	int mode = current->seccomp.mode;
+	int mode = -1;
+	long ret = 0;
 	int * syscall;
-
+	if (current->seccomp)
+		mode = current->seccomp->mode;
 	switch (mode) {
-	case 1:
+	case PR_SECCOMP_MODE_STRICT:
 		syscall = mode1_syscalls;
 #ifdef CONFIG_COMPAT
 		if (is_compat_task())
@@ -47,6 +52,20 @@ void __secure_computing(int this_syscall)
 				return;
 		} while (*++syscall);
 		break;
+	case PR_SECCOMP_MODE_FILTER:
+		if (this_syscall >= NR_syscalls || this_syscall < 0)
+			break;
+		ret = seccomp_test_filters(current->seccomp, this_syscall);
+		if (!ret)
+			return;
+		/* Only check for an override if an access failure occurred. */
+		if (ret != -EACCES)
+			break;
+		ret = seccomp_maybe_apply_filters(current, this_syscall);
+		if (!ret)
+			return;
+		seccomp_filter_log_failure(this_syscall);
+		break;
 	default:
 		BUG();
 	}
@@ -57,30 +76,213 @@ void __secure_computing(int this_syscall)
 	do_exit(SIGKILL);
 }
 
+/* seccomp_state_new - allocate a new state object. */
+struct seccomp_state *seccomp_state_new()
+{
+	struct seccomp_state *new = kzalloc(sizeof(struct seccomp_state),
+					    GFP_KERNEL);
+	if (!new)
+		return NULL;
+
+	new->flags = 0;
+#ifdef CONFIG_COMPAT
+	/* Annotate if this filterset is being created by a compat task. */
+	if (is_compat_task())
+		new->flags |= SECCOMP_FLAG_COMPAT;
+#endif
+
+	atomic_set(&new->usage, 1);
+	new->filters = seccomp_filter_table_new();
+	/* Not supported errors are fine, others are a problem. */
+	if (IS_ERR(new->filters) && PTR_ERR(new->filters) != -ENOSYS) {
+		kfree(new);
+		new = NULL;
+	}
+	return new;
+}
+
+/* seccomp_state_dup - copies an existing state object. */
+struct seccomp_state *seccomp_state_dup(const struct seccomp_state *orig)
+{
+	int err;
+	struct seccomp_state *new_state = seccomp_state_new();
+
+	err = -ENOMEM;
+	if (!new_state)
+		goto fail;
+	new_state->mode = orig->mode;
+	/* Copying the flags hides whether the new process is itself a compat
+	 * task.  However, if the rule was compat/non-compat and the process is
+	 * the opposite, enforcement will terminate it.
+	 */
+	new_state->flags = orig->flags;
+	err = seccomp_copy_all_filters(new_state->filters,
+				       orig->filters);
+	if (err)
+		goto fail;
+
+	return new_state;
+fail:
+	put_seccomp_state(new_state);
+	return NULL;
+}
+
+/* get_seccomp_state - increments the reference count of @orig */
+struct seccomp_state *get_seccomp_state(struct seccomp_state *orig)
+{
+	if (!orig)
+		return NULL;
+	atomic_inc(&orig->usage);
+	return orig;
+}
+
+static void __put_seccomp_state(struct seccomp_state *orig)
+{
+	WARN_ON(atomic_read(&orig->usage));
+	seccomp_drop_all_filters(orig);
+	kfree(orig);
+}
+
+/* put_seccomp_state - decrements the reference count of @orig and may free. */
+void put_seccomp_state(struct seccomp_state *orig)
+{
+	if (!orig)
+		return;
+
+	if (atomic_dec_and_test(&orig->usage))
+		__put_seccomp_state(orig);
+}
+
 long prctl_get_seccomp(void)
 {
-	return current->seccomp.mode;
+	if (!current->seccomp)
+		return 0;
+	return current->seccomp->mode;
 }
 
-long prctl_set_seccomp(unsigned long seccomp_mode)
+long prctl_set_seccomp(unsigned long seccomp_mode, unsigned long flags)
 {
 	long ret;
+	struct seccomp_state *state, *cur_state;
 
+	cur_state = get_seccomp_state(current->seccomp);
 	/* can set it only once to be even more secure */
 	ret = -EPERM;
-	if (unlikely(current->seccomp.mode))
+	if (cur_state && unlikely(cur_state->mode))
 		goto out;
 
 	ret = -EINVAL;
-	if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) {
-		current->seccomp.mode = seccomp_mode;
-		set_thread_flag(TIF_SECCOMP);
+	if (seccomp_mode <= 0 || seccomp_mode > NR_SECCOMP_MODES)
+		goto out;
+
+	ret = -ENOMEM;
+	state = (cur_state ? seccomp_state_dup(cur_state) :
+			     seccomp_state_new());
+	if (!state)
+		goto out;
+
+	if (seccomp_mode == PR_SECCOMP_MODE_STRICT) {
 #ifdef TIF_NOTSC
 		disable_TSC();
 #endif
-		ret = 0;
 	}
 
- out:
+	rcu_assign_pointer(current->seccomp, state);
+	synchronize_rcu();
+	put_seccomp_state(cur_state);	/* For the task */
+
+	/* Convert the ABI flag to the internal flag value. */
+	if (seccomp_mode == PR_SECCOMP_MODE_FILTER &&
+	    (flags & PR_SECCOMP_FLAG_FILTER_ON_EXEC))
+		state->flags |= SECCOMP_FLAG_ON_EXEC;
+	/* Encourage flag values to stay synchronized explicitly. */
+	BUILD_BUG_ON(PR_SECCOMP_FLAG_FILTER_ON_EXEC != SECCOMP_FLAG_ON_EXEC);
+
+	/* Only set the thread flag once after the new state is in place. */
+	state->mode = seccomp_mode;
+	set_thread_flag(TIF_SECCOMP);
+	ret = 0;
+
+out:
+	put_seccomp_state(cur_state);	/* for the get */
+	return ret;
+}
+
+long prctl_set_seccomp_filter(unsigned long syscall_nr,
+			      char __user *user_filter)
+{
+	int nr;
+	long ret;
+	char filter[SECCOMP_MAX_FILTER_LENGTH];
+
+	ret = -EINVAL;
+	if (syscall_nr >= NR_syscalls || syscall_nr < 0)
+		goto out;
+
+	ret = -EFAULT;
+	if (!user_filter ||
+	    strncpy_from_user(filter, user_filter,
+			      sizeof(filter) - 1) < 0)
+		goto out;
+
+	nr = (int) syscall_nr;
+	ret = seccomp_set_filter(nr, filter);
+
+out:
+	return ret;
+}
+
+long prctl_clear_seccomp_filter(unsigned long syscall_nr)
+{
+	int nr = -1;
+	long ret;
+
+	ret = -EINVAL;
+	if (syscall_nr >= NR_syscalls || syscall_nr < 0)
+		goto out;
+
+	nr = (int) syscall_nr;
+	ret = seccomp_clear_filter(nr);
+
+out:
+	return ret;
+}
+
+long prctl_get_seccomp_filter(unsigned long syscall_nr, char __user *dst,
+			      unsigned long available)
+{
+	int ret, nr;
+	unsigned long copied;
+	char *buf = NULL;
+	ret = -EINVAL;
+	if (!available)
+		goto out;
+	/* Ignore extra buffer space. */
+	if (available > SECCOMP_MAX_FILTER_LENGTH)
+		available = SECCOMP_MAX_FILTER_LENGTH;
+
+	ret = -EINVAL;
+	if (syscall_nr >= NR_syscalls || syscall_nr < 0)
+		goto out;
+	nr = (int) syscall_nr;
+
+	ret = -ENOMEM;
+	buf = kmalloc(available, GFP_KERNEL);
+	if (!buf)
+		goto out;
+
+	ret = seccomp_get_filter(nr, buf, available);
+	if (ret < 0)
+		goto out;
+
+	/* Include the NUL byte in the copy. */
+	copied = copy_to_user(dst, buf, ret + 1);
+	ret = -ENOSPC;
+	if (copied)
+		goto out;
+
+	ret = 0;
+out:
+	kfree(buf);
 	return ret;
 }
diff --git a/kernel/seccomp.h b/kernel/seccomp.h
new file mode 100644
index 0000000..5abd219
--- /dev/null
+++ b/kernel/seccomp.h
@@ -0,0 +1,74 @@
+/*
+ * seccomp/seccomp_filter shared internal prototypes and state.
+ *
+ * Copyright (C) 2011 Chromium OS Authors.
+ */
+
+#ifndef __KERNEL_SECCOMP_H
+#define __KERNEL_SECCOMP_H
+
+#include <linux/ftrace_event.h>
+#include <linux/seccomp.h>
+
+/* #define SECCOMP_DEBUG 1 */
+#define NR_SECCOMP_MODES 2
+
+/* Inherit the max filter length from the filtering engine. */
+#define SECCOMP_MAX_FILTER_LENGTH MAX_FILTER_STR_VAL
+
+/* Presently, flags only affect SECCOMP_FILTER. */
+#define _SECCOMP_FLAG_COMPAT	0
+#define _SECCOMP_FLAG_ON_EXEC	1
+
+#define SECCOMP_FLAG_COMPAT	(1 << (_SECCOMP_FLAG_COMPAT))
+#define SECCOMP_FLAG_ON_EXEC	(1 << (_SECCOMP_FLAG_ON_EXEC))
+
+
+#ifdef CONFIG_SECCOMP_FILTER
+
+#define SECCOMP_FILTER_HASH_BITS 4
+#define SECCOMP_FILTER_HASH_SIZE (1 << SECCOMP_FILTER_HASH_BITS)
+
+struct seccomp_filter_table;
+extern struct seccomp_filter_table *seccomp_filter_table_new(void);
+extern int seccomp_copy_all_filters(struct seccomp_filter_table *,
+				    const struct seccomp_filter_table *);
+extern void seccomp_drop_all_filters(struct seccomp_state *);
+
+extern int seccomp_test_filters(struct seccomp_state *, int);
+extern int seccomp_maybe_apply_filters(struct task_struct *, int);
+extern void seccomp_filter_log_failure(int);
+
+#else /* CONFIG_SECCOMP_FILTER */
+
+static inline void seccomp_filter_log_failure(int syscall)
+{
+}
+
+static inline int seccomp_maybe_apply_filters(struct task_struct *tsk,
+					      int syscall_nr)
+{
+	return -ENOSYS;
+}
+
+static inline struct seccomp_filter_table *seccomp_filter_table_new(void)
+{
+	return ERR_PTR(-ENOSYS);
+}
+
+static inline int seccomp_test_filters(struct seccomp_state *state, int nr)
+{
+	return -ENOSYS;
+}
+
+extern inline int seccomp_copy_all_filters(struct seccomp_filter_table *dst,
+					const struct seccomp_filter_table *src)
+{
+	return 0;
+}
+
+static inline void seccomp_drop_all_filters(struct seccomp_state *state) { }
+
+#endif /* CONFIG_SECCOMP_FILTER */
+
+#endif /* __KERNEL_SECCOMP_H */
diff --git a/kernel/seccomp_filter.c b/kernel/seccomp_filter.c
new file mode 100644
index 0000000..ff4e055
--- /dev/null
+++ b/kernel/seccomp_filter.c
@@ -0,0 +1,581 @@
+/* filter engine-based seccomp system call filtering.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) 2011 The Chromium OS Authors <chromium-os-dev@chromium.org>
+ */
+
+#include <linux/compat.h>
+#include <linux/errno.h>
+#include <linux/hash.h>
+#include <linux/prctl.h>
+#include <linux/seccomp.h>
+#include <linux/seq_file.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+#include <asm/syscall.h>
+#include <trace/syscall.h>
+
+#include "seccomp.h"
+
+#define SECCOMP_MAX_FILTER_COUNT 512
+#define SECCOMP_WILDCARD_FILTER "1"
+
+struct seccomp_filter {
+	struct hlist_node node;
+	int syscall_nr;
+	struct syscall_metadata *data;
+	struct event_filter *event_filter;
+};
+
+struct seccomp_filter_table {
+	struct hlist_head heads[SECCOMP_FILTER_HASH_SIZE];
+	int count;
+};
+
+struct seccomp_filter_table *seccomp_filter_table_new(void)
+{
+	struct seccomp_filter_table *t =
+		kzalloc(sizeof(struct seccomp_filter_table), GFP_KERNEL);
+	if (!t)
+		return ERR_PTR(-ENOMEM);
+	return t;
+}
+
+static inline u32 syscall_hash(int syscall_nr)
+{
+	return hash_32(syscall_nr, SECCOMP_FILTER_HASH_BITS);
+}
+
+static const char *get_filter_string(struct seccomp_filter *f)
+{
+	const char *str = SECCOMP_WILDCARD_FILTER;
+	if (!f)
+		return NULL;
+
+	/* Missing event filters qualify as wildcard matches. */
+	if (!f->event_filter)
+		return str;
+
+#ifdef CONFIG_FTRACE_SYSCALLS
+	str = ftrace_get_filter_string(f->event_filter);
+#endif
+	return str;
+}
+
+static struct seccomp_filter *alloc_seccomp_filter(int syscall_nr,
+						   const char *filter_string)
+{
+	int err = -ENOMEM;
+	struct seccomp_filter *filter = kzalloc(sizeof(struct seccomp_filter),
+						GFP_KERNEL);
+	if (!filter)
+		goto fail;
+
+	INIT_HLIST_NODE(&filter->node);
+	filter->syscall_nr = syscall_nr;
+	filter->data = syscall_nr_to_meta(syscall_nr);
+
+	/* Treat a filter of SECCOMP_WILDCARD_FILTER as a wildcard and skip
+	 * using a predicate at all.
+	 */
+	if (!strcmp(SECCOMP_WILDCARD_FILTER, filter_string))
+		goto out;
+
+	/* Argument-based filtering only works on ftrace-hooked syscalls. */
+	if (!filter->data) {
+		err = -ENOSYS;
+		goto fail;
+	}
+
+#ifdef CONFIG_FTRACE_SYSCALLS
+	err = ftrace_parse_filter(&filter->event_filter,
+				  filter->data->enter_event->event.type,
+				  filter_string);
+	if (err)
+		goto fail;
+#endif
+
+out:
+	return filter;
+
+fail:
+	kfree(filter);
+	return ERR_PTR(err);
+}
+
+static void free_seccomp_filter(struct seccomp_filter *filter)
+{
+#ifdef CONFIG_FTRACE_SYSCALLS
+	ftrace_free_filter(filter->event_filter);
+#endif
+	kfree(filter);
+}
+
+static struct seccomp_filter *copy_seccomp_filter(struct seccomp_filter *orig)
+{
+	return alloc_seccomp_filter(orig->syscall_nr, get_filter_string(orig));
+}
+
+/* Returns the matching filter or NULL */
+static struct seccomp_filter *find_filter(struct seccomp_state *state,
+					  int syscall)
+{
+	struct hlist_node *this, *pos;
+	struct seccomp_filter *filter = NULL;
+
+	u32 head = syscall_hash(syscall);
+	if (head >= SECCOMP_FILTER_HASH_SIZE)
+		goto out;
+
+	hlist_for_each_safe(this, pos, &state->filters->heads[head]) {
+		filter = hlist_entry(this, struct seccomp_filter, node);
+		if (filter->syscall_nr == syscall)
+			goto out;
+	}
+
+	filter = NULL;
+
+out:
+	return filter;
+}
+
+/* Safely drops all filters for a given syscall. This should only be called
+ * on unattached seccomp_state objects.
+ */
+static void drop_filter(struct seccomp_state *state, int syscall_nr)
+{
+	struct seccomp_filter *filter = find_filter(state, syscall_nr);
+	if (!filter)
+		return;
+
+	WARN_ON(state->filters->count == 0);
+	state->filters->count--;
+	hlist_del(&filter->node);
+	free_seccomp_filter(filter);
+}
+
+/* This should only be called on unattached seccomp_state objects. */
+static int add_filter(struct seccomp_state *state, int syscall_nr,
+		      char *filter_string)
+{
+	struct seccomp_filter *filter;
+	struct hlist_head *head;
+	char merged[SECCOMP_MAX_FILTER_LENGTH];
+	int ret;
+	u32 hash = syscall_hash(syscall_nr);
+
+	ret = -EINVAL;
+	if (state->filters->count == SECCOMP_MAX_FILTER_COUNT - 1)
+		goto out;
+
+	filter_string = strstrip(filter_string);
+
+	/* Disallow empty strings. */
+	if (filter_string[0] == 0)
+		goto out;
+
+	/* Get the right list head. */
+	head = &state->filters->heads[hash];
+
+	/* Find out if there is an existing entry to append to and
+	 * build the resultant filter string.  The original filter can be
+	 * destroyed here since the caller should be operating on a copy.
+	 */
+	filter = find_filter(state, syscall_nr);
+	if (filter) {
+		int expected = snprintf(merged, sizeof(merged), "(%s) && %s",
+					get_filter_string(filter),
+					filter_string);
+		ret = -E2BIG;
+		if (expected >= sizeof(merged) || expected < 0)
+			goto out;
+		filter_string = merged;
+		hlist_del(&filter->node);
+		free_seccomp_filter(filter);
+	}
+
+	/* In filtering mode, only existing filters may be extended. */
+	ret = -EACCES;
+	if (filter == NULL && state->mode == PR_SECCOMP_MODE_FILTER)
+		goto out;
+
+	ret = 0;
+	filter = alloc_seccomp_filter(syscall_nr, filter_string);
+	if (IS_ERR(filter)) {
+		ret = PTR_ERR(filter);
+		goto out;
+	}
+
+	state->filters->count++;
+	hlist_add_head(&filter->node, head);
+out:
+	return ret;
+}
+
+/* Wrap optional ftrace syscall support. Returns 1 on match or if ftrace is not
+ * supported.
+ */
+static int do_ftrace_syscall_match(struct event_filter *event_filter)
+{
+	int err = 1;
+#ifdef CONFIG_FTRACE_SYSCALLS
+	uint8_t syscall_state[64];
+
+	memset(syscall_state, 0, sizeof(syscall_state));
+
+	/* The generic tracing entry can remain zeroed. */
+	err = ftrace_syscall_enter_state(syscall_state, sizeof(syscall_state),
+					 NULL);
+	if (err)
+		return 0;
+
+	err = filter_match_preds(event_filter, syscall_state);
+#endif
+	return err;
+}
+
+/* 1 on match, 0 otherwise. */
+static int filter_match_current(struct seccomp_filter *filter)
+{
+	/* If no event filter exists, we assume a wildcard match. */
+	if (!filter->event_filter)
+		return 1;
+
+	return do_ftrace_syscall_match(filter->event_filter);
+}
+
+#ifndef KSTK_EIP
+#define KSTK_EIP(x) 0L
+#endif
+
+static const char *syscall_nr_to_name(int syscall)
+{
+	const char *syscall_name = "unknown";
+	struct syscall_metadata *data = syscall_nr_to_meta(syscall);
+	if (data)
+		syscall_name = data->name;
+	return syscall_name;
+}
+
+void seccomp_filter_log_failure(int syscall)
+{
+	printk(KERN_INFO
+		"%s[%d]: system call %d (%s) blocked at ip:%lx\n",
+		current->comm, task_pid_nr(current), syscall,
+		syscall_nr_to_name(syscall), KSTK_EIP(current));
+}
+
+/**
+ * seccomp_drop_all_filters - cleans up the filter list and frees the table
+ * @state: the seccomp_state to destroy the filters in.
+ */
+void seccomp_drop_all_filters(struct seccomp_state *state)
+{
+	struct hlist_node *this, *pos;
+	int head;
+	if (!state->filters)
+		return;
+	for (head = 0; head < SECCOMP_FILTER_HASH_SIZE; ++head) {
+		hlist_for_each_safe(this, pos, &state->filters->heads[head]) {
+			struct seccomp_filter *f = hlist_entry(this,
+						struct seccomp_filter, node);
+			WARN_ON(state->filters->count == 0);
+			hlist_del(this);
+			free_seccomp_filter(f);
+			state->filters->count--;
+		}
+	}
+	kfree(state->filters);
+}
+
+/**
+ * seccomp_copy_all_filters - copies all filters from src to dst.
+ *
+ * @dst: seccomp_filter_table to populate.
+ * @src: table to read from.
+ * Returns non-zero on failure.
+ * Both the source and the destination should have no simultaneous
+ * writers, and dst should be exclusive to the caller.
+ */
+int seccomp_copy_all_filters(struct seccomp_filter_table *dst,
+			     const struct seccomp_filter_table *src)
+{
+	struct seccomp_filter *filter;
+	int head, ret = 0;
+	BUG_ON(!dst || !src);
+	for (head = 0; head < SECCOMP_FILTER_HASH_SIZE; ++head) {
+		struct hlist_node *pos;
+		hlist_for_each_entry(filter, pos, &src->heads[head], node) {
+			struct seccomp_filter *new_filter =
+				copy_seccomp_filter(filter);
+			if (IS_ERR(new_filter)) {
+				ret = PTR_ERR(new_filter);
+				goto done;
+			}
+			hlist_add_head(&new_filter->node,
+				       &dst->heads[head]);
+			dst->count++;
+		}
+	}
+
+done:
+	return ret;
+}
+
+/**
+ * seccomp_show_filters - dump a seccomp_state's filters into a seq_file
+ * @state: the seccomp_state whose filters are listed; may be NULL
+ * @m: the prepared seq_file to receive the data
+ *
+ * Nothing is written when @state or its filter table is NULL.
+ * Always returns 0.
+ */
+int seccomp_show_filters(struct seccomp_state *state, struct seq_file *m)
+{
+	struct seccomp_filter *entry;
+	struct hlist_node *iter;
+	int bucket, filtering;
+
+	if (!state || !state->filters)
+		return 0;
+
+	/* Report filtering as active only in mode 2 with ON_EXEC unset. */
+	filtering = state->mode == 2 &&
+		    !(state->flags & SECCOMP_FLAG_ON_EXEC);
+	seq_printf(m, "Filtering: %d\n", filtering);
+	seq_printf(m, "FilterCount: %d\n", state->filters->count);
+
+	for (bucket = 0; bucket < SECCOMP_FILTER_HASH_SIZE; bucket++) {
+		struct hlist_head *chain = &state->filters->heads[bucket];
+
+		hlist_for_each_entry(entry, iter, chain, node) {
+			seq_printf(m, "SystemCall: %d (%s)\n",
+				   entry->syscall_nr,
+				   syscall_nr_to_name(entry->syscall_nr));
+			seq_printf(m, "Filter: %s\n",
+				   get_filter_string(entry));
+		}
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(seccomp_show_filters);
+
+/**
+ * seccomp_maybe_apply_filters - conditionally applies seccomp filters
+ * @tsk: task to update
+ * @syscall_nr: current system call in progress
+ * tsk must already be in seccomp filter mode.
+ *
+ * Returns 0 if the call should be allowed or state has been updated.
+ * This call is only reached if no filters matched the current system call.
+ * In some cases, such as when the ON_EXEC flag is set, failure should
+ * not be terminal.
+ *
+ * Returns -EACCES when ON_EXEC is not set, and -ENOMEM when the state
+ * cannot be duplicated for an execve.
+ */
+int seccomp_maybe_apply_filters(struct task_struct *tsk, int syscall_nr)
+{
+	struct seccomp_state *state, *new_state = NULL;
+	int ret = -EACCES;
+
+	/* There's no question of application if ON_EXEC is not set. */
+	state = get_seccomp_state(tsk->seccomp);
+	if ((state->flags & SECCOMP_FLAG_ON_EXEC) == 0)
+		goto out;
+
+	/* With ON_EXEC set, any call other than execve is simply allowed. */
+	ret = 0;
+	if (syscall_nr != __NR_execve)
+		goto out;
+
+	/*
+	 * execve: clear ON_EXEC on a duplicate of the state and install the
+	 * duplicate, rather than modifying the current state in place.
+	 */
+	new_state = seccomp_state_dup(state);
+	ret = -ENOMEM;
+	if (!new_state)
+		goto out;
+
+	ret = 0;
+	new_state->flags &= ~(SECCOMP_FLAG_ON_EXEC);
+	rcu_assign_pointer(tsk->seccomp, new_state);
+	synchronize_rcu();
+	put_seccomp_state(state); /* for the task: tsk->seccomp no longer points here */
+
+out:
+	put_seccomp_state(state); /* for the get */
+	return ret;
+}
+
+/**
+ * seccomp_test_filters - tests 'current' against the given syscall
+ * @state: seccomp_state of current to use.
+ * @syscall: number of the system call to test
+ *
+ * Returns 0 if a filter exists for @syscall and it matches current,
+ * -EACCES if no filter exists or it does not match, and (CONFIG_COMPAT
+ * only) -EPERM if current's compat mode differs from the one recorded
+ * in @state.
+ */
+int seccomp_test_filters(struct seccomp_state *state, int syscall)
+{
+	struct seccomp_filter *filter = NULL;
+	int ret;
+
+#ifdef CONFIG_COMPAT
+	/*
+	 * Filters are looked up by syscall number.  Refuse to evaluate them
+	 * when current's compat mode is not the one the state was flagged
+	 * with (SECCOMP_FLAG_COMPAT).
+	 */
+	ret = -EPERM;
+	if (is_compat_task() != !!(state->flags & SECCOMP_FLAG_COMPAT)) {
+		printk(KERN_INFO "%s[%d]: seccomp filter compat() mismatch.\n",
+		       current->comm, task_pid_nr(current));
+		goto out;
+	}
+#endif
+
+	ret = 0;
+	filter = find_filter(state, syscall);
+	if (filter && filter_match_current(filter))
+		goto out;
+
+	ret = -EACCES;
+out:
+	return ret;
+}
+
+/**
+ * seccomp_get_filter - copies the filter_string into "buf"
+ * @syscall_nr: system call number to look up
+ * @buf: destination buffer
+ * @bufsize: space available in @buf; capped at SECCOMP_MAX_FILTER_LENGTH.
+ *
+ * Looks up the filter for @syscall_nr on current.  If found and it fits,
+ * @buf receives the NUL-terminated string, the rest of @buf is zeroed, and
+ * the string length (excluding the NUL byte) is returned.
+ *
+ * Returns -ENOENT if current has no state or no filter for @syscall_nr,
+ * and -ENOSPC if the string does not fit in @bufsize bytes.
+ */
+long seccomp_get_filter(int syscall_nr, char *buf, unsigned long bufsize)
+{
+	struct seccomp_state *state = get_seccomp_state(current->seccomp);
+	struct seccomp_filter *filter;
+	long len = -ENOENT;
+
+	if (bufsize > SECCOMP_MAX_FILTER_LENGTH)
+		bufsize = SECCOMP_MAX_FILTER_LENGTH;
+
+	filter = state ? find_filter(state, syscall_nr) : NULL;
+	if (filter) {
+		len = strlcpy(buf, get_filter_string(filter), bufsize);
+		if (len >= bufsize) {
+			len = -ENOSPC;
+		} else {
+			/* Zero out any remaining buffer, just in case. */
+			memset(buf + len, 0, bufsize - len);
+		}
+	}
+
+	put_seccomp_state(state);
+	return len;
+}
+EXPORT_SYMBOL_GPL(seccomp_get_filter);
+
+/**
+ * seccomp_clear_filter - clears the seccomp filter for a syscall.
+ * @syscall_nr: the system call number to clear filters for.
+ *
+ * Acts as a frontend for seccomp_set_filter(), passing a NULL filter
+ * string.  All restrictions of seccomp_set_filter() apply.
+ *
+ * Returns the result of seccomp_set_filter(): 0 on success.
+ */
+long seccomp_clear_filter(int syscall_nr)
+{
+	return seccomp_set_filter(syscall_nr, NULL);
+}
+EXPORT_SYMBOL_GPL(seccomp_clear_filter);
+
+/**
+ * seccomp_set_filter - Adds/extends a seccomp filter for a syscall.
+ * @syscall_nr: system call number to apply the filter to.
+ * @filter: ftrace filter string to apply, or NULL to drop the filter.
+ *
+ * Context: User context only. This function may sleep on allocation and
+ *          operates on current. current must be attempting a system call
+ *          when this is called.
+ *
+ * New filters may be added for system calls when the current task is
+ * not in a secure computing mode (seccomp).  Otherwise, filters may only
+ * be added to already filtered system call entries.  Any additions will
+ * be &&'d with the existing filter string to ensure no expansion of privileges
+ * will be possible.
+ *
+ * The change is made on a duplicate of current's state (or on a new state
+ * if current has none); the duplicate replaces current->seccomp only when
+ * the change succeeds.
+ *
+ * Returns 0 on success or an errno on failure: -ENOMEM if the working state
+ * cannot be allocated, -EACCES (CONFIG_COMPAT) if current's compat mode
+ * differs from the state's while filters already exist, or the error
+ * returned by add_filter().
+ */
+long seccomp_set_filter(int syscall_nr, char *filter)
+{
+	struct seccomp_state *state, *orig_state;
+	long ret = -EINVAL;
+
+	orig_state = get_seccomp_state(current->seccomp);
+
+	/* Prior to mutating the state, create a duplicate to avoid modifying
+	 * the behavior of other instances sharing the state and ensure
+	 * consistency.
+	 */
+	state = (orig_state ? seccomp_state_dup(orig_state) :
+			      seccomp_state_new());
+	if (!state) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/* A NULL filter doubles as a drop value, but the exposed prctl
+	 * interface requires a trip through seccomp_clear_filter().
+	 * Filter dropping is allowed across the is_compat_task() barrier.
+	 */
+	ret = 0;
+	if (filter == NULL) {
+		drop_filter(state, syscall_nr);
+		goto assign;
+	}
+
+	/* Avoid ambiguous filters which may have been inherited from a parent
+	 * with different syscall numbers for the logically same calls.
+	 */
+#ifdef CONFIG_COMPAT
+	ret = -EACCES;
+	if (is_compat_task() != !!(state->flags & SECCOMP_FLAG_COMPAT)) {
+		if (state->filters->count)
+			goto free_state;
+		/* It's safe to add if there are no existing ambiguous rules.*/
+		if (is_compat_task())
+			state->flags |= SECCOMP_FLAG_COMPAT;
+		else
+			state->flags &= ~(SECCOMP_FLAG_COMPAT);
+	}
+#endif
+
+	ret = add_filter(state, syscall_nr, filter);
+	if (ret)
+		goto free_state;
+
+assign:
+	/* Publish the modified copy, then release the task's old reference. */
+	rcu_assign_pointer(current->seccomp, state);
+	synchronize_rcu();
+	put_seccomp_state(orig_state);  /* for the task */
+out:
+	put_seccomp_state(orig_state);  /* for the get */
+	return ret;
+
+free_state:
+	/* Failure: current->seccomp is untouched; discard the working copy. */
+	put_seccomp_state(orig_state);  /* for the get */
+	put_seccomp_state(state);  /* drop the dup/new */
+	return ret;
+}
+EXPORT_SYMBOL_GPL(seccomp_set_filter);
diff --git a/kernel/sys.c b/kernel/sys.c
index af468ed..d29003a 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1698,12 +1698,23 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 		case PR_SET_ENDIAN:
 			error = SET_ENDIAN(me, arg2);
 			break;
-
 		case PR_GET_SECCOMP:
 			error = prctl_get_seccomp();
 			break;
 		case PR_SET_SECCOMP:
-			error = prctl_set_seccomp(arg2);
+			error = prctl_set_seccomp(arg2, arg3);
+			break;
+		case PR_SET_SECCOMP_FILTER:
+			error = prctl_set_seccomp_filter(arg2,
+							 (char __user *) arg3);
+			break;
+		case PR_CLEAR_SECCOMP_FILTER:
+			error = prctl_clear_seccomp_filter(arg2);
+			break;
+		case PR_GET_SECCOMP_FILTER:
+			error = prctl_get_seccomp_filter(arg2,
+							 (char __user *) arg3,
+							 arg4);
 			break;
 		case PR_GET_TSC:
 			error = GET_TSC_CTL(arg2);
-- 
1.7.0.4

^ permalink raw reply related	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-12  3:02                     ` Will Drewry
  0 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-05-12  3:02 UTC (permalink / raw)
  To: linux-kernel
  Cc: linux-mips, linux-sh, Peter Zijlstra, Frederic Weisbecker,
	Heiko Carstens, David Howells, Paul Mackerras, Eric Paris,
	H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390, Russell King,
	x86, jmorris, Ingo Molnar, linux-arm-kernel, Ingo Molnar,
	Serge E. Hallyn, Peter Zijlstra, microblaze-uclinux,
	Steven Rostedt, Martin Schwidefsky, Thomas Gleixner, kees.cook,
	Roland McGrath, Michal Marek, Michal Simek, Will Drewry,
	linuxppc-dev, Oleg Nesterov, Ralf Baechle, Paul Mundt, Tejun Heo,
	linux390, Andrew Morton, agl, David S. Miller

This change adds a new seccomp mode based on the work by
agl@chromium.org in [1]. This new mode, "filter mode", provides a hash
table of seccomp_filter objects.  When in the new mode (2), all system
calls are checked against the filters - first by system call number,
then by a filter string.  If an entry exists for a given system call and
all filter predicates evaluate to true, then the task may proceed.
Otherwise, the task is killed (as per seccomp_mode == 1).

Filter string parsing and evaluation is handled by the ftrace filter
engine.  Related patches tweak the perf filter trace and free code to
allow the calls to be shared. Filters inherit their understanding of types and
arguments for each system call from the CONFIG_FTRACE_SYSCALLS subsystem
which already predefines this information in syscall_metadata associated
enter_event (and exit_event) structures. If CONFIG_FTRACE and
CONFIG_FTRACE_SYSCALLS are not compiled in, only "1" filter strings will
be allowed.

The net result is that a process may have its system calls filtered using
the ftrace filter engine's inherent understanding of system calls. A
logical ruleset for a process that only needs stdin/stdout may be:
  sys_read: fd == 0
  sys_write: fd == 1 || fd == 2
  sys_exit: 1

The set of filters is specified through the PR_SET_SECCOMP path in prctl().
For example:
  prctl(PR_SET_SECCOMP_FILTER, __NR_read, "fd == 0");
  prctl(PR_SET_SECCOMP_FILTER, __NR_write, "fd == 1 || fd == 2");
  prctl(PR_SET_SECCOMP_FILTER, __NR_exit, "1");
  prctl(PR_SET_SECCOMP, 2, 0);

v2: - changed to use the existing syscall number ABI.
    - prctl changes to minimize parsing in the kernel:
      prctl(PR_SET_SECCOMP, {0 | 1 | 2 }, { 0 | ON_EXEC });
      prctl(PR_SET_SECCOMP_FILTER, __NR_read, "fd == 5");
      prctl(PR_CLEAR_SECCOMP_FILTER, __NR_read);
      prctl(PR_GET_SECCOMP_FILTER, __NR_read, buf, bufsize);
    - defined PR_SECCOMP_MODE_STRICT and ..._FILTER
    - added flags
    - provide a default fail syscall_nr_to_meta in ftrace
    - provides fallback for unhooked system calls
    - use -ENOSYS and ERR_PTR(-ENOSYS) for stubbed functionality
    - added kernel/seccomp.h to share seccomp.c/seccomp_filter.c
    - moved to a hlist and 4 bit hash of linked lists
    - added support to operate without CONFIG_FTRACE_SYSCALLS
    - moved Kconfig support next to SECCOMP
      (should this be done in per-platform patches?)
    - made Kconfig entries dependent on EXPERIMENTAL
    - added macros to avoid ifdefs from kernel/fork.c
    - added compat task/filter matching
    - drop seccomp.h inclusion in sched.h and drop seccomp_t
    - added Filtering to "show" output
    - added on_exec state dup'ing when enabling after a fast-path accept.

Signed-off-by: Will Drewry <wad@chromium.org>
---
 arch/arm/Kconfig        |   10 +
 arch/microblaze/Kconfig |   10 +
 arch/mips/Kconfig       |   10 +
 arch/powerpc/Kconfig    |   10 +
 arch/s390/Kconfig       |   10 +
 arch/sh/Kconfig         |   10 +
 arch/sparc/Kconfig      |   10 +
 arch/x86/Kconfig        |   10 +
 include/linux/prctl.h   |    9 +
 include/linux/sched.h   |    5 +-
 include/linux/seccomp.h |  116 +++++++++-
 include/trace/syscall.h |    7 +
 kernel/Makefile         |    3 +
 kernel/fork.c           |    3 +
 kernel/seccomp.c        |  228 +++++++++++++++++-
 kernel/seccomp.h        |   74 ++++++
 kernel/seccomp_filter.c |  581 +++++++++++++++++++++++++++++++++++++++++++++++
 kernel/sys.c            |   15 +-
 18 files changed, 1100 insertions(+), 21 deletions(-)
 create mode 100644 kernel/seccomp.h
 create mode 100644 kernel/seccomp_filter.c

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 377a7a5..22e1668 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1664,6 +1664,16 @@ config SECCOMP
 	  and the task is only allowed to execute a few safe syscalls
 	  defined by each seccomp mode.
 
+config SECCOMP_FILTER
+	bool "Enable seccomp-based system call filtering"
+	depends on SECCOMP && EXPERIMENTAL
+	help
+	  Per-process, inherited system call filtering using shared code
+	  across seccomp and ftrace_syscalls.  If CONFIG_FTRACE_SYSCALLS
+	  is not available, enhanced filters will not be available.
+
+	  See Documentation/prctl/seccomp_filter.txt for more detail.
+
 config CC_STACKPROTECTOR
 	bool "Enable -fstack-protector buffer overflow detection (EXPERIMENTAL)"
 	depends on EXPERIMENTAL
diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig
index eccdefe..7641ee9 100644
--- a/arch/microblaze/Kconfig
+++ b/arch/microblaze/Kconfig
@@ -129,6 +129,16 @@ config SECCOMP
 
 	  If unsure, say Y. Only embedded should say N here.
 
+config SECCOMP_FILTER
+	bool "Enable seccomp-based system call filtering"
+	depends on SECCOMP && EXPERIMENTAL
+	help
+	  Per-process, inherited system call filtering using shared code
+	  across seccomp and ftrace_syscalls.  If CONFIG_FTRACE_SYSCALLS
+	  is not available, enhanced filters will not be available.
+
+	  See Documentation/prctl/seccomp_filter.txt for more detail.
+
 endmenu
 
 menu "Advanced setup"
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 8e256cc..fe4cbda 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -2245,6 +2245,16 @@ config SECCOMP
 
 	  If unsure, say Y. Only embedded should say N here.
 
+config SECCOMP_FILTER
+	bool "Enable seccomp-based system call filtering"
+	depends on SECCOMP && EXPERIMENTAL
+	help
+	  Per-process, inherited system call filtering using shared code
+	  across seccomp and ftrace_syscalls.  If CONFIG_FTRACE_SYSCALLS
+	  is not available, enhanced filters will not be available.
+
+	  See Documentation/prctl/seccomp_filter.txt for more detail.
+
 config USE_OF
 	bool "Flattened Device Tree support"
 	select OF
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 8f4d50b..83499e4 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -605,6 +605,16 @@ config SECCOMP
 
 	  If unsure, say Y. Only embedded should say N here.
 
+config SECCOMP_FILTER
+	bool "Enable seccomp-based system call filtering"
+	depends on SECCOMP && EXPERIMENTAL
+	help
+	  Per-process, inherited system call filtering using shared code
+	  across seccomp and ftrace_syscalls.  If CONFIG_FTRACE_SYSCALLS
+	  is not available, enhanced filters will not be available.
+
+	  See Documentation/prctl/seccomp_filter.txt for more detail.
+
 endmenu
 
 config ISA_DMA_API
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 2508a6f..2777515 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -614,6 +614,16 @@ config SECCOMP
 
 	  If unsure, say Y.
 
+config SECCOMP_FILTER
+	bool "Enable seccomp-based system call filtering"
+	depends on SECCOMP && EXPERIMENTAL
+	help
+	  Per-process, inherited system call filtering using shared code
+	  across seccomp and ftrace_syscalls.  If CONFIG_FTRACE_SYSCALLS
+	  is not available, enhanced filters will not be available.
+
+	  See Documentation/prctl/seccomp_filter.txt for more detail.
+
 endmenu
 
 menu "Power Management"
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 4b89da2..00c1521 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -676,6 +676,16 @@ config SECCOMP
 
 	  If unsure, say N.
 
+config SECCOMP_FILTER
+	bool "Enable seccomp-based system call filtering"
+	depends on SECCOMP && EXPERIMENTAL
+	help
+	  Per-process, inherited system call filtering using shared code
+	  across seccomp and ftrace_syscalls.  If CONFIG_FTRACE_SYSCALLS
+	  is not available, enhanced filters will not be available.
+
+	  See Documentation/prctl/seccomp_filter.txt for more detail.
+
 config SMP
 	bool "Symmetric multi-processing support"
 	depends on SYS_SUPPORTS_SMP
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index e560d10..5b42255 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -270,6 +270,16 @@ config SECCOMP
 
 	  If unsure, say Y. Only embedded should say N here.
 
+config SECCOMP_FILTER
+	bool "Enable seccomp-based system call filtering"
+	depends on SECCOMP && EXPERIMENTAL
+	help
+	  Per-process, inherited system call filtering using shared code
+	  across seccomp and ftrace_syscalls.  If CONFIG_FTRACE_SYSCALLS
+	  is not available, enhanced filters will not be available.
+
+	  See Documentation/prctl/seccomp_filter.txt for more detail.
+
 config HOTPLUG_CPU
 	bool "Support for hot-pluggable CPUs"
 	depends on SPARC64 && SMP
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index cc6c53a..d6d44d9 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1485,6 +1485,16 @@ config SECCOMP
 
 	  If unsure, say Y. Only embedded should say N here.
 
+config SECCOMP_FILTER
+	bool "Enable seccomp-based system call filtering"
+	depends on SECCOMP && EXPERIMENTAL
+	help
+	  Per-process, inherited system call filtering using shared code
+	  across seccomp and ftrace_syscalls.  If CONFIG_FTRACE_SYSCALLS
+	  is not available, enhanced filters will not be available.
+
+	  See Documentation/prctl/seccomp_filter.txt for more detail.
+
 config CC_STACKPROTECTOR
 	bool "Enable -fstack-protector buffer overflow detection (EXPERIMENTAL)"
 	---help---
diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index a3baeb2..379b391 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -63,6 +63,15 @@
 /* Get/set process seccomp mode */
 #define PR_GET_SECCOMP	21
 #define PR_SET_SECCOMP	22
+# define PR_SECCOMP_MODE_NONE	0
+# define PR_SECCOMP_MODE_STRICT	1
+# define PR_SECCOMP_MODE_FILTER	2
+# define PR_SECCOMP_FLAG_FILTER_ON_EXEC	(1 << 1)
+
+/* Get/set process seccomp filters */
+#define PR_GET_SECCOMP_FILTER	35
+#define PR_SET_SECCOMP_FILTER	36
+#define PR_CLEAR_SECCOMP_FILTER	37
 
 /* Get/set the capability bounding set (as per security/commoncap.c) */
 #define PR_CAPBSET_READ 23
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 18d63ce..27eacf9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -77,7 +77,6 @@ struct sched_param {
 #include <linux/percpu.h>
 #include <linux/topology.h>
 #include <linux/proportions.h>
-#include <linux/seccomp.h>
 #include <linux/rcupdate.h>
 #include <linux/rculist.h>
 #include <linux/rtmutex.h>
@@ -1190,6 +1189,8 @@ enum perf_event_task_context {
 	perf_nr_task_contexts,
 };
 
+struct seccomp_state;
+
 struct task_struct {
 	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
 	void *stack;
@@ -1374,7 +1375,7 @@ struct task_struct {
 	uid_t loginuid;
 	unsigned int sessionid;
 #endif
-	seccomp_t seccomp;
+	struct seccomp_state *seccomp;
 
 /* Thread group tracking */
    	u32 parent_exec_id;
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index 167c333..289c836 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -2,12 +2,34 @@
 #define _LINUX_SECCOMP_H
 
 
+/* Forward declare for proc interface */
+struct seq_file;
+
 #ifdef CONFIG_SECCOMP
 
+#include <linux/errno.h>
+#include <linux/list.h>
 #include <linux/thread_info.h>
+#include <linux/types.h>
 #include <asm/seccomp.h>
 
-typedef struct { int mode; } seccomp_t;
+/**
+ * struct seccomp_state - the state of a seccomp'ed process
+ *
+ * @mode:
+ *     if this is 1, the process is under standard seccomp rules
+ *             is 2, the process is only allowed to make system calls where
+ *                   associated filters evaluate successfully.
+ * @usage: number of references to the current instance; taken with
+ *         get_seccomp_state() and released with put_seccomp_state().
+ * @flags: a bitmask of behavior altering flags.
+ * @filters: Hash table of filters if using CONFIG_SECCOMP_FILTER.
+ */
+struct seccomp_state {
+	uint16_t mode;
+	atomic_t usage;
+	long flags;
+	struct seccomp_filter_table *filters;
+};
 
 extern void __secure_computing(int);
 static inline void secure_computing(int this_syscall)
@@ -16,27 +38,113 @@ static inline void secure_computing(int this_syscall)
 		__secure_computing(this_syscall);
 }
 
+extern struct seccomp_state *seccomp_state_new(void);
+extern struct seccomp_state *seccomp_state_dup(const struct seccomp_state *);
+extern struct seccomp_state *get_seccomp_state(struct seccomp_state *);
+extern void put_seccomp_state(struct seccomp_state *);
+
+extern long prctl_set_seccomp(unsigned long, unsigned long);
 extern long prctl_get_seccomp(void);
-extern long prctl_set_seccomp(unsigned long);
+
+extern long prctl_set_seccomp_filter(unsigned long, char __user *);
+extern long prctl_get_seccomp_filter(unsigned long, char __user *,
+				     unsigned long);
+extern long prctl_clear_seccomp_filter(unsigned long);
+
+/*
+ * Task-level helpers so kernel/fork.c needs no ifdefs.  Each expands to a
+ * single expression with parenthesized arguments and no trailing semicolon,
+ * so the caller's own ';' terminates it like an ordinary statement.
+ */
+#define inherit_tsk_seccomp_state(_child, _orig) \
+	((_child)->seccomp = get_seccomp_state((_orig)->seccomp))
+#define put_tsk_seccomp_state(_tsk) put_seccomp_state((_tsk)->seccomp)
 
 #else /* CONFIG_SECCOMP */
 
 #include <linux/errno.h>
 
-typedef struct { } seccomp_t;
+struct seccomp_state { };
 
 #define secure_computing(x) do { } while (0)
+#define inherit_tsk_seccomp_state(_child, _orig) do { } while (0)
+#define put_tsk_seccomp_state(_tsk) do { } while (0)
 
 static inline long prctl_get_seccomp(void)
 {
 	return -EINVAL;
 }
 
-static inline long prctl_set_seccomp(unsigned long arg2)
+static inline long prctl_set_seccomp(unsigned long a2, unsigned long a3)
 {
 	return -EINVAL;
 }
 
+/*
+ * Stubs for builds without CONFIG_SECCOMP: the filter prctls report
+ * -ENOSYS, the state constructors and get_seccomp_state() return NULL, and
+ * put_seccomp_state() does nothing.
+ */
+static inline long prctl_set_seccomp_filter(unsigned long a2, char __user *a3)
+{
+	return -ENOSYS;
+}
+
+static inline long prctl_clear_seccomp_filter(unsigned long a2)
+{
+	return -ENOSYS;
+}
+
+static inline long prctl_get_seccomp_filter(unsigned long a2, char __user *a3,
+					    unsigned long a4)
+{
+	return -ENOSYS;
+}
+
+static inline struct seccomp_state *seccomp_state_new(void)
+{
+	return NULL;
+}
+
+static inline struct seccomp_state *seccomp_state_dup(
+					const struct seccomp_state *state)
+{
+	return NULL;
+}
+
+static inline struct seccomp_state *get_seccomp_state(
+					struct seccomp_state *state)
+{
+	return NULL;
+}
+
+static inline void put_seccomp_state(struct seccomp_state *state)
+{
+}
+
 #endif /* CONFIG_SECCOMP */
 
+#ifdef CONFIG_SECCOMP_FILTER
+
+extern int seccomp_show_filters(struct seccomp_state *, struct seq_file *);
+extern long seccomp_set_filter(int, char *);
+extern long seccomp_clear_filter(int);
+extern long seccomp_get_filter(int, char *, unsigned long);
+
+#else  /* CONFIG_SECCOMP_FILTER */
+
+/*
+ * Stubs for builds without CONFIG_SECCOMP_FILTER: every filter operation
+ * reports -ENOSYS.
+ */
+static inline int seccomp_show_filters(struct seccomp_state *state,
+				       struct seq_file *m)
+{
+	return -ENOSYS;
+}
+
+static inline long seccomp_set_filter(int syscall_nr, char *filter)
+{
+	return -ENOSYS;
+}
+
+static inline long seccomp_clear_filter(int syscall_nr)
+{
+	return -ENOSYS;
+}
+
+static inline long seccomp_get_filter(int syscall_nr,
+				      char *buf, unsigned long available)
+{
+	return -ENOSYS;
+}
+
+#endif  /* CONFIG_SECCOMP_FILTER */
+
 #endif /* _LINUX_SECCOMP_H */
diff --git a/include/trace/syscall.h b/include/trace/syscall.h
index 242ae04..e061ad0 100644
--- a/include/trace/syscall.h
+++ b/include/trace/syscall.h
@@ -35,6 +35,8 @@ struct syscall_metadata {
 extern unsigned long arch_syscall_addr(int nr);
 extern int init_syscall_trace(struct ftrace_event_call *call);
 
+extern struct syscall_metadata *syscall_nr_to_meta(int);
+
 extern int reg_event_syscall_enter(struct ftrace_event_call *call);
 extern void unreg_event_syscall_enter(struct ftrace_event_call *call);
 extern int reg_event_syscall_exit(struct ftrace_event_call *call);
@@ -49,6 +51,11 @@ enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags,
 				      struct trace_event *event);
 enum print_line_t print_syscall_exit(struct trace_iterator *iter, int flags,
 				     struct trace_event *event);
+#else
+/* Default fallback: no syscall metadata is available, so always return NULL. */
+static inline struct syscall_metadata *syscall_nr_to_meta(int nr)
+{
+	return NULL;
+}
 #endif
 
 #ifdef CONFIG_PERF_EVENTS
diff --git a/kernel/Makefile b/kernel/Makefile
index 85cbfb3..84e7dfb 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -81,6 +81,9 @@ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
 obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
+ifeq ($(CONFIG_SECCOMP_FILTER),y)
+obj-$(CONFIG_SECCOMP) += seccomp_filter.o
+endif
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
 obj-$(CONFIG_TREE_RCU) += rcutree.o
 obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
diff --git a/kernel/fork.c b/kernel/fork.c
index e7548de..46987d4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -34,6 +34,7 @@
 #include <linux/cgroup.h>
 #include <linux/security.h>
 #include <linux/hugetlb.h>
+#include <linux/seccomp.h>
 #include <linux/swap.h>
 #include <linux/syscalls.h>
 #include <linux/jiffies.h>
@@ -169,6 +170,7 @@ void free_task(struct task_struct *tsk)
 	free_thread_info(tsk->stack);
 	rt_mutex_debug_task_free(tsk);
 	ftrace_graph_exit_task(tsk);
+	put_tsk_seccomp_state(tsk);
 	free_task_struct(tsk);
 }
 EXPORT_SYMBOL(free_task);
@@ -280,6 +282,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 	if (err)
 		goto out;
 
+	inherit_tsk_seccomp_state(tsk, orig);
 	setup_thread_stack(tsk, orig);
 	clear_user_return_notifier(tsk);
 	clear_tsk_need_resched(tsk);
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 57d4b13..502ba04 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -6,12 +6,15 @@
  * This defines a simple but solid secure-computing mode.
  */
 
+#include <linux/err.h>
+#include <linux/prctl.h>
 #include <linux/seccomp.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/compat.h>
+#include <linux/unistd.h>
 
-/* #define SECCOMP_DEBUG 1 */
-#define NR_SECCOMP_MODES 1
+#include "seccomp.h"
 
 /*
  * Secure computing mode 1 allows only read/write/exit/sigreturn.
@@ -32,11 +35,13 @@ static int mode1_syscalls_32[] = {
 
 void __secure_computing(int this_syscall)
 {
-	int mode = current->seccomp.mode;
+	int mode = -1;
+	long ret = 0;
 	int * syscall;
-
+	if (current->seccomp)
+		mode = current->seccomp->mode;
 	switch (mode) {
-	case 1:
+	case PR_SECCOMP_MODE_STRICT:
 		syscall = mode1_syscalls;
 #ifdef CONFIG_COMPAT
 		if (is_compat_task())
@@ -47,6 +52,20 @@ void __secure_computing(int this_syscall)
 				return;
 		} while (*++syscall);
 		break;
+	case PR_SECCOMP_MODE_FILTER:
+		if (this_syscall >= NR_syscalls || this_syscall < 0)
+			break;
+		ret = seccomp_test_filters(current->seccomp, this_syscall);
+		if (!ret)
+			return;
+		/* Only check for an override if an access failure occurred. */
+		if (ret != -EACCES)
+			break;
+		ret = seccomp_maybe_apply_filters(current, this_syscall);
+		if (!ret)
+			return;
+		seccomp_filter_log_failure(this_syscall);
+		break;
 	default:
 		BUG();
 	}
@@ -57,30 +76,213 @@ void __secure_computing(int this_syscall)
 	do_exit(SIGKILL);
 }
 
+/**
+ * seccomp_state_new - allocate a new state object.
+ *
+ * The returned state is zero-initialized, holds a single reference, and
+ * has its filter table set from seccomp_filter_table_new().  A table
+ * "error" of -ENOSYS (filtering not supported) is tolerated and left in
+ * ->filters as-is.  Under CONFIG_COMPAT, SECCOMP_FLAG_COMPAT is set when
+ * the creating task is a compat task.
+ *
+ * Returns NULL if the state cannot be allocated or the filter table fails
+ * with any error other than -ENOSYS.
+ */
+struct seccomp_state *seccomp_state_new(void)
+{
+	/* kzalloc() already leaves mode and flags zeroed. */
+	struct seccomp_state *new = kzalloc(sizeof(*new), GFP_KERNEL);
+
+	if (!new)
+		return NULL;
+
+#ifdef CONFIG_COMPAT
+	/* Annotate if this filterset is being created by a compat task. */
+	if (is_compat_task())
+		new->flags |= SECCOMP_FLAG_COMPAT;
+#endif
+
+	atomic_set(&new->usage, 1);
+	new->filters = seccomp_filter_table_new();
+	/* Not supported errors are fine, others are a problem. */
+	if (IS_ERR(new->filters) && PTR_ERR(new->filters) != -ENOSYS) {
+		kfree(new);
+		new = NULL;
+	}
+	return new;
+}
+
+/* seccomp_state_dup - returns a new state carrying @orig's mode, flags and
+ * a copy of its filters, or NULL if allocation or filter copying fails.
+ */
+struct seccomp_state *seccomp_state_dup(const struct seccomp_state *orig)
+{
+	struct seccomp_state *copy = seccomp_state_new();
+
+	if (!copy)
+		return NULL;
+
+	copy->mode = orig->mode;
+	/* Flag copying will hide if the new process is a compat task.  However,
+	 * if the rule was compat/non-compat and the process is the opposite,
+	 * enforcement will terminate it.
+	 */
+	copy->flags = orig->flags;
+	if (seccomp_copy_all_filters(copy->filters, orig->filters)) {
+		put_seccomp_state(copy);
+		return NULL;
+	}
+	return copy;
+}
+
+/* get_seccomp_state - takes a reference on @orig, if non-NULL, and returns it */
+struct seccomp_state *get_seccomp_state(struct seccomp_state *orig)
+{
+	if (orig)
+		atomic_inc(&orig->usage);
+	return orig;
+}
+
+/* __put_seccomp_state - drops all of @orig's filters and frees @orig.
+ * Warns if the usage count is still non-zero when called.
+ */
+static void __put_seccomp_state(struct seccomp_state *orig)
+{
+	WARN_ON(atomic_read(&orig->usage));
+	seccomp_drop_all_filters(orig);
+	kfree(orig);
+}
+
+/* put_seccomp_state - drops one reference on @orig (NULL is ignored) and
+ * frees it when that was the last one.
+ */
+void put_seccomp_state(struct seccomp_state *orig)
+{
+	if (orig && atomic_dec_and_test(&orig->usage))
+		__put_seccomp_state(orig);
+}
+
 long prctl_get_seccomp(void)
 {
-	return current->seccomp.mode;
+	if (!current->seccomp)
+		return 0;
+	return current->seccomp->mode;
 }
 
-long prctl_set_seccomp(unsigned long seccomp_mode)
+long prctl_set_seccomp(unsigned long seccomp_mode, unsigned long flags)
 {
 	long ret;
+	struct seccomp_state *state, *cur_state;
 
+	cur_state = get_seccomp_state(current->seccomp);
 	/* can set it only once to be even more secure */
 	ret = -EPERM;
-	if (unlikely(current->seccomp.mode))
+	if (cur_state && unlikely(cur_state->mode))
 		goto out;
 
 	ret = -EINVAL;
-	if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) {
-		current->seccomp.mode = seccomp_mode;
-		set_thread_flag(TIF_SECCOMP);
+	if (seccomp_mode <= 0 || seccomp_mode > NR_SECCOMP_MODES)
+		goto out;
+
+	ret = -ENOMEM;
+	state = (cur_state ? seccomp_state_dup(cur_state) :
+			     seccomp_state_new());
+	if (!state)
+		goto out;
+
+	if (seccomp_mode == PR_SECCOMP_MODE_STRICT) {
 #ifdef TIF_NOTSC
 		disable_TSC();
 #endif
-		ret = 0;
 	}
 
- out:
+	rcu_assign_pointer(current->seccomp, state);
+	synchronize_rcu();
+	put_seccomp_state(cur_state);	/* For the task */
+
+	/* Convert the ABI flag to the internal flag value. */
+	if (seccomp_mode == PR_SECCOMP_MODE_FILTER &&
+	    (flags & PR_SECCOMP_FLAG_FILTER_ON_EXEC))
+		state->flags |= SECCOMP_FLAG_ON_EXEC;
+	/* Encourage flag values to stay synchronized explicitly. */
+	BUILD_BUG_ON(PR_SECCOMP_FLAG_FILTER_ON_EXEC != SECCOMP_FLAG_ON_EXEC);
+
+	/* Only set the thread flag once after the new state is in place. */
+	state->mode = seccomp_mode;
+	set_thread_flag(TIF_SECCOMP);
+	ret = 0;
+
+out:
+	put_seccomp_state(cur_state);	/* for the get */
+	return ret;
+}
+
+/**
+ * prctl_set_seccomp_filter - prctl() entry point for adding a filter
+ * @syscall_nr: system call number the filter applies to
+ * @user_filter: user pointer to the NUL-terminated filter string
+ *
+ * Copies the filter string from userspace and hands it to
+ * seccomp_set_filter().
+ *
+ * Returns -EINVAL if @syscall_nr is out of range or the string (with its
+ * NUL) does not fit in SECCOMP_MAX_FILTER_LENGTH bytes, -EFAULT if
+ * @user_filter is NULL or cannot be read, and otherwise the result of
+ * seccomp_set_filter().
+ */
+long prctl_set_seccomp_filter(unsigned long syscall_nr,
+			      char __user *user_filter)
+{
+	char filter[SECCOMP_MAX_FILTER_LENGTH];
+	long len;
+
+	if (syscall_nr >= NR_syscalls)
+		return -EINVAL;
+	if (!user_filter)
+		return -EFAULT;
+
+	/*
+	 * strncpy_from_user() does not NUL-terminate when the source fills
+	 * the whole count.  Offer it the full buffer and treat a completely
+	 * filled buffer as "too long" so an unterminated or silently
+	 * truncated filter is never passed on.
+	 */
+	len = strncpy_from_user(filter, user_filter, sizeof(filter));
+	if (len < 0)
+		return -EFAULT;
+	if ((size_t) len >= sizeof(filter))
+		return -EINVAL;
+
+	return seccomp_set_filter((int) syscall_nr, filter);
+}
+
+/* prctl_clear_seccomp_filter - prctl() entry point for clearing a filter.
+ * Returns -EINVAL for an out-of-range @syscall_nr, otherwise the result of
+ * seccomp_clear_filter().
+ */
+long prctl_clear_seccomp_filter(unsigned long syscall_nr)
+{
+	if (syscall_nr >= NR_syscalls || syscall_nr < 0)
+		return -EINVAL;
+
+	return seccomp_clear_filter((int) syscall_nr);
+}
+
+/* prctl_get_seccomp_filter - prctl() entry point for reading a filter.
+ *
+ * Fetches current's filter string for @syscall_nr into a kernel buffer of
+ * at most SECCOMP_MAX_FILTER_LENGTH bytes and copies it, NUL included, to
+ * @dst.
+ *
+ * Returns 0 on success, -EINVAL if @available is 0 or @syscall_nr is out of
+ * range, -ENOMEM if the buffer cannot be allocated, the error from
+ * seccomp_get_filter(), or -ENOSPC if the copy to @dst is incomplete.
+ */
+long prctl_get_seccomp_filter(unsigned long syscall_nr, char __user *dst,
+			      unsigned long available)
+{
+	char *kbuf;
+	int err;
+
+	if (!available)
+		return -EINVAL;
+	/* Ignore extra buffer space. */
+	if (available > SECCOMP_MAX_FILTER_LENGTH)
+		available = SECCOMP_MAX_FILTER_LENGTH;
+
+	if (syscall_nr >= NR_syscalls || syscall_nr < 0)
+		return -EINVAL;
+
+	kbuf = kmalloc(available, GFP_KERNEL);
+	if (!kbuf)
+		return -ENOMEM;
+
+	err = seccomp_get_filter((int) syscall_nr, kbuf, available);
+	if (err >= 0) {
+		/* Include the NUL byte in the copy. */
+		err = copy_to_user(dst, kbuf, err + 1) ? -ENOSPC : 0;
+	}
+
+	kfree(kbuf);
+	return err;
+}
diff --git a/kernel/seccomp.h b/kernel/seccomp.h
new file mode 100644
index 0000000..5abd219
--- /dev/null
+++ b/kernel/seccomp.h
@@ -0,0 +1,74 @@
+/*
+ * seccomp/seccomp_filter shared internal prototypes and state.
+ *
+ * Copyright (C) 2011 Chromium OS Authors.
+ */
+
+#ifndef __KERNEL_SECCOMP_H
+#define __KERNEL_SECCOMP_H
+
+#include <linux/ftrace_event.h>
+#include <linux/seccomp.h>
+
+/* #define SECCOMP_DEBUG 1 */
+#define NR_SECCOMP_MODES 2
+
+/* Inherit the max filter length from the filtering engine. */
+#define SECCOMP_MAX_FILTER_LENGTH MAX_FILTER_STR_VAL
+
+/* Presently, flags only affect SECCOMP_FILTER. */
+#define _SECCOMP_FLAG_COMPAT	0
+#define _SECCOMP_FLAG_ON_EXEC	1
+
+#define SECCOMP_FLAG_COMPAT	(1 << (_SECCOMP_FLAG_COMPAT))
+#define SECCOMP_FLAG_ON_EXEC	(1 << (_SECCOMP_FLAG_ON_EXEC))
+
+
+#ifdef CONFIG_SECCOMP_FILTER
+
+#define SECCOMP_FILTER_HASH_BITS 4
+#define SECCOMP_FILTER_HASH_SIZE (1 << SECCOMP_FILTER_HASH_BITS)
+
+struct seccomp_filter_table;
+extern struct seccomp_filter_table *seccomp_filter_table_new(void);
+extern int seccomp_copy_all_filters(struct seccomp_filter_table *,
+				    const struct seccomp_filter_table *);
+extern void seccomp_drop_all_filters(struct seccomp_state *);
+
+extern int seccomp_test_filters(struct seccomp_state *, int);
+extern int seccomp_maybe_apply_filters(struct task_struct *, int);
+extern void seccomp_filter_log_failure(int);
+
+#else /* CONFIG_SECCOMP_FILTER */
+
+/* Stub used when CONFIG_SECCOMP_FILTER is off: nothing is logged. */
+static inline void seccomp_filter_log_failure(int syscall)
+{
+}
+
+/* Stub used when CONFIG_SECCOMP_FILTER is off: always fails with -ENOSYS. */
+static inline int seccomp_maybe_apply_filters(struct task_struct *tsk,
+					      int syscall_nr)
+{
+	return -ENOSYS;
+}
+
+/* Stub used when CONFIG_SECCOMP_FILTER is off: no table can be created, so
+ * an ERR_PTR(-ENOSYS) is returned instead of a table pointer.
+ */
+static inline struct seccomp_filter_table *seccomp_filter_table_new(void)
+{
+	return ERR_PTR(-ENOSYS);
+}
+
+/* Stub used when CONFIG_SECCOMP_FILTER is off: always fails with -ENOSYS. */
+static inline int seccomp_test_filters(struct seccomp_state *state, int nr)
+{
+	return -ENOSYS;
+}
+
+/* Stub used when CONFIG_SECCOMP_FILTER is off: there is nothing to copy, so
+ * report success.  This must be "static inline" like the other stubs in
+ * this header; "extern inline" does not guarantee a usable definition in
+ * every translation unit that includes it.
+ */
+static inline int seccomp_copy_all_filters(struct seccomp_filter_table *dst,
+					const struct seccomp_filter_table *src)
+{
+	return 0;
+}
+
+/* Stub used when CONFIG_SECCOMP_FILTER is off: there are no filters to drop. */
+static inline void seccomp_drop_all_filters(struct seccomp_state *state) { }
+
+#endif /* CONFIG_SECCOMP_FILTER */
+
+#endif /* __KERNEL_SECCOMP_H */
diff --git a/kernel/seccomp_filter.c b/kernel/seccomp_filter.c
new file mode 100644
index 0000000..ff4e055
--- /dev/null
+++ b/kernel/seccomp_filter.c
@@ -0,0 +1,581 @@
+/* filter engine-based seccomp system call filtering.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) 2011 The Chromium OS Authors <chromium-os-dev@chromium.org>
+ */
+
+#include <linux/compat.h>
+#include <linux/errno.h>
+#include <linux/hash.h>
+#include <linux/prctl.h>
+#include <linux/seccomp.h>
+#include <linux/seq_file.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+#include <asm/syscall.h>
+#include <trace/syscall.h>
+
+#include "seccomp.h"
+
+#define SECCOMP_MAX_FILTER_COUNT 512
+#define SECCOMP_WILDCARD_FILTER "1"
+
+/* One per-syscall filter entry stored in a seccomp_filter_table bucket. */
+struct seccomp_filter {
+	struct hlist_node node;		/* linkage into a table bucket */
+	int syscall_nr;			/* system call this entry applies to */
+	/* Result of syscall_nr_to_meta(); NULL if the lookup found nothing. */
+	struct syscall_metadata *data;
+	/* Parsed ftrace predicate; NULL is treated as a wildcard match. */
+	struct event_filter *event_filter;
+};
+
+/* Hash table of seccomp_filter entries, bucketed by syscall_hash(). */
+struct seccomp_filter_table {
+	struct hlist_head heads[SECCOMP_FILTER_HASH_SIZE];
+	int count;	/* number of filters currently linked into heads[] */
+};
+
+/* Allocates a zeroed filter table.  Returns the table, or ERR_PTR(-ENOMEM)
+ * when the allocation fails (never NULL).
+ */
+struct seccomp_filter_table *seccomp_filter_table_new(void)
+{
+	struct seccomp_filter_table *table;
+
+	table = kzalloc(sizeof(*table), GFP_KERNEL);
+	return table ? table : ERR_PTR(-ENOMEM);
+}
+
+/* Maps a system call number to a bucket index by hashing it down to
+ * SECCOMP_FILTER_HASH_BITS bits with hash_32().
+ */
+static inline u32 syscall_hash(int syscall_nr)
+{
+	return hash_32(syscall_nr, SECCOMP_FILTER_HASH_BITS);
+}
+
+/* Returns the textual form of @f: NULL when @f is NULL, the wildcard string
+ * when @f has no event filter (or ftrace syscall support is compiled out),
+ * and otherwise the string kept by the ftrace filter engine.
+ */
+static const char *get_filter_string(struct seccomp_filter *f)
+{
+	if (!f)
+		return NULL;
+
+	/* Missing event filters qualify as wildcard matches. */
+	if (f->event_filter) {
+#ifdef CONFIG_FTRACE_SYSCALLS
+		return ftrace_get_filter_string(f->event_filter);
+#endif
+	}
+
+	return SECCOMP_WILDCARD_FILTER;
+}
+
+/* Allocates a filter entry for @syscall_nr from @filter_string.
+ *
+ * The wildcard string yields an entry without an event filter.  Any other
+ * string needs syscall metadata for @syscall_nr and is handed to
+ * ftrace_parse_filter() when CONFIG_FTRACE_SYSCALLS is enabled.
+ *
+ * Returns the new entry, or an ERR_PTR(): -ENOMEM on allocation failure,
+ * -ENOSYS when no metadata is available, or the parser's error.
+ */
+static struct seccomp_filter *alloc_seccomp_filter(int syscall_nr,
+						   const char *filter_string)
+{
+	struct seccomp_filter *f;
+	int rc;
+
+	f = kzalloc(sizeof(*f), GFP_KERNEL);
+	if (!f)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_HLIST_NODE(&f->node);
+	f->syscall_nr = syscall_nr;
+	f->data = syscall_nr_to_meta(syscall_nr);
+
+	/* The wildcard needs no predicate at all. */
+	if (strcmp(SECCOMP_WILDCARD_FILTER, filter_string) == 0)
+		return f;
+
+	/* Argument-based filtering only works on ftrace-hooked syscalls. */
+	if (!f->data) {
+		rc = -ENOSYS;
+		goto err_free;
+	}
+
+#ifdef CONFIG_FTRACE_SYSCALLS
+	rc = ftrace_parse_filter(&f->event_filter,
+				 f->data->enter_event->event.type,
+				 filter_string);
+	if (rc)
+		goto err_free;
+#endif
+	return f;
+
+err_free:
+	kfree(f);
+	return ERR_PTR(rc);
+}
+
+/* Releases @filter: the parsed event filter first (only when
+ * CONFIG_FTRACE_SYSCALLS is enabled), then the entry itself.  Callers in
+ * this file unlink the entry with hlist_del() before calling this.
+ */
+static void free_seccomp_filter(struct seccomp_filter *filter)
+{
+#ifdef CONFIG_FTRACE_SYSCALLS
+	ftrace_free_filter(filter->event_filter);
+#endif
+	kfree(filter);
+}
+
+/* Duplicates @orig by building a fresh entry for the same system call from
+ * @orig's filter string.  Returns whatever alloc_seccomp_filter() returns,
+ * i.e. the new entry or an ERR_PTR().
+ */
+static struct seccomp_filter *copy_seccomp_filter(struct seccomp_filter *orig)
+{
+	return alloc_seccomp_filter(orig->syscall_nr, get_filter_string(orig));
+}
+
+/* Looks up the filter for @syscall in @state's table.
+ * Returns the matching filter or NULL.
+ */
+static struct seccomp_filter *find_filter(struct seccomp_state *state,
+					  int syscall)
+{
+	struct hlist_node *n;
+	u32 bucket = syscall_hash(syscall);
+
+	if (bucket >= SECCOMP_FILTER_HASH_SIZE)
+		return NULL;
+
+	hlist_for_each(n, &state->filters->heads[bucket]) {
+		struct seccomp_filter *f =
+			hlist_entry(n, struct seccomp_filter, node);
+
+		if (f->syscall_nr == syscall)
+			return f;
+	}
+
+	return NULL;
+}
+
+/* Unlinks and frees the filter registered for @syscall_nr, if there is one.
+ * This should only be called on unattached seccomp_state objects.
+ */
+static void drop_filter(struct seccomp_state *state, int syscall_nr)
+{
+	struct seccomp_filter *victim;
+
+	victim = find_filter(state, syscall_nr);
+	if (victim) {
+		WARN_ON(state->filters->count == 0);
+		state->filters->count--;
+		hlist_del(&victim->node);
+		free_seccomp_filter(victim);
+	}
+}
+
+/* Installs @filter_string for @syscall_nr, or ANDs it onto the filter that
+ * is already present for that system call.
+ *
+ * This should only be called on unattached seccomp_state objects: an
+ * existing entry is replaced in place.
+ *
+ * Returns 0 on success, -EINVAL for an empty string or a full table,
+ * -E2BIG when the merged string does not fit in SECCOMP_MAX_FILTER_LENGTH,
+ * -EACCES when a brand new entry is requested while @state is already in
+ * filter mode, or the error reported by alloc_seccomp_filter().
+ */
+static int add_filter(struct seccomp_state *state, int syscall_nr,
+		      char *filter_string)
+{
+	struct seccomp_filter *old, *filter;
+	struct hlist_head *head;
+	char merged[SECCOMP_MAX_FILTER_LENGTH];
+	int ret;
+	u32 hash = syscall_hash(syscall_nr);
+
+	ret = -EINVAL;
+	if (state->filters->count >= SECCOMP_MAX_FILTER_COUNT - 1)
+		goto out;
+
+	filter_string = strstrip(filter_string);
+
+	/* Disallow empty strings. */
+	if (filter_string[0] == 0)
+		goto out;
+
+	/* Get the right list head. */
+	head = &state->filters->heads[hash];
+
+	/* Find out if there is an existing entry to tighten and, if so,
+	 * build the resultant filter string.
+	 */
+	old = find_filter(state, syscall_nr);
+	if (old) {
+		int expected;
+
+		/* The addition must be a complete expression on its own.
+		 * Otherwise a string such as "1) || (1" could close the
+		 * parentheses added below and widen the existing filter.
+		 */
+		filter = alloc_seccomp_filter(syscall_nr, filter_string);
+		if (IS_ERR(filter)) {
+			ret = PTR_ERR(filter);
+			goto out;
+		}
+		free_seccomp_filter(filter);
+
+		/* Parenthesize both sides: "&&" binds tighter than "||", so
+		 * an unwrapped "a || b" would escape the conjunction.
+		 */
+		expected = snprintf(merged, sizeof(merged), "(%s) && (%s)",
+				    get_filter_string(old), filter_string);
+		ret = -E2BIG;
+		if (expected < 0 || expected >= sizeof(merged))
+			goto out;
+		filter_string = merged;
+	}
+
+	/* When in seccomp filtering mode, only allow additions. */
+	ret = -EACCES;
+	if (!old && state->mode == PR_SECCOMP_MODE_FILTER)
+		goto out;
+
+	filter = alloc_seccomp_filter(syscall_nr, filter_string);
+	if (IS_ERR(filter)) {
+		ret = PTR_ERR(filter);
+		goto out;
+	}
+
+	/* Only retire the old entry once its replacement exists.  A
+	 * replacement leaves the number of entries unchanged, so count is
+	 * bumped for genuinely new entries only.
+	 */
+	if (old) {
+		hlist_del(&old->node);
+		free_seccomp_filter(old);
+	} else {
+		state->filters->count++;
+	}
+	hlist_add_head(&filter->node, head);
+	ret = 0;
+out:
+	return ret;
+}
+
+/* Evaluates @event_filter against the system call currently being entered.
+ * Returns 1 on match or if ftrace syscall support is not compiled in, and
+ * 0 when the syscall state cannot be captured.
+ */
+static int do_ftrace_syscall_match(struct event_filter *event_filter)
+{
+#ifdef CONFIG_FTRACE_SYSCALLS
+	/* The generic tracing entry can remain zeroed. */
+	uint8_t entry[64] = { 0 };
+
+	if (ftrace_syscall_enter_state(entry, sizeof(entry), NULL))
+		return 0;
+
+	return filter_match_preds(event_filter, entry);
+#else
+	return 1;
+#endif
+}
+
+/* Tests @filter against the current system call: 1 on match, 0 otherwise.
+ * An entry without an event filter is a wildcard and always matches.
+ */
+static int filter_match_current(struct seccomp_filter *filter)
+{
+	struct event_filter *preds = filter->event_filter;
+
+	return preds ? do_ftrace_syscall_match(preds) : 1;
+}
+
+#ifndef KSTK_EIP
+#define KSTK_EIP(x) 0L
+#endif
+
+/* Returns the name recorded in the syscall metadata for @syscall, or
+ * "unknown" when no metadata is found.
+ */
+static const char *syscall_nr_to_name(int syscall)
+{
+	struct syscall_metadata *meta = syscall_nr_to_meta(syscall);
+
+	return meta ? meta->name : "unknown";
+}
+
+/* Logs, at KERN_INFO, that @syscall was blocked for current: the task's
+ * comm and pid, the syscall number and name, and KSTK_EIP(current) (which
+ * is defined as 0 above when the architecture does not provide it).
+ */
+void seccomp_filter_log_failure(int syscall)
+{
+	printk(KERN_INFO
+		"%s[%d]: system call %d (%s) blocked at ip:%lx\n",
+		current->comm, task_pid_nr(current), syscall,
+		syscall_nr_to_name(syscall), KSTK_EIP(current));
+}
+
+/**
+ * seccomp_drop_all_filters - frees every filter and the table itself
+ * @state: the seccomp_state whose filter table is torn down.
+ *
+ * Does nothing when @state has no filter table.
+ */
+void seccomp_drop_all_filters(struct seccomp_state *state)
+{
+	struct seccomp_filter_table *table = state->filters;
+	struct hlist_node *cur, *next;
+	int bucket;
+
+	if (!table)
+		return;
+
+	for (bucket = 0; bucket < SECCOMP_FILTER_HASH_SIZE; ++bucket) {
+		hlist_for_each_safe(cur, next, &table->heads[bucket]) {
+			struct seccomp_filter *f;
+
+			f = hlist_entry(cur, struct seccomp_filter, node);
+			WARN_ON(table->count == 0);
+			hlist_del(cur);
+			free_seccomp_filter(f);
+			table->count--;
+		}
+	}
+	kfree(table);
+}
+
+/**
+ * seccomp_copy_all_filters - duplicates every filter in src into dst.
+ *
+ * @dst: seccomp_filter_table to populate.
+ * @src: table to read from.
+ *
+ * Returns 0 on success or the error from the first copy that failed; in
+ * that case the entries copied so far remain in @dst.
+ * Both the source and the destination should have no simultaneous
+ * writers, and dst should be exclusive to the caller.
+ */
+int seccomp_copy_all_filters(struct seccomp_filter_table *dst,
+			     const struct seccomp_filter_table *src)
+{
+	int bucket;
+
+	BUG_ON(!dst || !src);
+
+	for (bucket = 0; bucket < SECCOMP_FILTER_HASH_SIZE; ++bucket) {
+		struct seccomp_filter *orig;
+		struct hlist_node *n;
+
+		hlist_for_each_entry(orig, n, &src->heads[bucket], node) {
+			struct seccomp_filter *dup = copy_seccomp_filter(orig);
+
+			if (IS_ERR(dup))
+				return PTR_ERR(dup);
+
+			hlist_add_head(&dup->node, &dst->heads[bucket]);
+			dst->count++;
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * seccomp_show_filters - prints the filter state to a seq_file
+ * @state: the seccomp_state whose filters are listed
+ * @m: the prepared seq_file to receive the data
+ *
+ * Emits whether filtering is in force (filter mode without the ON_EXEC
+ * flag), the filter count, and one SystemCall/Filter pair per entry.
+ * Nothing is printed when @state or its filter table is missing.
+ *
+ * Always returns 0.
+ */
+int seccomp_show_filters(struct seccomp_state *state, struct seq_file *m)
+{
+	struct seccomp_filter *f;
+	struct hlist_node *n;
+	int bucket;
+	int active;
+
+	if (!state || !state->filters)
+		return 0;
+
+	active = state->mode == PR_SECCOMP_MODE_FILTER &&
+		 !(state->flags & SECCOMP_FLAG_ON_EXEC);
+	seq_printf(m, "Filtering: %d\n", active);
+	seq_printf(m, "FilterCount: %d\n", state->filters->count);
+
+	for (bucket = 0; bucket < SECCOMP_FILTER_HASH_SIZE; ++bucket) {
+		hlist_for_each_entry(f, n, &state->filters->heads[bucket],
+				     node) {
+			seq_printf(m, "SystemCall: %d (%s)\n",
+				   f->syscall_nr,
+				   syscall_nr_to_name(f->syscall_nr));
+			seq_printf(m, "Filter: %s\n", get_filter_string(f));
+		}
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(seccomp_show_filters);
+
+/**
+ * seccomp_maybe_apply_filters - conditionally applies seccomp filters
+ * @tsk: task to update
+ * @syscall_nr: current system call in progress
+ * tsk must already be in seccomp filter mode.
+ *
+ * Returns 0 if the call should be allowed or state has been updated,
+ * -EACCES if the ON_EXEC flag is not set, and -ENOMEM if the state could
+ * not be duplicated.
+ * This call is only reached if no filters matched the current system call.
+ * In some cases, such as when the ON_EXEC flag is set, failure should
+ * not be terminal.
+ */
+int seccomp_maybe_apply_filters(struct task_struct *tsk, int syscall_nr)
+{
+	struct seccomp_state *state, *new_state = NULL;
+	int ret = -EACCES;
+
+	/* There's no question of application if ON_EXEC is not set. */
+	state = get_seccomp_state(tsk->seccomp);
+	if ((state->flags & SECCOMP_FLAG_ON_EXEC) == 0)
+		goto out;
+
+	/* With ON_EXEC pending, everything other than execve is let through
+	 * without touching the state.
+	 */
+	ret = 0;
+	if (syscall_nr != __NR_execve)
+		goto out;
+
+	/* execve: switch to a private copy of the state with ON_EXEC
+	 * cleared instead of modifying the (possibly shared) original.
+	 */
+	new_state = seccomp_state_dup(state);
+	ret = -ENOMEM;
+	if (!new_state)
+		goto out;
+
+	ret = 0;
+	new_state->flags &= ~(SECCOMP_FLAG_ON_EXEC);
+	rcu_assign_pointer(tsk->seccomp, new_state);
+	synchronize_rcu();
+	put_seccomp_state(state); /* for the task */
+
+out:
+	put_seccomp_state(state); /* for the get */
+	return ret;
+}
+
+/**
+ * seccomp_test_filters - tests 'current' against the given syscall
+ * @state: seccomp_state of current to use.
+ * @syscall: number of the system call to test
+ *
+ * Returns 0 when a filter exists for @syscall and matches the call in
+ * progress, and -EACCES when there is no filter or it does not match.
+ * With CONFIG_COMPAT, -EPERM is returned when the compat-ness of current
+ * disagrees with the SECCOMP_FLAG_COMPAT flag recorded in @state.
+ */
+int seccomp_test_filters(struct seccomp_state *state, int syscall)
+{
+	struct seccomp_filter *filter;
+
+#ifdef CONFIG_COMPAT
+	/* A mismatch is when the two sides DIFFER, as in seccomp_set_filter().
+	 * Both are reduced to 0/1 first, in case is_compat_task() returns a
+	 * flag bit rather than a boolean.
+	 */
+	if (!!is_compat_task() != !!(state->flags & SECCOMP_FLAG_COMPAT)) {
+		printk(KERN_INFO "%s[%d]: seccomp filter compat() mismatch.\n",
+		       current->comm, task_pid_nr(current));
+		return -EPERM;
+	}
+#endif
+
+	filter = find_filter(state, syscall);
+	if (filter && filter_match_current(filter))
+		return 0;
+
+	return -EACCES;
+}
+
+/**
+ * seccomp_get_filter - copies the filter_string into "buf"
+ * @syscall_nr: system call number to look up
+ * @buf: destination buffer
+ * @bufsize: available space in the buffer; anything beyond
+ *           SECCOMP_MAX_FILTER_LENGTH is ignored.
+ *
+ * Looks up the filter for the given system call number on current.
+ * Returns the length of the NUL-terminated string placed in @buf (the NUL
+ * is not counted), -ENOENT when current has no seccomp state or no filter
+ * for @syscall_nr, and -ENOSPC when the string does not fit.
+ */
+long seccomp_get_filter(int syscall_nr, char *buf, unsigned long bufsize)
+{
+	struct seccomp_state *state;
+	struct seccomp_filter *filter;
+	long len = -ENOENT;
+
+	if (bufsize > SECCOMP_MAX_FILTER_LENGTH)
+		bufsize = SECCOMP_MAX_FILTER_LENGTH;
+
+	state = get_seccomp_state(current->seccomp);
+	filter = state ? find_filter(state, syscall_nr) : NULL;
+	if (filter) {
+		len = strlcpy(buf, get_filter_string(filter), bufsize);
+		if (len >= bufsize)
+			len = -ENOSPC;
+		else
+			/* Zero out any remaining buffer, just in case. */
+			memset(buf + len, 0, bufsize - len);
+	}
+
+	put_seccomp_state(state);
+	return len;
+}
+EXPORT_SYMBOL_GPL(seccomp_get_filter);
+
+/**
+ * seccomp_clear_filter: clears the seccomp filter for a syscall.
+ * @syscall_nr: the system call number to clear filters for.
+ *
+ * (acts as a frontend for seccomp_set_filter. All restrictions
+ *  apply)
+ *
+ * Returns 0 on success.
+ */
+long seccomp_clear_filter(int syscall_nr)
+{
+	/* A NULL filter string is seccomp_set_filter()'s drop request. */
+	return seccomp_set_filter(syscall_nr, NULL);
+}
+EXPORT_SYMBOL_GPL(seccomp_clear_filter);
+
+/**
+ * seccomp_set_filter: - Adds/extends a seccomp filter for a syscall.
+ * @syscall_nr: system call number to apply the filter to.
+ * @filter: ftrace filter string to apply, or NULL to drop the filter for
+ *          @syscall_nr.
+ *
+ * Context: User context only. This function may sleep on allocation and
+ *          operates on current. current must be attempting a system call
+ *          when this is called.
+ *
+ * New filters may be added for system calls when the current task is
+ * not in a secure computing mode (seccomp).  Otherwise, filters may only
+ * be added to already filtered system call entries.  Any additions will
+ * be &&'d with the existing filter string to ensure no expansion of privileges
+ * will be possible.
+ *
+ * Returns 0 on success or an errno on failure.
+ */
+long seccomp_set_filter(int syscall_nr, char *filter)
+{
+	struct seccomp_state *state, *orig_state;
+	long ret = -EINVAL;
+
+	orig_state = get_seccomp_state(current->seccomp);
+
+	/* Prior to mutating the state, create a duplicate to avoid modifying
+	 * the behavior of other instances sharing the state and ensure
+	 * consistency.
+	 */
+	state = (orig_state ? seccomp_state_dup(orig_state) :
+			      seccomp_state_new());
+	if (!state) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/* A NULL filter doubles as a drop value, but the exposed prctl
+	 * interface requires a trip through seccomp_clear_filter().
+	 * Filter dropping is allowed across the is_compat_task() barrier.
+	 */
+	ret = 0;
+	if (filter == NULL) {
+		drop_filter(state, syscall_nr);
+		goto assign;
+	}
+
+	/* Avoid ambiguous filters which may have been inherited from a parent
+	 * with different syscall numbers for the logically same calls.
+	 */
+#ifdef CONFIG_COMPAT
+	/* NOTE(review): this compares is_compat_task() directly against a
+	 * 0/1 value; if it can return a flag bit other than 1 the test is
+	 * always true for compat tasks - confirm per architecture.
+	 */
+	ret = -EACCES;
+	if (is_compat_task() != !!(state->flags & SECCOMP_FLAG_COMPAT)) {
+		if (state->filters->count)
+			goto free_state;
+		/* It's safe to add if there are no existing ambiguous rules.*/
+		if (is_compat_task())
+			state->flags |= SECCOMP_FLAG_COMPAT;
+		else
+			state->flags &= ~(SECCOMP_FLAG_COMPAT);
+	}
+#endif
+
+	ret = add_filter(state, syscall_nr, filter);
+	if (ret)
+		goto free_state;
+
+	/* Publish the private copy, wait for readers of the old pointer,
+	 * then drop the reference the task held on the old state.
+	 */
+assign:
+	rcu_assign_pointer(current->seccomp, state);
+	synchronize_rcu();
+	put_seccomp_state(orig_state);  /* for the task */
+out:
+	put_seccomp_state(orig_state);  /* for the get */
+	return ret;
+
+	/* Failure after the copy was made: current->seccomp is untouched. */
+free_state:
+	put_seccomp_state(orig_state);  /* for the get */
+	put_seccomp_state(state);  /* drop the dup/new */
+	return ret;
+}
+EXPORT_SYMBOL_GPL(seccomp_set_filter);
diff --git a/kernel/sys.c b/kernel/sys.c
index af468ed..d29003a 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1698,12 +1698,23 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 		case PR_SET_ENDIAN:
 			error = SET_ENDIAN(me, arg2);
 			break;
-
 		case PR_GET_SECCOMP:
 			error = prctl_get_seccomp();
 			break;
 		case PR_SET_SECCOMP:
-			error = prctl_set_seccomp(arg2);
+			error = prctl_set_seccomp(arg2, arg3);
+			break;
+		case PR_SET_SECCOMP_FILTER:
+			error = prctl_set_seccomp_filter(arg2,
+							 (char __user *) arg3);
+			break;
+		case PR_CLEAR_SECCOMP_FILTER:
+			error = prctl_clear_seccomp_filter(arg2);
+			break;
+		case PR_GET_SECCOMP_FILTER:
+			error = prctl_get_seccomp_filter(arg2,
+							 (char __user *) arg3,
+							 arg4);
 			break;
 		case PR_GET_TSC:
 			error = GET_TSC_CTL(arg2);
-- 
1.7.0.4

^ permalink raw reply related	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-12  3:02                     ` Will Drewry
  0 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-05-12  3:02 UTC (permalink / raw)
  To: linux-arm-kernel

This change adds a new seccomp mode based on the work by
agl at chromium.org in [1]. This new mode, "filter mode", provides a hash
table of seccomp_filter objects.  When in the new mode (2), all system
calls are checked against the filters - first by system call number,
then by a filter string.  If an entry exists for a given system call and
all filter predicates evaluate to true, then the task may proceed.
Otherwise, the task is killed (as per seccomp_mode == 1).

Filter string parsing and evaluation is handled by the ftrace filter
engine.  Related patches tweak the perf filter trace and free paths to allow
the calls to be shared. Filters inherit their understanding of types and
arguments for each system call from the CONFIG_FTRACE_SYSCALLS subsystem
which already predefines this information in syscall_metadata associated
enter_event (and exit_event) structures. If CONFIG_FTRACE and
CONFIG_FTRACE_SYSCALLS are not compiled in, only "1" filter strings will
be allowed.

The net result is that a process may have its system calls filtered using
the ftrace filter engine's inherent understanding of system calls. A
logical ruleset for a process that only needs stdin/stdout may be:
  sys_read: fd == 0
  sys_write: fd == 1 || fd == 2
  sys_exit: 1

The set of filters is specified through the PR_SET_SECCOMP path in prctl().
For example:
  prctl(PR_SET_SECCOMP_FILTER, __NR_read, "fd == 0");
  prctl(PR_SET_SECCOMP_FILTER, __NR_write, "fd == 1 || fd == 2");
  prctl(PR_SET_SECCOMP_FILTER, __NR_exit, "1");
  prctl(PR_SET_SECCOMP, 2, 0);

v2: - changed to use the existing syscall number ABI.
    - prctl changes to minimize parsing in the kernel:
      prctl(PR_SET_SECCOMP, {0 | 1 | 2 }, { 0 | ON_EXEC });
      prctl(PR_SET_SECCOMP_FILTER, __NR_read, "fd == 5");
      prctl(PR_CLEAR_SECCOMP_FILTER, __NR_read);
      prctl(PR_GET_SECCOMP_FILTER, __NR_read, buf, bufsize);
    - defined PR_SECCOMP_MODE_STRICT and ..._FILTER
    - added flags
    - provide a default fail syscall_nr_to_meta in ftrace
    - provides fallback for unhooked system calls
    - use -ENOSYS and ERR_PTR(-ENOSYS) for stubbed functionality
    - added kernel/seccomp.h to share seccomp.c/seccomp_filter.c
    - moved to a hlist and 4 bit hash of linked lists
    - added support to operate without CONFIG_FTRACE_SYSCALLS
    - moved Kconfig support next to SECCOMP
      (should this be done in per-platform patches?)
    - made Kconfig entries dependent on EXPERIMENTAL
    - added macros to avoid ifdefs from kernel/fork.c
    - added compat task/filter matching
    - drop seccomp.h inclusion in sched.h and drop seccomp_t
    - added Filtering to "show" output
    - added on_exec state dup'ing when enabling after a fast-path accept.

Signed-off-by: Will Drewry <wad@chromium.org>
---
 arch/arm/Kconfig        |   10 +
 arch/microblaze/Kconfig |   10 +
 arch/mips/Kconfig       |   10 +
 arch/powerpc/Kconfig    |   10 +
 arch/s390/Kconfig       |   10 +
 arch/sh/Kconfig         |   10 +
 arch/sparc/Kconfig      |   10 +
 arch/x86/Kconfig        |   10 +
 include/linux/prctl.h   |    9 +
 include/linux/sched.h   |    5 +-
 include/linux/seccomp.h |  116 +++++++++-
 include/trace/syscall.h |    7 +
 kernel/Makefile         |    3 +
 kernel/fork.c           |    3 +
 kernel/seccomp.c        |  228 +++++++++++++++++-
 kernel/seccomp.h        |   74 ++++++
 kernel/seccomp_filter.c |  581 +++++++++++++++++++++++++++++++++++++++++++++++
 kernel/sys.c            |   15 +-
 18 files changed, 1100 insertions(+), 21 deletions(-)
 create mode 100644 kernel/seccomp.h
 create mode 100644 kernel/seccomp_filter.c

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 377a7a5..22e1668 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1664,6 +1664,16 @@ config SECCOMP
 	  and the task is only allowed to execute a few safe syscalls
 	  defined by each seccomp mode.
 
+config SECCOMP_FILTER
+	bool "Enable seccomp-based system call filtering"
+	depends on SECCOMP && EXPERIMENTAL
+	help
+	  Per-process, inherited system call filtering using shared code
+	  across seccomp and ftrace_syscalls.  If CONFIG_FTRACE_SYSCALLS
+	  is not available, enhanced filters will not be available.
+
+	  See Documentation/prctl/seccomp_filter.txt for more detail.
+
 config CC_STACKPROTECTOR
 	bool "Enable -fstack-protector buffer overflow detection (EXPERIMENTAL)"
 	depends on EXPERIMENTAL
diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig
index eccdefe..7641ee9 100644
--- a/arch/microblaze/Kconfig
+++ b/arch/microblaze/Kconfig
@@ -129,6 +129,16 @@ config SECCOMP
 
 	  If unsure, say Y. Only embedded should say N here.
 
+config SECCOMP_FILTER
+	bool "Enable seccomp-based system call filtering"
+	depends on SECCOMP && EXPERIMENTAL
+	help
+	  Per-process, inherited system call filtering using shared code
+	  across seccomp and ftrace_syscalls.  If CONFIG_FTRACE_SYSCALLS
+	  is not available, enhanced filters will not be available.
+
+	  See Documentation/prctl/seccomp_filter.txt for more detail.
+
 endmenu
 
 menu "Advanced setup"
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 8e256cc..fe4cbda 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -2245,6 +2245,16 @@ config SECCOMP
 
 	  If unsure, say Y. Only embedded should say N here.
 
+config SECCOMP_FILTER
+	bool "Enable seccomp-based system call filtering"
+	depends on SECCOMP && EXPERIMENTAL
+	help
+	  Per-process, inherited system call filtering using shared code
+	  across seccomp and ftrace_syscalls.  If CONFIG_FTRACE_SYSCALLS
+	  is not available, enhanced filters will not be available.
+
+	  See Documentation/prctl/seccomp_filter.txt for more detail.
+
 config USE_OF
 	bool "Flattened Device Tree support"
 	select OF
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 8f4d50b..83499e4 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -605,6 +605,16 @@ config SECCOMP
 
 	  If unsure, say Y. Only embedded should say N here.
 
+config SECCOMP_FILTER
+	bool "Enable seccomp-based system call filtering"
+	depends on SECCOMP && EXPERIMENTAL
+	help
+	  Per-process, inherited system call filtering using shared code
+	  across seccomp and ftrace_syscalls.  If CONFIG_FTRACE_SYSCALLS
+	  is not available, enhanced filters will not be available.
+
+	  See Documentation/prctl/seccomp_filter.txt for more detail.
+
 endmenu
 
 config ISA_DMA_API
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 2508a6f..2777515 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -614,6 +614,16 @@ config SECCOMP
 
 	  If unsure, say Y.
 
+config SECCOMP_FILTER
+	bool "Enable seccomp-based system call filtering"
+	depends on SECCOMP && EXPERIMENTAL
+	help
+	  Per-process, inherited system call filtering using shared code
+	  across seccomp and ftrace_syscalls.  If CONFIG_FTRACE_SYSCALLS
+	  is not available, enhanced filters will not be available.
+
+	  See Documentation/prctl/seccomp_filter.txt for more detail.
+
 endmenu
 
 menu "Power Management"
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 4b89da2..00c1521 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -676,6 +676,16 @@ config SECCOMP
 
 	  If unsure, say N.
 
+config SECCOMP_FILTER
+	bool "Enable seccomp-based system call filtering"
+	depends on SECCOMP && EXPERIMENTAL
+	help
+	  Per-process, inherited system call filtering using shared code
+	  across seccomp and ftrace_syscalls.  If CONFIG_FTRACE_SYSCALLS
+	  is not available, enhanced filters will not be available.
+
+	  See Documentation/prctl/seccomp_filter.txt for more detail.
+
 config SMP
 	bool "Symmetric multi-processing support"
 	depends on SYS_SUPPORTS_SMP
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index e560d10..5b42255 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -270,6 +270,16 @@ config SECCOMP
 
 	  If unsure, say Y. Only embedded should say N here.
 
+config SECCOMP_FILTER
+	bool "Enable seccomp-based system call filtering"
+	depends on SECCOMP && EXPERIMENTAL
+	help
+	  Per-process, inherited system call filtering using shared code
+	  across seccomp and ftrace_syscalls.  If CONFIG_FTRACE_SYSCALLS
+	  is not available, enhanced filters will not be available.
+
+	  See Documentation/prctl/seccomp_filter.txt for more detail.
+
 config HOTPLUG_CPU
 	bool "Support for hot-pluggable CPUs"
 	depends on SPARC64 && SMP
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index cc6c53a..d6d44d9 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1485,6 +1485,16 @@ config SECCOMP
 
 	  If unsure, say Y. Only embedded should say N here.
 
+config SECCOMP_FILTER
+	bool "Enable seccomp-based system call filtering"
+	depends on SECCOMP && EXPERIMENTAL
+	help
+	  Per-process, inherited system call filtering using shared code
+	  across seccomp and ftrace_syscalls.  If CONFIG_FTRACE_SYSCALLS
+	  is not available, enhanced filters will not be available.
+
+	  See Documentation/prctl/seccomp_filter.txt for more detail.
+
 config CC_STACKPROTECTOR
 	bool "Enable -fstack-protector buffer overflow detection (EXPERIMENTAL)"
 	---help---
diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index a3baeb2..379b391 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -63,6 +63,15 @@
 /* Get/set process seccomp mode */
 #define PR_GET_SECCOMP	21
 #define PR_SET_SECCOMP	22
+# define PR_SECCOMP_MODE_NONE	0
+# define PR_SECCOMP_MODE_STRICT	1
+# define PR_SECCOMP_MODE_FILTER	2
+# define PR_SECCOMP_FLAG_FILTER_ON_EXEC	(1 << 1)
+
+/* Get/set process seccomp filters */
+#define PR_GET_SECCOMP_FILTER	35
+#define PR_SET_SECCOMP_FILTER	36
+#define PR_CLEAR_SECCOMP_FILTER	37
 
 /* Get/set the capability bounding set (as per security/commoncap.c) */
 #define PR_CAPBSET_READ 23
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 18d63ce..27eacf9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -77,7 +77,6 @@ struct sched_param {
 #include <linux/percpu.h>
 #include <linux/topology.h>
 #include <linux/proportions.h>
-#include <linux/seccomp.h>
 #include <linux/rcupdate.h>
 #include <linux/rculist.h>
 #include <linux/rtmutex.h>
@@ -1190,6 +1189,8 @@ enum perf_event_task_context {
 	perf_nr_task_contexts,
 };
 
+struct seccomp_state;
+
 struct task_struct {
 	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
 	void *stack;
@@ -1374,7 +1375,7 @@ struct task_struct {
 	uid_t loginuid;
 	unsigned int sessionid;
 #endif
-	seccomp_t seccomp;
+	struct seccomp_state *seccomp;
 
 /* Thread group tracking */
    	u32 parent_exec_id;
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index 167c333..289c836 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -2,12 +2,34 @@
 #define _LINUX_SECCOMP_H
 
 
+/* Forward declare for proc interface */
+struct seq_file;
+
 #ifdef CONFIG_SECCOMP
 
+#include <linux/errno.h>
+#include <linux/list.h>
 #include <linux/thread_info.h>
+#include <linux/types.h>
 #include <asm/seccomp.h>
 
-typedef struct { int mode; } seccomp_t;
+/**
+ * struct seccomp_state - the state of a seccomp'ed process
+ *
+ * @mode:
+ *     if this is 1, the process is under standard seccomp rules
+ *             is 2, the process is only allowed to make system calls where
+ *                   associated filters evaluate successfully.
+ * @usage: number of references to the current instance.
+ * @flags: a bitmask of behavior altering flags.
+ * @filters: Hash table of filters if using CONFIG_SECCOMP_FILTER.
+ */
+struct seccomp_state {
+	uint16_t mode;
+	atomic_t usage;
+	long flags;
+	struct seccomp_filter_table *filters;
+};
 
 extern void __secure_computing(int);
 static inline void secure_computing(int this_syscall)
@@ -16,27 +38,113 @@ static inline void secure_computing(int this_syscall)
 		__secure_computing(this_syscall);
 }
 
+extern struct seccomp_state *seccomp_state_new(void);
+extern struct seccomp_state *seccomp_state_dup(const struct seccomp_state *);
+extern struct seccomp_state *get_seccomp_state(struct seccomp_state *);
+extern void put_seccomp_state(struct seccomp_state *);
+
+extern long prctl_set_seccomp(unsigned long, unsigned long);
 extern long prctl_get_seccomp(void);
-extern long prctl_set_seccomp(unsigned long);
+
+extern long prctl_set_seccomp_filter(unsigned long, char __user *);
+extern long prctl_get_seccomp_filter(unsigned long, char __user *,
+				     unsigned long);
+extern long prctl_clear_seccomp_filter(unsigned long);
+
+#define inherit_tsk_seccomp_state(_child, _orig) \
+	_child->seccomp = get_seccomp_state(_orig->seccomp);
+#define put_tsk_seccomp_state(_tsk) put_seccomp_state(_tsk->seccomp)
 
 #else /* CONFIG_SECCOMP */
 
 #include <linux/errno.h>
 
-typedef struct { } seccomp_t;
+struct seccomp_state { };
 
 #define secure_computing(x) do { } while (0)
+#define inherit_tsk_seccomp_state(_child, _orig) do { } while (0)
+#define put_tsk_seccomp_state(_tsk) do { } while (0)
 
 static inline long prctl_get_seccomp(void)
 {
 	return -EINVAL;
 }
 
-static inline long prctl_set_seccomp(unsigned long arg2)
+static inline long prctl_set_seccomp(unsigned long a2, unsigned long a3)
 {
 	return -EINVAL;
 }
 
+static inline long prctl_set_seccomp_filter(unsigned long a2, char __user *a3)
+{
+	return -ENOSYS;
+}
+
+static inline long prctl_clear_seccomp_filter(unsigned long a2)
+{
+	return -ENOSYS;
+}
+
+static inline long prctl_get_seccomp_filter(unsigned long a2, char __user *a3,
+					    unsigned long a4)
+{
+	return -ENOSYS;
+}
+
+static inline struct seccomp_state *seccomp_state_new(void)
+{
+	return NULL;
+}
+
+static inline struct seccomp_state *seccomp_state_dup(
+					const struct seccomp_state *state)
+{
+	return NULL;
+}
+
+static inline struct seccomp_state *get_seccomp_state(
+					struct seccomp_state *state)
+{
+	return NULL;
+}
+
+static inline void put_seccomp_state(struct seccomp_state *state)
+{
+}
+
 #endif /* CONFIG_SECCOMP */
 
+#ifdef CONFIG_SECCOMP_FILTER
+
+extern int seccomp_show_filters(struct seccomp_state *, struct seq_file *);
+extern long seccomp_set_filter(int, char *);
+extern long seccomp_clear_filter(int);
+extern long seccomp_get_filter(int, char *, unsigned long);
+
+#else  /* CONFIG_SECCOMP_FILTER */
+
+static inline int seccomp_show_filters(struct seccomp_state *state,
+				       struct seq_file *m)
+{
+	return -ENOSYS;
+}
+
+static inline long seccomp_set_filter(int syscall_nr, char *filter)
+{
+	return -ENOSYS;
+}
+
+static inline long seccomp_clear_filter(int syscall_nr)
+{
+	return -ENOSYS;
+}
+
+static inline long seccomp_get_filter(int syscall_nr,
+				      char *buf, unsigned long available)
+{
+	return -ENOSYS;
+}
+
+#endif  /* CONFIG_SECCOMP_FILTER */
+
 #endif /* _LINUX_SECCOMP_H */
diff --git a/include/trace/syscall.h b/include/trace/syscall.h
index 242ae04..e061ad0 100644
--- a/include/trace/syscall.h
+++ b/include/trace/syscall.h
@@ -35,6 +35,8 @@ struct syscall_metadata {
 extern unsigned long arch_syscall_addr(int nr);
 extern int init_syscall_trace(struct ftrace_event_call *call);
 
+extern struct syscall_metadata *syscall_nr_to_meta(int);
+
 extern int reg_event_syscall_enter(struct ftrace_event_call *call);
 extern void unreg_event_syscall_enter(struct ftrace_event_call *call);
 extern int reg_event_syscall_exit(struct ftrace_event_call *call);
@@ -49,6 +51,11 @@ enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags,
 				      struct trace_event *event);
 enum print_line_t print_syscall_exit(struct trace_iterator *iter, int flags,
 				     struct trace_event *event);
+#else
+static inline struct syscall_metadata *syscall_nr_to_meta(int nr)
+{
+	return NULL;
+}
 #endif
 
 #ifdef CONFIG_PERF_EVENTS
diff --git a/kernel/Makefile b/kernel/Makefile
index 85cbfb3..84e7dfb 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -81,6 +81,9 @@ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
 obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
+ifeq ($(CONFIG_SECCOMP_FILTER),y)
+obj-$(CONFIG_SECCOMP) += seccomp_filter.o
+endif
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
 obj-$(CONFIG_TREE_RCU) += rcutree.o
 obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
diff --git a/kernel/fork.c b/kernel/fork.c
index e7548de..46987d4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -34,6 +34,7 @@
 #include <linux/cgroup.h>
 #include <linux/security.h>
 #include <linux/hugetlb.h>
+#include <linux/seccomp.h>
 #include <linux/swap.h>
 #include <linux/syscalls.h>
 #include <linux/jiffies.h>
@@ -169,6 +170,7 @@ void free_task(struct task_struct *tsk)
 	free_thread_info(tsk->stack);
 	rt_mutex_debug_task_free(tsk);
 	ftrace_graph_exit_task(tsk);
+	put_tsk_seccomp_state(tsk);
 	free_task_struct(tsk);
 }
 EXPORT_SYMBOL(free_task);
@@ -280,6 +282,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 	if (err)
 		goto out;
 
+	inherit_tsk_seccomp_state(tsk, orig);
 	setup_thread_stack(tsk, orig);
 	clear_user_return_notifier(tsk);
 	clear_tsk_need_resched(tsk);
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 57d4b13..502ba04 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -6,12 +6,15 @@
  * This defines a simple but solid secure-computing mode.
  */
 
+#include <linux/err.h>
+#include <linux/prctl.h>
 #include <linux/seccomp.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/compat.h>
+#include <linux/unistd.h>
 
-/* #define SECCOMP_DEBUG 1 */
-#define NR_SECCOMP_MODES 1
+#include "seccomp.h"
 
 /*
  * Secure computing mode 1 allows only read/write/exit/sigreturn.
@@ -32,11 +35,13 @@ static int mode1_syscalls_32[] = {
 
 void __secure_computing(int this_syscall)
 {
-	int mode = current->seccomp.mode;
+	int mode = -1;
+	long ret = 0;
 	int * syscall;
-
+	if (current->seccomp)
+		mode = current->seccomp->mode;
 	switch (mode) {
-	case 1:
+	case PR_SECCOMP_MODE_STRICT:
 		syscall = mode1_syscalls;
 #ifdef CONFIG_COMPAT
 		if (is_compat_task())
@@ -47,6 +52,20 @@ void __secure_computing(int this_syscall)
 				return;
 		} while (*++syscall);
 		break;
+	case PR_SECCOMP_MODE_FILTER:
+		if (this_syscall >= NR_syscalls || this_syscall < 0)
+			break;
+		ret = seccomp_test_filters(current->seccomp, this_syscall);
+		if (!ret)
+			return;
+		/* Only check for an override if an access failure occurred. */
+		if (ret != -EACCES)
+			break;
+		ret = seccomp_maybe_apply_filters(current, this_syscall);
+		if (!ret)
+			return;
+		seccomp_filter_log_failure(this_syscall);
+		break;
 	default:
 		BUG();
 	}
@@ -57,30 +76,213 @@ void __secure_computing(int this_syscall)
 	do_exit(SIGKILL);
 }
 
+/* seccomp_state_new - allocate a new state object. */
+struct seccomp_state *seccomp_state_new()
+{
+	struct seccomp_state *new = kzalloc(sizeof(struct seccomp_state),
+					    GFP_KERNEL);
+	if (!new)
+		return NULL;
+
+	new->flags = 0;
+#ifdef CONFIG_COMPAT
+	/* Annotate if this filterset is being created by a compat task. */
+	if (is_compat_task())
+		new->flags |= SECCOMP_FLAG_COMPAT;
+#endif
+
+	atomic_set(&new->usage, 1);
+	new->filters = seccomp_filter_table_new();
+	/* Not supported errors are fine, others are a problem. */
+	if (IS_ERR(new->filters) && PTR_ERR(new->filters) != -ENOSYS) {
+		kfree(new);
+		new = NULL;
+	}
+	return new;
+}
+
+/* seccomp_state_dup - copies an existing state object. */
+struct seccomp_state *seccomp_state_dup(const struct seccomp_state *orig)
+{
+	int err;
+	struct seccomp_state *new_state = seccomp_state_new();
+
+	err = -ENOMEM;
+	if (!new_state)
+		goto fail;
+	new_state->mode = orig->mode;
+	/* Flag copying will hide if the new process is a compat task.  However,
+	 * if the rule was compat/non-compat and the process is the opposite,
+	 * enforcement will terminate it.
+	 */
+	new_state->flags = orig->flags;
+	err = seccomp_copy_all_filters(new_state->filters,
+				       orig->filters);
+	if (err)
+		goto fail;
+
+	return new_state;
+fail:
+	put_seccomp_state(new_state);
+	return NULL;
+}
+
+/* get_seccomp_state - increments the reference count of @orig */
+struct seccomp_state *get_seccomp_state(struct seccomp_state *orig)
+{
+	if (!orig)
+		return NULL;
+	atomic_inc(&orig->usage);
+	return orig;
+}
+
+static void __put_seccomp_state(struct seccomp_state *orig)
+{
+	WARN_ON(atomic_read(&orig->usage));
+	seccomp_drop_all_filters(orig);
+	kfree(orig);
+}
+
+/* put_seccomp_state - decrements the reference count of @orig and may free. */
+void put_seccomp_state(struct seccomp_state *orig)
+{
+	if (!orig)
+		return;
+
+	if (atomic_dec_and_test(&orig->usage))
+		__put_seccomp_state(orig);
+}
+
 long prctl_get_seccomp(void)
 {
-	return current->seccomp.mode;
+	if (!current->seccomp)
+		return 0;
+	return current->seccomp->mode;
 }
 
-long prctl_set_seccomp(unsigned long seccomp_mode)
+long prctl_set_seccomp(unsigned long seccomp_mode, unsigned long flags)
 {
 	long ret;
+	struct seccomp_state *state, *cur_state;
 
+	cur_state = get_seccomp_state(current->seccomp);
 	/* can set it only once to be even more secure */
 	ret = -EPERM;
-	if (unlikely(current->seccomp.mode))
+	if (cur_state && unlikely(cur_state->mode))
 		goto out;
 
 	ret = -EINVAL;
-	if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) {
-		current->seccomp.mode = seccomp_mode;
-		set_thread_flag(TIF_SECCOMP);
+	if (seccomp_mode <= 0 || seccomp_mode > NR_SECCOMP_MODES)
+		goto out;
+
+	ret = -ENOMEM;
+	state = (cur_state ? seccomp_state_dup(cur_state) :
+			     seccomp_state_new());
+	if (!state)
+		goto out;
+
+	if (seccomp_mode == PR_SECCOMP_MODE_STRICT) {
 #ifdef TIF_NOTSC
 		disable_TSC();
 #endif
-		ret = 0;
 	}
 
- out:
+	rcu_assign_pointer(current->seccomp, state);
+	synchronize_rcu();
+	put_seccomp_state(cur_state);	/* For the task */
+
+	/* Convert the ABI flag to the internal flag value. */
+	if (seccomp_mode == PR_SECCOMP_MODE_FILTER &&
+	    (flags & PR_SECCOMP_FLAG_FILTER_ON_EXEC))
+		state->flags |= SECCOMP_FLAG_ON_EXEC;
+	/* Encourage flag values to stay synchronized explicitly. */
+	BUILD_BUG_ON(PR_SECCOMP_FLAG_FILTER_ON_EXEC != SECCOMP_FLAG_ON_EXEC);
+
+	/* Only set the thread flag once after the new state is in place. */
+	state->mode = seccomp_mode;
+	set_thread_flag(TIF_SECCOMP);
+	ret = 0;
+
+out:
+	put_seccomp_state(cur_state);	/* for the get */
+	return ret;
+}
+
+long prctl_set_seccomp_filter(unsigned long syscall_nr,
+			      char __user *user_filter)
+{
+	int nr;
+	long ret;
+	char filter[SECCOMP_MAX_FILTER_LENGTH];
+
+	ret = -EINVAL;
+	if (syscall_nr >= NR_syscalls || syscall_nr < 0)
+		goto out;
+
+	ret = -EFAULT;
+	if (!user_filter ||
+	    strncpy_from_user(filter, user_filter,
+			      sizeof(filter) - 1) < 0)
+		goto out;
+
+	nr = (int) syscall_nr;
+	ret = seccomp_set_filter(nr, filter);
+
+out:
+	return ret;
+}
+
+long prctl_clear_seccomp_filter(unsigned long syscall_nr)
+{
+	int nr = -1;
+	long ret;
+
+	ret = -EINVAL;
+	if (syscall_nr >= NR_syscalls || syscall_nr < 0)
+		goto out;
+
+	nr = (int) syscall_nr;
+	ret = seccomp_clear_filter(nr);
+
+out:
+	return ret;
+}
+
+long prctl_get_seccomp_filter(unsigned long syscall_nr, char __user *dst,
+			      unsigned long available)
+{
+	int ret, nr;
+	unsigned long copied;
+	char *buf = NULL;
+	ret = -EINVAL;
+	if (!available)
+		goto out;
+	/* Ignore extra buffer space. */
+	if (available > SECCOMP_MAX_FILTER_LENGTH)
+		available = SECCOMP_MAX_FILTER_LENGTH;
+
+	ret = -EINVAL;
+	if (syscall_nr >= NR_syscalls || syscall_nr < 0)
+		goto out;
+	nr = (int) syscall_nr;
+
+	ret = -ENOMEM;
+	buf = kmalloc(available, GFP_KERNEL);
+	if (!buf)
+		goto out;
+
+	ret = seccomp_get_filter(nr, buf, available);
+	if (ret < 0)
+		goto out;
+
+	/* Include the NUL byte in the copy. */
+	copied = copy_to_user(dst, buf, ret + 1);
+	ret = -ENOSPC;
+	if (copied)
+		goto out;
+
+	ret = 0;
+out:
+	kfree(buf);
 	return ret;
 }
diff --git a/kernel/seccomp.h b/kernel/seccomp.h
new file mode 100644
index 0000000..5abd219
--- /dev/null
+++ b/kernel/seccomp.h
@@ -0,0 +1,74 @@
+/*
+ * seccomp/seccomp_filter shared internal prototypes and state.
+ *
+ * Copyright (C) 2011 Chromium OS Authors.
+ */
+
+#ifndef __KERNEL_SECCOMP_H
+#define __KERNEL_SECCOMP_H
+
+#include <linux/ftrace_event.h>
+#include <linux/seccomp.h>
+
+/* #define SECCOMP_DEBUG 1 */
+#define NR_SECCOMP_MODES 2
+
+/* Inherit the max filter length from the filtering engine. */
+#define SECCOMP_MAX_FILTER_LENGTH MAX_FILTER_STR_VAL
+
+/* Presently, flags only affect SECCOMP_FILTER. */
+#define _SECCOMP_FLAG_COMPAT	0
+#define _SECCOMP_FLAG_ON_EXEC	1
+
+#define SECCOMP_FLAG_COMPAT	(1 << (_SECCOMP_FLAG_COMPAT))
+#define SECCOMP_FLAG_ON_EXEC	(1 << (_SECCOMP_FLAG_ON_EXEC))
+
+
+#ifdef CONFIG_SECCOMP_FILTER
+
+#define SECCOMP_FILTER_HASH_BITS 4
+#define SECCOMP_FILTER_HASH_SIZE (1 << SECCOMP_FILTER_HASH_BITS)
+
+struct seccomp_filter_table;
+extern struct seccomp_filter_table *seccomp_filter_table_new(void);
+extern int seccomp_copy_all_filters(struct seccomp_filter_table *,
+				    const struct seccomp_filter_table *);
+extern void seccomp_drop_all_filters(struct seccomp_state *);
+
+extern int seccomp_test_filters(struct seccomp_state *, int);
+extern int seccomp_maybe_apply_filters(struct task_struct *, int);
+extern void seccomp_filter_log_failure(int);
+
+#else /* CONFIG_SECCOMP_FILTER */
+
+static inline void seccomp_filter_log_failure(int syscall)
+{
+}
+
+static inline int seccomp_maybe_apply_filters(struct task_struct *tsk,
+					      int syscall_nr)
+{
+	return -ENOSYS;
+}
+
+static inline struct seccomp_filter_table *seccomp_filter_table_new(void)
+{
+	return ERR_PTR(-ENOSYS);
+}
+
+static inline int seccomp_test_filters(struct seccomp_state *state, int nr)
+{
+	return -ENOSYS;
+}
+
+extern inline int seccomp_copy_all_filters(struct seccomp_filter_table *dst,
+					const struct seccomp_filter_table *src)
+{
+	return 0;
+}
+
+static inline void seccomp_drop_all_filters(struct seccomp_state *state) { }
+
+#endif /* CONFIG_SECCOMP_FILTER */
+
+#endif /* __KERNEL_SECCOMP_H */
diff --git a/kernel/seccomp_filter.c b/kernel/seccomp_filter.c
new file mode 100644
index 0000000..ff4e055
--- /dev/null
+++ b/kernel/seccomp_filter.c
@@ -0,0 +1,581 @@
+/* filter engine-based seccomp system call filtering.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) 2011 The Chromium OS Authors <chromium-os-dev@chromium.org>
+ */
+
+#include <linux/compat.h>
+#include <linux/errno.h>
+#include <linux/hash.h>
+#include <linux/prctl.h>
+#include <linux/seccomp.h>
+#include <linux/seq_file.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+#include <asm/syscall.h>
+#include <trace/syscall.h>
+
+#include "seccomp.h"
+
+#define SECCOMP_MAX_FILTER_COUNT 512
+#define SECCOMP_WILDCARD_FILTER "1"
+
+struct seccomp_filter {
+	struct hlist_node node;
+	int syscall_nr;
+	struct syscall_metadata *data;
+	struct event_filter *event_filter;
+};
+
+struct seccomp_filter_table {
+	struct hlist_head heads[SECCOMP_FILTER_HASH_SIZE];
+	int count;
+};
+
+struct seccomp_filter_table *seccomp_filter_table_new(void)
+{
+	struct seccomp_filter_table *t =
+		kzalloc(sizeof(struct seccomp_filter_table), GFP_KERNEL);
+	if (!t)
+		return ERR_PTR(-ENOMEM);
+	return t;
+}
+
+static inline u32 syscall_hash(int syscall_nr)
+{
+	return hash_32(syscall_nr, SECCOMP_FILTER_HASH_BITS);
+}
+
+static const char *get_filter_string(struct seccomp_filter *f)
+{
+	const char *str = SECCOMP_WILDCARD_FILTER;
+	if (!f)
+		return NULL;
+
+	/* Missing event filters qualify as wildcard matches. */
+	if (!f->event_filter)
+		return str;
+
+#ifdef CONFIG_FTRACE_SYSCALLS
+	str = ftrace_get_filter_string(f->event_filter);
+#endif
+	return str;
+}
+
+static struct seccomp_filter *alloc_seccomp_filter(int syscall_nr,
+						   const char *filter_string)
+{
+	int err = -ENOMEM;
+	struct seccomp_filter *filter = kzalloc(sizeof(struct seccomp_filter),
+						GFP_KERNEL);
+	if (!filter)
+		goto fail;
+
+	INIT_HLIST_NODE(&filter->node);
+	filter->syscall_nr = syscall_nr;
+	filter->data = syscall_nr_to_meta(syscall_nr);
+
+	/* Treat a filter of SECCOMP_WILDCARD_FILTER as a wildcard and skip
+	 * using a predicate at all.
+	 */
+	if (!strcmp(SECCOMP_WILDCARD_FILTER, filter_string))
+		goto out;
+
+	/* Argument-based filtering only works on ftrace-hooked syscalls. */
+	if (!filter->data) {
+		err = -ENOSYS;
+		goto fail;
+	}
+
+#ifdef CONFIG_FTRACE_SYSCALLS
+	err = ftrace_parse_filter(&filter->event_filter,
+				  filter->data->enter_event->event.type,
+				  filter_string);
+	if (err)
+		goto fail;
+#endif
+
+out:
+	return filter;
+
+fail:
+	kfree(filter);
+	return ERR_PTR(err);
+}
+
+static void free_seccomp_filter(struct seccomp_filter *filter)
+{
+#ifdef CONFIG_FTRACE_SYSCALLS
+	ftrace_free_filter(filter->event_filter);
+#endif
+	kfree(filter);
+}
+
+static struct seccomp_filter *copy_seccomp_filter(struct seccomp_filter *orig)
+{
+	return alloc_seccomp_filter(orig->syscall_nr, get_filter_string(orig));
+}
+
+/* Returns the matching filter or NULL */
+static struct seccomp_filter *find_filter(struct seccomp_state *state,
+					  int syscall)
+{
+	struct hlist_node *this, *pos;
+	struct seccomp_filter *filter = NULL;
+
+	u32 head = syscall_hash(syscall);
+	if (head >= SECCOMP_FILTER_HASH_SIZE)
+		goto out;
+
+	hlist_for_each_safe(this, pos, &state->filters->heads[head]) {
+		filter = hlist_entry(this, struct seccomp_filter, node);
+		if (filter->syscall_nr == syscall)
+			goto out;
+	}
+
+	filter = NULL;
+
+out:
+	return filter;
+}
+
+/* Safely drops all filters for a given syscall. This should only be called
+ * on unattached seccomp_state objects.
+ */
+static void drop_filter(struct seccomp_state *state, int syscall_nr)
+{
+	struct seccomp_filter *filter = find_filter(state, syscall_nr);
+	if (!filter)
+		return;
+
+	WARN_ON(state->filters->count == 0);
+	state->filters->count--;
+	hlist_del(&filter->node);
+	free_seccomp_filter(filter);
+}
+
+/* This should only be called on unattached seccomp_state objects. */
+static int add_filter(struct seccomp_state *state, int syscall_nr,
+		      char *filter_string)
+{
+	struct seccomp_filter *filter;
+	struct hlist_head *head;
+	char merged[SECCOMP_MAX_FILTER_LENGTH];
+	int ret;
+	u32 hash = syscall_hash(syscall_nr);
+
+	ret = -EINVAL;
+	if (state->filters->count == SECCOMP_MAX_FILTER_COUNT - 1)
+		goto out;
+
+	filter_string = strstrip(filter_string);
+
+	/* Disallow empty strings. */
+	if (filter_string[0] == 0)
+		goto out;
+
+	/* Get the right list head. */
+	head = &state->filters->heads[hash];
+
+	/* Find out if there is an existing entry to append to and
+	 * build the resultant filter string.  The original filter can be
+	 * destroyed here since the caller should be operating on a copy.
+	 */
+	filter = find_filter(state, syscall_nr);
+	if (filter) {
+		int expected = snprintf(merged, sizeof(merged), "(%s) && %s",
+					get_filter_string(filter),
+					filter_string);
+		ret = -E2BIG;
+		if (expected >= sizeof(merged) || expected < 0)
+			goto out;
+		filter_string = merged;
+		hlist_del(&filter->node);
+		free_seccomp_filter(filter);
+	}
+
+	/* When in seccomp filtering mode, only allow additions. */
+	ret = -EACCES;
+	if (filter == NULL && state->mode == PR_SECCOMP_MODE_FILTER)
+		goto out;
+
+	ret = 0;
+	filter = alloc_seccomp_filter(syscall_nr, filter_string);
+	if (IS_ERR(filter)) {
+		ret = PTR_ERR(filter);
+		goto out;
+	}
+
+	state->filters->count++;
+	hlist_add_head(&filter->node, head);
+out:
+	return ret;
+}
+
+/* Wrap optional ftrace syscall support. Returns 1 on match or if ftrace is not
+ * supported.
+ */
+static int do_ftrace_syscall_match(struct event_filter *event_filter)
+{
+	int err = 1;
+#ifdef CONFIG_FTRACE_SYSCALLS
+	uint8_t syscall_state[64];
+
+	memset(syscall_state, 0, sizeof(syscall_state));
+
+	/* The generic tracing entry can remain zeroed. */
+	err = ftrace_syscall_enter_state(syscall_state, sizeof(syscall_state),
+					 NULL);
+	if (err)
+		return 0;
+
+	err = filter_match_preds(event_filter, syscall_state);
+#endif
+	return err;
+}
+
+/* 1 on match, 0 otherwise. */
+static int filter_match_current(struct seccomp_filter *filter)
+{
+	/* If no event filter exists, we assume a wildcard match. */
+	if (!filter->event_filter)
+		return 1;
+
+	return do_ftrace_syscall_match(filter->event_filter);
+}
+
+#ifndef KSTK_EIP
+#define KSTK_EIP(x) 0L
+#endif
+
+static const char *syscall_nr_to_name(int syscall)
+{
+	const char *syscall_name = "unknown";
+	struct syscall_metadata *data = syscall_nr_to_meta(syscall);
+	if (data)
+		syscall_name = data->name;
+	return syscall_name;
+}
+
+void seccomp_filter_log_failure(int syscall)
+{
+	printk(KERN_INFO
+		"%s[%d]: system call %d (%s) blocked at ip:%lx\n",
+		current->comm, task_pid_nr(current), syscall,
+		syscall_nr_to_name(syscall), KSTK_EIP(current));
+}
+
+/**
+ * seccomp_drop_all_filters - cleans up the filter list and frees the table
+ * @state: the seccomp_state to destroy the filters in.
+ */
+void seccomp_drop_all_filters(struct seccomp_state *state)
+{
+	struct hlist_node *this, *pos;
+	int head;
+	if (!state->filters)
+		return;
+	for (head = 0; head < SECCOMP_FILTER_HASH_SIZE; ++head) {
+		hlist_for_each_safe(this, pos, &state->filters->heads[head]) {
+			struct seccomp_filter *f = hlist_entry(this,
+						struct seccomp_filter, node);
+			WARN_ON(state->filters->count == 0);
+			hlist_del(this);
+			free_seccomp_filter(f);
+			state->filters->count--;
+		}
+	}
+	kfree(state->filters);
+}
+
+/**
+ * seccomp_copy_all_filters - copies all filters from src to dst.
+ *
+ * @dst: seccomp_filter_table to populate.
+ * @src: table to read from.
+ * Returns non-zero on failure.
+ * Both the source and the destination should have no simultaneous
+ * writers, and dst should be exclusive to the caller.
+ */
+int seccomp_copy_all_filters(struct seccomp_filter_table *dst,
+			     const struct seccomp_filter_table *src)
+{
+	struct seccomp_filter *filter;
+	int head, ret = 0;
+	BUG_ON(!dst || !src);
+	for (head = 0; head < SECCOMP_FILTER_HASH_SIZE; ++head) {
+		struct hlist_node *pos;
+		hlist_for_each_entry(filter, pos, &src->heads[head], node) {
+			struct seccomp_filter *new_filter =
+				copy_seccomp_filter(filter);
+			if (IS_ERR(new_filter)) {
+				ret = PTR_ERR(new_filter);
+				goto done;
+			}
+			hlist_add_head(&new_filter->node,
+				       &dst->heads[head]);
+			dst->count++;
+		}
+	}
+
+done:
+	return ret;
+}
+
+/**
+ * seccomp_show_filters - prints the filter state to a seq_file
+ * @state: the seccomp_state to enumerate the filter and bitmask of
+ * @m: the prepared seq_file to receive the data
+ *
+ * Returns 0 on a successful write.
+ */
+int seccomp_show_filters(struct seccomp_state *state, struct seq_file *m)
+{
+	int head;
+	struct hlist_node *pos;
+	struct seccomp_filter *filter;
+	int filtering = 0;
+	if (!state)
+		return 0;
+	if (!state->filters)
+		return 0;
+
+	filtering = (state->mode == 2);
+	filtering &= !(state->flags & SECCOMP_FLAG_ON_EXEC);
+	seq_printf(m, "Filtering: %d\n", filtering);
+	seq_printf(m, "FilterCount: %d\n", state->filters->count);
+	for (head = 0; head < SECCOMP_FILTER_HASH_SIZE; ++head) {
+		hlist_for_each_entry(filter, pos, &state->filters->heads[head],
+				     node) {
+			seq_printf(m, "SystemCall: %d (%s)\n",
+				      filter->syscall_nr,
+				      syscall_nr_to_name(filter->syscall_nr));
+			seq_printf(m, "Filter: %s\n",
+				      get_filter_string(filter));
+		}
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(seccomp_show_filters);
+
+/**
+ * seccomp_maybe_apply_filters - conditionally applies seccomp filters
+ * @tsk: task to update
+ * @syscall_nr: current system call in progress
+ * tsk must already be in seccomp filter mode.
+ *
+ * Returns 0 if the call should be allowed or state has been updated.
+ * This call is only reached if no filters matched the current system call.
+ * In some cases, such as when the ON_EXEC flag is set, failure should
+ * not be terminal.
+ */
+int seccomp_maybe_apply_filters(struct task_struct *tsk, int syscall_nr)
+{
+	struct seccomp_state *state, *new_state = NULL;
+	int ret = -EACCES;
+
+	/* There's no question of application if ON_EXEC is not set. */
+	state = get_seccomp_state(tsk->seccomp);
+	if ((state->flags & SECCOMP_FLAG_ON_EXEC) == 0)
+		goto out;
+
+	ret = 0;
+	if (syscall_nr != __NR_execve)
+		goto out;
+
+	new_state = seccomp_state_dup(state);
+	ret = -ENOMEM;
+	if (!new_state)
+		goto out;
+
+	ret = 0;
+	new_state->flags &= ~(SECCOMP_FLAG_ON_EXEC);
+	rcu_assign_pointer(tsk->seccomp, new_state);
+	synchronize_rcu();
+	put_seccomp_state(state); /* for the task */
+
+out:
+	put_seccomp_state(state); /* for the get */
+	return ret;
+}
+
+/**
+ * seccomp_test_filters - tests 'current' against the given syscall
+ * @state: seccomp_state of current to use.
+ * @syscall: number of the system call to test
+ *
+ * Returns 0 on ok and non-zero on error/failure.
+ */
+int seccomp_test_filters(struct seccomp_state *state, int syscall)
+{
+	struct seccomp_filter *filter = NULL;
+	int ret;
+
+#ifdef CONFIG_COMPAT
+	ret = -EPERM;
+	if (is_compat_task() == !!(state->flags & SECCOMP_FLAG_COMPAT)) {
+		printk(KERN_INFO "%s[%d]: seccomp filter compat() mismatch.\n",
+		       current->comm, task_pid_nr(current));
+		goto out;
+	}
+#endif
+
+	ret = 0;
+	filter = find_filter(state, syscall);
+	if (filter && filter_match_current(filter))
+		goto out;
+
+	ret = -EACCES;
+out:
+	return ret;
+}
+
+/**
+ * seccomp_get_filter - copies the filter_string into "buf"
+ * @syscall_nr: system call number to look up
+ * @buf: destination buffer
+ * @bufsize: available space in the buffer.
+ *
+ * Looks up the filter for the given system call number on current.  If found,
+ * the string length of the NUL-terminated buffer is returned and < 0 is
+ * returned on error. The NUL byte is not included in the length.
+ */
+long seccomp_get_filter(int syscall_nr, char *buf, unsigned long bufsize)
+{
+	struct seccomp_state *state;
+	struct seccomp_filter *filter;
+	long ret = -ENOENT;
+
+	if (bufsize > SECCOMP_MAX_FILTER_LENGTH)
+		bufsize = SECCOMP_MAX_FILTER_LENGTH;
+
+	state = get_seccomp_state(current->seccomp);
+	if (!state)
+		goto out;
+
+	filter = find_filter(state, syscall_nr);
+	if (!filter)
+		goto out;
+
+	ret = strlcpy(buf, get_filter_string(filter), bufsize);
+	if (ret >= bufsize) {
+		ret = -ENOSPC;
+		goto out;
+	}
+	/* Zero out any remaining buffer, just in case. */
+	memset(buf + ret, 0, bufsize - ret);
+out:
+	put_seccomp_state(state);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(seccomp_get_filter);
+
+/**
+ * seccomp_clear_filter: clears the seccomp filter for a syscall.
+ * @syscall_nr: the system call number to clear filters for.
+ *
+ * (acts as a frontend for seccomp_set_filter. All restrictions
+ *  apply)
+ *
+ * Returns 0 on success.
+ */
+long seccomp_clear_filter(int syscall_nr)
+{
+	return seccomp_set_filter(syscall_nr, NULL);
+}
+EXPORT_SYMBOL_GPL(seccomp_clear_filter);
+
+/**
+ * seccomp_set_filter: - Adds/extends a seccomp filter for a syscall.
+ * @syscall_nr: system call number to apply the filter to.
+ * @filter: ftrace filter string to apply.
+ *
+ * Context: User context only. This function may sleep on allocation and
+ *          operates on current. current must be attempting a system call
+ *          when this is called.
+ *
+ * New filters may be added for system calls when the current task is
+ * not in a secure computing mode (seccomp).  Otherwise, filters may only
+ * be added to already filtered system call entries.  Any additions will
+ * be &&'d with the existing filter string to ensure no expansion of privileges
+ * will be possible.
+ *
+ * Returns 0 on success or an errno on failure.
+ */
+long seccomp_set_filter(int syscall_nr, char *filter)
+{
+	struct seccomp_state *state, *orig_state;
+	long ret = -EINVAL;
+
+	orig_state = get_seccomp_state(current->seccomp);
+
+	/* Prior to mutating the state, create a duplicate to avoid modifying
+	 * the behavior of other instances sharing the state and ensure
+	 * consistency.
+	 */
+	state = (orig_state ? seccomp_state_dup(orig_state) :
+			      seccomp_state_new());
+	if (!state) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/* A NULL filter doubles as a drop value, but the exposed prctl
+	 * interface requires a trip through seccomp_clear_filter().
+	 * Filter dropping is allowed across the is_compat_task() barrier.
+	 */
+	ret = 0;
+	if (filter == NULL) {
+		drop_filter(state, syscall_nr);
+		goto assign;
+	}
+
+	/* Avoid ambiguous filters which may have been inherited from a parent
+	 * with different syscall numbers for the logically same calls.
+	 */
+#ifdef CONFIG_COMPAT
+	ret = -EACCES;
+	if (is_compat_task() != !!(state->flags & SECCOMP_FLAG_COMPAT)) {
+		if (state->filters->count)
+			goto free_state;
+		/* It's safe to add if there are no existing ambiguous rules.*/
+		if (is_compat_task())
+			state->flags |= SECCOMP_FLAG_COMPAT;
+		else
+			state->flags &= ~(SECCOMP_FLAG_COMPAT);
+	}
+#endif
+
+	ret = add_filter(state, syscall_nr, filter);
+	if (ret)
+		goto free_state;
+
+assign:
+	rcu_assign_pointer(current->seccomp, state);
+	synchronize_rcu();
+	put_seccomp_state(orig_state);  /* for the task */
+out:
+	put_seccomp_state(orig_state);  /* for the get */
+	return ret;
+
+free_state:
+	put_seccomp_state(orig_state);  /* for the get */
+	put_seccomp_state(state);  /* drop the dup/new */
+	return ret;
+}
+EXPORT_SYMBOL_GPL(seccomp_set_filter);
diff --git a/kernel/sys.c b/kernel/sys.c
index af468ed..d29003a 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1698,12 +1698,23 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 		case PR_SET_ENDIAN:
 			error = SET_ENDIAN(me, arg2);
 			break;
-
 		case PR_GET_SECCOMP:
 			error = prctl_get_seccomp();
 			break;
 		case PR_SET_SECCOMP:
-			error = prctl_set_seccomp(arg2);
+			error = prctl_set_seccomp(arg2, arg3);
+			break;
+		case PR_SET_SECCOMP_FILTER:
+			error = prctl_set_seccomp_filter(arg2,
+							 (char __user *) arg3);
+			break;
+		case PR_CLEAR_SECCOMP_FILTER:
+			error = prctl_clear_seccomp_filter(arg2);
+			break;
+		case PR_GET_SECCOMP_FILTER:
+			error = prctl_get_seccomp_filter(arg2,
+							 (char __user *) arg3,
+							 arg4);
 			break;
 		case PR_GET_TSC:
 			error = GET_TSC_CTL(arg2);
-- 
1.7.0.4

^ permalink raw reply related	[flat|nested] 406+ messages in thread

* [PATCH 4/5] v2 seccomp_filter: add process state reporting
  2011-04-28  3:08 ` [PATCH 4/7] seccomp_filter: add process state reporting Will Drewry
  2011-04-28  3:21   ` KOSAKI Motohiro
@ 2011-05-12  3:04   ` Will Drewry
  1 sibling, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-05-12  3:04 UTC (permalink / raw)
  To: linux-kernel
  Cc: Steven Rostedt, Serge E. Hallyn, kees.cook, eparis, agl, mingo,
	jmorris, Frederic Weisbecker, Ingo Molnar, Andrew Morton,
	Tejun Heo, Michal Marek, Oleg Nesterov, Roland McGrath,
	Peter Zijlstra, Jiri Slaby, David Howells, Will Drewry

Adds seccomp_filter status reporting to proc.
/proc/<pid>/seccomp_filter will provide read-only access to the current
filter set.

v2: removed status entry, added seccomp file.
    (requested by kosaki.motohiro@jp.fujitsu.com)
    allowed S_IRUGO reading of entries
    (requested by viro@zeniv.linux.org.uk)
    added flags
    got rid of the seccomp_t type
    dropped seccomp file

Signed-off-by: Will Drewry <wad@chromium.org>
---
 fs/proc/base.c |   25 +++++++++++++++++++++++++
 1 files changed, 25 insertions(+), 0 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index dfa5327..f991d1a 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -73,6 +73,7 @@
 #include <linux/security.h>
 #include <linux/ptrace.h>
 #include <linux/tracehook.h>
+#include <linux/seccomp.h>
 #include <linux/cgroup.h>
 #include <linux/cpuset.h>
 #include <linux/audit.h>
@@ -579,6 +580,24 @@ static int proc_pid_syscall(struct task_struct *task, char *buffer)
 }
 #endif /* CONFIG_HAVE_ARCH_TRACEHOOK */
 
+/*
+ * Print out the current seccomp filter set for the task.
+ */
+#ifdef CONFIG_SECCOMP_FILTER
+int proc_pid_seccomp_filter_show(struct seq_file *m, struct pid_namespace *ns,
+				 struct pid *pid, struct task_struct *task)
+{
+	struct seccomp_state *state;
+
+	rcu_read_lock();
+	state = get_seccomp_state(task->seccomp);
+	rcu_read_unlock();
+	seccomp_show_filters(state, m);
+	put_seccomp_state(state);
+	return 0;
+}
+#endif /* CONFIG_SECCOMP_FILTER */
+
 /************************************************************************/
 /*                       Here the fs part begins                        */
 /************************************************************************/
@@ -2838,6 +2857,9 @@ static const struct pid_entry tgid_base_stuff[] = {
 #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
 	INF("syscall",    S_IRUGO, proc_pid_syscall),
 #endif
+#ifdef CONFIG_SECCOMP_FILTER
+	ONE("seccomp_filter",     S_IRUGO, proc_pid_seccomp_filter_show),
+#endif
 	INF("cmdline",    S_IRUGO, proc_pid_cmdline),
 	ONE("stat",       S_IRUGO, proc_tgid_stat),
 	ONE("statm",      S_IRUGO, proc_pid_statm),
@@ -3180,6 +3202,9 @@ static const struct pid_entry tid_base_stuff[] = {
 #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
 	INF("syscall",   S_IRUGO, proc_pid_syscall),
 #endif
+#ifdef CONFIG_SECCOMP_FILTER
+	ONE("seccomp_filter",     S_IRUGO, proc_pid_seccomp_filter_show),
+#endif
 	INF("cmdline",   S_IRUGO, proc_pid_cmdline),
 	ONE("stat",      S_IRUGO, proc_tid_stat),
 	ONE("statm",     S_IRUGO, proc_pid_statm),
-- 
1.7.0.4


^ permalink raw reply related	[flat|nested] 406+ messages in thread

* [PATCH 5/5] v2 seccomp_filter: Document what seccomp_filter is and how it works.
  2011-05-07  1:58                               ` Will Drewry
@ 2011-05-12  3:04                                 ` Will Drewry
  0 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-05-12  3:04 UTC (permalink / raw)
  To: linux-kernel
  Cc: Steven Rostedt, Frederic Weisbecker, Eric Paris, Ingo Molnar,
	kees.cook, agl, jmorris, Randy Dunlap, Linus Torvalds,
	Andrew Morton, Tom Zanussi, Arnaldo Carvalho de Melo,
	Peter Zijlstra, Thomas Gleixner, Will Drewry

Adds a text file covering what CONFIG_SECCOMP_FILTER is, how it is
implemented presently, and what it may be used for.  In addition,
the limitations and caveats of the proposed implementation are
included.

v2: moved to prctl/
    updated for the v2 syntax.
    adds a note about compat behavior

Signed-off-by: Will Drewry <wad@chromium.org>
---
 Documentation/prctl/seccomp_filter.txt |  156 ++++++++++++++++++++++++++++++++
 1 files changed, 156 insertions(+), 0 deletions(-)
 create mode 100644 Documentation/prctl/seccomp_filter.txt

diff --git a/Documentation/prctl/seccomp_filter.txt b/Documentation/prctl/seccomp_filter.txt
new file mode 100644
index 0000000..4c1686a
--- /dev/null
+++ b/Documentation/prctl/seccomp_filter.txt
@@ -0,0 +1,156 @@
+		Seccomp filtering
+		=================
+
+Introduction
+------------
+
+A large number of system calls are exposed to every userland process
+with many of them going unused for the entire lifetime of the process.
+As system calls change and mature, bugs are found and eradicated.  A
+certain subset of userland applications benefit by having a reduced set
+of available system calls.  The reduced set reduces the total kernel
+surface exposed to the application.  System call filtering is meant for
+use with those applications.
+
+The implementation currently leverages both the existing seccomp
+infrastructure and the kernel tracing infrastructure.  By centralizing
+hooks for attack surface reduction in seccomp, it is possible to assure
+attention to security that is less relevant in normal ftrace scenarios,
+such as time-of-check, time-of-use attacks.  However, ftrace provides a
+rich, human-friendly environment for interfacing with system call
+specific arguments.  (As such, this requires FTRACE_SYSCALLS for any
+introspective filtering support.)
+
+
+What it isn't
+-------------
+
+System call filtering isn't a sandbox.  It provides a clearly defined
+mechanism for minimizing the exposed kernel surface.  Beyond that,
+policy for logical behavior and information flow should be managed with
+an LSM of your choosing. Filtering based on the ftrace filter engine
+provides further options down this path (avoiding pathological sizes,
+for instance), but it could be misconstrued as a real sandbox.
+
+
+Usage
+-----
+
+An additional seccomp mode is exposed through mode '2',
+PR_SECCOMP_MODE_FILTER.  This mode depends on CONFIG_SECCOMP_FILTER
+which in turn depends on CONFIG_FTRACE_SYSCALLS.
+
+A collection of filters may be supplied via prctl, and the current set
+of filters is exposed in /proc/<pid>/seccomp_filter.
+
+Interacting with seccomp filters can be done through three new prctl calls
+and one existing one.
+
+PR_SET_SECCOMP: A pre-existing option for enabling strict seccomp
+	mode (1) or filtering seccomp. This option now takes an
+	additional "flags" argument.
+
+	Usage:
+		prctl(PR_SET_SECCOMP, 1);
+		prctl(PR_SET_SECCOMP, PR_SECCOMP_MODE_FILTER, 0);
+	Flags:
+	- 0: Empty set.
+	- PR_SECCOMP_FLAG_FILTER_ON_EXEC: Delays seccomp enforcement
+	  (MODE_FILTER only) until an exec() call is seen.
+
+PR_SET_SECCOMP_FILTER: Allows the specification of a new filter for
+	a given system call, by number, and filter string. If
+	CONFIG_FTRACE_SYSCALLS is supported, the filter string may be
+	any valid value for the given system call.  If it is not
+	supported, the filter string may only be "1" or "0".
+
+	All calls to PR_SET_SECCOMP_FILTER for a given system
+	call will append the supplied string to any existing filters.
+	Filter construction looks as follows:
+		(Nothing) + "fd == 1 || fd == 2" => fd == 1 || fd == 2
+		... + "fd != 2" => (fd == 1 || fd == 2) && fd != 2
+		... + "size < 100" =>
+			((fd == 1 || fd == 2) && fd != 2) && size < 100
+	If there is no filter and the seccomp mode has already
+	transitioned to filtering, additions cannot be made.  Filters
+	may only be added that reduce the available kernel surface.
+
+	Usage (per the construction example above):
+		prctl(PR_SET_SECCOMP_FILTER, __NR_write, "fd == 1 || fd == 2");
+		prctl(PR_SET_SECCOMP_FILTER, __NR_write, "fd != 2");
+		prctl(PR_SET_SECCOMP_FILTER, __NR_write, "size < 100");
+
+PR_CLEAR_SECCOMP_FILTER: Removes all filter entries for a given system
+	call number.  When called prior to entering seccomp filtering
+	mode, it allows for new filters to be applied to the same system
+	call.  After transition, however, it completely drops access to
+	the call.
+
+	Usage:
+		prctl(PR_CLEAR_SECCOMP_FILTER, __NR_open);
+
+PR_GET_SECCOMP_FILTER: Returns the aggregated filter string for a system
+	call into a user-supplied buffer of a given length.
+
+	Usage:
+		prctl(PR_GET_SECCOMP_FILTER, __NR_write, buf,
+		      sizeof(buf));
+
+All of the above calls return 0 on success and non-zero on error.
+
+
+Example
+-------
+
+Assume a process would like to cleanly read and write to stdin/out/err
+as well as access its filters after seccomp enforcement begins.  This
+may be done as follows:
+
+  prctl(PR_SET_SECCOMP_FILTER, __NR_read, "fd == 0");
+  prctl(PR_SET_SECCOMP_FILTER, __NR_write, "fd == 1 || fd == 2");
+  prctl(PR_SET_SECCOMP_FILTER, __NR_exit, "1");
+  prctl(PR_SET_SECCOMP_FILTER, __NR_prctl, "1");
+
+  prctl(PR_SET_SECCOMP, PR_SECCOMP_MODE_FILTER, 0);
+
+  /* Do stuff with fdset . . .*/
+
+  /* Drop read access and keep only write access to fd 1. */
+  prctl(PR_CLEAR_SECCOMP_FILTER, __NR_read);
+  prctl(PR_SET_SECCOMP_FILTER, __NR_write, "fd != 2");
+
+  /* Perform any final processing . . . */
+  syscall(__NR_exit, 0);
+
+If the initial setup had been handled through a launcher of some sort,
+the call to PR_SET_SECCOMP may have been replaced with:
+  prctl(PR_SET_SECCOMP, PR_SECCOMP_MODE_FILTER, PR_SECCOMP_FLAG_FILTER_ON_EXEC);
+  /* ... */
+  execve(path, args);
+
+This will continue to allow system calls to proceed uninspected until an
+exec*() call is seen.  From that point onward, the calling process will
+have filters enforced.
+
+
+Caveats
+-------
+
+- The filter event subsystem comes from CONFIG_TRACE_EVENTS, and the
+system call events come from CONFIG_FTRACE_SYSCALLS.  However, if
+neither is available, a filter string of "1" will be honored, and it may
+be removed using PR_CLEAR_SECCOMP_FILTER.  With ftrace filtering,
+calling PR_SET_SECCOMP_FILTER with a filter of "0" would have a similar
+effect but would not be consistent on a kernel without the support.
+
+- Some platforms support a 32-bit userspace with 64-bit kernels.  In
+these cases (CONFIG_COMPAT), system call numbers may not match across
+64-bit and 32-bit system calls.  This may be especially relevant when
+filters are inherited across execution contexts.  If filters are created
+in a non-compat context then inherited into a compat context, the
+inheriting process will be terminated if seccomp filtering mode is
+enabled.  If it is not yet enabled, the inheriting process may iterate
+over the available system calls clearing any existing values.  Once no
+filters remain, it can begin setting new filters based on its own
+context.  (This behavior is bidirectional: compat->non-compat,
+non-compat->compat.)
-- 
1.7.0.4


^ permalink raw reply related	[flat|nested] 406+ messages in thread

* Re: [PATCH 5/7] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-05-05 13:14                             ` Serge E. Hallyn
@ 2011-05-12  3:20                               ` Will Drewry
  0 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-05-12  3:20 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: Steven Rostedt, Frederic Weisbecker, Eric Paris, Ingo Molnar,
	linux-kernel, kees.cook, agl, jmorris, Randy Dunlap,
	Linus Torvalds, Andrew Morton, Tom Zanussi,
	Arnaldo Carvalho de Melo, Peter Zijlstra, Thomas Gleixner

On Thu, May 5, 2011 at 6:14 AM, Serge E. Hallyn <serge@hallyn.com> wrote:
> Quoting Will Drewry (wad@chromium.org):
>> In particular, if the userspace code wants to stage some filters and
>> apply them all at once, when ready, I'm not sure that it makes sense
>> to me to put that complexity in the kernel itself.  For instance,
>
> Hi Will,
>
> just one note - in my original comment I wasn't actually suggesting
> disabling setting of filters through a writeable file - I was only
> suggesting restricting writing to one's own filters file.
>
> All the better if it is possible to get a nice prctl-only
> interface, but if it ends up limiting rule expressiveness (or taking
> years to define an interface) then perhaps we should stick with
> prctl for setting seccomp mode, and a more expressive file interface
> for defining filters.

Didn't want you to think I missed this -- thanks for clarifying!  I've
attempted to pull together a prctl interface that balances the
directions proposed by Eric, Steven, Frederic, and co.  Upon
reflection of the /proc interface, it seems to have similar
challenges, but if the new patchset tanks and a /proc interface would
have more flexibility, I'll definitely explore that route.  I'd
certainly like to avoid spending years defining this, especially
upfront, and I'll take any guidance as to how to best reach a
reasonable starting place!  (Of course, I'd appreciate feedback on
this round of patches too :)

Thanks!
will

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-12  3:02                     ` Will Drewry
  (?)
@ 2011-05-12  7:48                       ` Ingo Molnar
  -1 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-12  7:48 UTC (permalink / raw)
  To: Will Drewry
  Cc: linux-kernel, Steven Rostedt, Frederic Weisbecker, Eric Paris,
	kees.cook, agl, jmorris, Peter Zijlstra, Serge E. Hallyn,
	Ingo Molnar, Andrew Morton, Tejun Heo, Michal Marek,
	Oleg Nesterov, Roland McGrath, Jiri Slaby, David Howells,
	Russell King, Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, Thomas Gleixner, H. Peter Anvin,
	x86, Peter Zijlstra, linux-arm-kernel, microblaze-uclinux,
	linux-mips, linuxppc-dev, linux-s390, linux-sh, sparclinux,
	Linus Torvalds


Ok, i like the direction here, but i think the ABI should be done differently.

In this patch the ftrace event filter mechanism is used:

* Will Drewry <wad@chromium.org> wrote:

> +static struct seccomp_filter *alloc_seccomp_filter(int syscall_nr,
> +						   const char *filter_string)
> +{
> +	int err = -ENOMEM;
> +	struct seccomp_filter *filter = kzalloc(sizeof(struct seccomp_filter),
> +						GFP_KERNEL);
> +	if (!filter)
> +		goto fail;
> +
> +	INIT_HLIST_NODE(&filter->node);
> +	filter->syscall_nr = syscall_nr;
> +	filter->data = syscall_nr_to_meta(syscall_nr);
> +
> +	/* Treat a filter of SECCOMP_WILDCARD_FILTER as a wildcard and skip
> +	 * using a predicate at all.
> +	 */
> +	if (!strcmp(SECCOMP_WILDCARD_FILTER, filter_string))
> +		goto out;
> +
> +	/* Argument-based filtering only works on ftrace-hooked syscalls. */
> +	if (!filter->data) {
> +		err = -ENOSYS;
> +		goto fail;
> +	}
> +
> +#ifdef CONFIG_FTRACE_SYSCALLS
> +	err = ftrace_parse_filter(&filter->event_filter,
> +				  filter->data->enter_event->event.type,
> +				  filter_string);
> +	if (err)
> +		goto fail;
> +#endif
> +
> +out:
> +	return filter;
> +
> +fail:
> +	kfree(filter);
> +	return ERR_PTR(err);
> +}

Via a prctl() ABI:

> --- a/kernel/sys.c
> +++ b/kernel/sys.c
> @@ -1698,12 +1698,23 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
>  		case PR_SET_ENDIAN:
>  			error = SET_ENDIAN(me, arg2);
>  			break;
> -
>  		case PR_GET_SECCOMP:
>  			error = prctl_get_seccomp();
>  			break;
>  		case PR_SET_SECCOMP:
> -			error = prctl_set_seccomp(arg2);
> +			error = prctl_set_seccomp(arg2, arg3);
> +			break;
> +		case PR_SET_SECCOMP_FILTER:
> +			error = prctl_set_seccomp_filter(arg2,
> +							 (char __user *) arg3);
> +			break;
> +		case PR_CLEAR_SECCOMP_FILTER:
> +			error = prctl_clear_seccomp_filter(arg2);
> +			break;
> +		case PR_GET_SECCOMP_FILTER:
> +			error = prctl_get_seccomp_filter(arg2,
> +							 (char __user *) arg3,
> +							 arg4);

To restrict execution to system calls.

Two observations:

1) We already have a specific ABI for this: you can set filters for events via 
   an event fd.

   Why not extend that mechanism instead and improve *both* your sandboxing
   bits and the events code? This new seccomp code has a lot more
   to do with trace event filters than the minimal old seccomp code ...

   kernel/trace/trace_event_filter.c is 2000 lines of tricky code that
   interprets the ASCII filter expressions. kernel/seccomp.c is 86 lines of
   mostly trivial code.

2) Why should this concept not be made available wider, to allow the 
   restriction of not just system calls but other security relevant components 
   of the kernel as well?

   This too, if you approach the problem via the events code, will be a natural 
   end result, while if you approach it from the seccomp prctl angle it will be
   a limited hack only.

Note, the end result will be the same - just using a different ABI.

So i really think the ABI itself should be closer related to the event code. 
What this "seccomp" code does is that it uses specific syscall events to 
restrict execution of certain event generating codepaths, such as system calls.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-12  7:48                       ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-12  7:48 UTC (permalink / raw)
  To: Will Drewry
  Cc: linux-mips, linux-sh, Peter Zijlstra, Frederic Weisbecker,
	Heiko Carstens, Oleg Nesterov, David Howells, Paul Mackerras,
	Eric Paris, H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390,
	Russell King, x86, jmorris, Linus Torvalds, Ingo Molnar,
	linux-arm-kernel, kees.cook, Serge E. Hallyn, Peter Zijlstra,
	microblaze-uclinux, Steven Rostedt, Martin Schwidefsky,
	Thomas Gleixner, Roland McGrath, Michal Marek, Michal Simek,
	linuxppc-dev, linux-kernel, Ralf Baechle, Paul Mundt, Tejun Heo,
	linux390, Andrew Morton, agl, David S. Miller


Ok, i like the direction here, but i think the ABI should be done differently.

In this patch the ftrace event filter mechanism is used:

* Will Drewry <wad@chromium.org> wrote:

> +static struct seccomp_filter *alloc_seccomp_filter(int syscall_nr,
> +						   const char *filter_string)
> +{
> +	int err = -ENOMEM;
> +	struct seccomp_filter *filter = kzalloc(sizeof(struct seccomp_filter),
> +						GFP_KERNEL);
> +	if (!filter)
> +		goto fail;
> +
> +	INIT_HLIST_NODE(&filter->node);
> +	filter->syscall_nr = syscall_nr;
> +	filter->data = syscall_nr_to_meta(syscall_nr);
> +
> +	/* Treat a filter of SECCOMP_WILDCARD_FILTER as a wildcard and skip
> +	 * using a predicate at all.
> +	 */
> +	if (!strcmp(SECCOMP_WILDCARD_FILTER, filter_string))
> +		goto out;
> +
> +	/* Argument-based filtering only works on ftrace-hooked syscalls. */
> +	if (!filter->data) {
> +		err = -ENOSYS;
> +		goto fail;
> +	}
> +
> +#ifdef CONFIG_FTRACE_SYSCALLS
> +	err = ftrace_parse_filter(&filter->event_filter,
> +				  filter->data->enter_event->event.type,
> +				  filter_string);
> +	if (err)
> +		goto fail;
> +#endif
> +
> +out:
> +	return filter;
> +
> +fail:
> +	kfree(filter);
> +	return ERR_PTR(err);
> +}

Via a prctl() ABI:

> --- a/kernel/sys.c
> +++ b/kernel/sys.c
> @@ -1698,12 +1698,23 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
>  		case PR_SET_ENDIAN:
>  			error = SET_ENDIAN(me, arg2);
>  			break;
> -
>  		case PR_GET_SECCOMP:
>  			error = prctl_get_seccomp();
>  			break;
>  		case PR_SET_SECCOMP:
> -			error = prctl_set_seccomp(arg2);
> +			error = prctl_set_seccomp(arg2, arg3);
> +			break;
> +		case PR_SET_SECCOMP_FILTER:
> +			error = prctl_set_seccomp_filter(arg2,
> +							 (char __user *) arg3);
> +			break;
> +		case PR_CLEAR_SECCOMP_FILTER:
> +			error = prctl_clear_seccomp_filter(arg2);
> +			break;
> +		case PR_GET_SECCOMP_FILTER:
> +			error = prctl_get_seccomp_filter(arg2,
> +							 (char __user *) arg3,
> +							 arg4);

To restrict execution to system calls.

Two observations:

1) We already have a specific ABI for this: you can set filters for events via 
   an event fd.

   Why not extend that mechanism instead and improve *both* your sandboxing
   bits and the events code? This new seccomp code has a lot more
   to do with trace event filters than the minimal old seccomp code ...

   kernel/trace/trace_event_filter.c is 2000 lines of tricky code that
   interprets the ASCII filter expressions. kernel/seccomp.c is 86 lines of
   mostly trivial code.

2) Why should this concept not be made available wider, to allow the 
   restriction of not just system calls but other security relevant components 
   of the kernel as well?

   This too, if you approach the problem via the events code, will be a natural 
   end result, while if you approach it from the seccomp prctl angle it will be
   a limited hack only.

Note, the end result will be the same - just using a different ABI.

So i really think the ABI itself should be closer related to the event code. 
What this "seccomp" code does is that it uses specific syscall events to 
restrict execution of certain event generating codepaths, such as system calls.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-12  7:48                       ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-12  7:48 UTC (permalink / raw)
  To: linux-arm-kernel


Ok, i like the direction here, but i think the ABI should be done differently.

In this patch the ftrace event filter mechanism is used:

* Will Drewry <wad@chromium.org> wrote:

> +static struct seccomp_filter *alloc_seccomp_filter(int syscall_nr,
> +						   const char *filter_string)
> +{
> +	int err = -ENOMEM;
> +	struct seccomp_filter *filter = kzalloc(sizeof(struct seccomp_filter),
> +						GFP_KERNEL);
> +	if (!filter)
> +		goto fail;
> +
> +	INIT_HLIST_NODE(&filter->node);
> +	filter->syscall_nr = syscall_nr;
> +	filter->data = syscall_nr_to_meta(syscall_nr);
> +
> +	/* Treat a filter of SECCOMP_WILDCARD_FILTER as a wildcard and skip
> +	 * using a predicate at all.
> +	 */
> +	if (!strcmp(SECCOMP_WILDCARD_FILTER, filter_string))
> +		goto out;
> +
> +	/* Argument-based filtering only works on ftrace-hooked syscalls. */
> +	if (!filter->data) {
> +		err = -ENOSYS;
> +		goto fail;
> +	}
> +
> +#ifdef CONFIG_FTRACE_SYSCALLS
> +	err = ftrace_parse_filter(&filter->event_filter,
> +				  filter->data->enter_event->event.type,
> +				  filter_string);
> +	if (err)
> +		goto fail;
> +#endif
> +
> +out:
> +	return filter;
> +
> +fail:
> +	kfree(filter);
> +	return ERR_PTR(err);
> +}

Via a prctl() ABI:

> --- a/kernel/sys.c
> +++ b/kernel/sys.c
> @@ -1698,12 +1698,23 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
>  		case PR_SET_ENDIAN:
>  			error = SET_ENDIAN(me, arg2);
>  			break;
> -
>  		case PR_GET_SECCOMP:
>  			error = prctl_get_seccomp();
>  			break;
>  		case PR_SET_SECCOMP:
> -			error = prctl_set_seccomp(arg2);
> +			error = prctl_set_seccomp(arg2, arg3);
> +			break;
> +		case PR_SET_SECCOMP_FILTER:
> +			error = prctl_set_seccomp_filter(arg2,
> +							 (char __user *) arg3);
> +			break;
> +		case PR_CLEAR_SECCOMP_FILTER:
> +			error = prctl_clear_seccomp_filter(arg2);
> +			break;
> +		case PR_GET_SECCOMP_FILTER:
> +			error = prctl_get_seccomp_filter(arg2,
> +							 (char __user *) arg3,
> +							 arg4);

To restrict execution to system calls.

Two observations:

1) We already have a specific ABI for this: you can set filters for events via 
   an event fd.

   Why not extend that mechanism instead and improve *both* your sandboxing
   bits and the events code? This new seccomp code has a lot more
   to do with trace event filters than the minimal old seccomp code ...

   kernel/trace/trace_event_filter.c is 2000 lines of tricky code that
   interprets the ASCII filter expressions. kernel/seccomp.c is 86 lines of
   mostly trivial code.

2) Why should this concept not be made available wider, to allow the 
   restriction of not just system calls but other security relevant components 
   of the kernel as well?

   This too, if you approach the problem via the events code, will be a natural 
   end result, while if you approach it from the seccomp prctl angle it will be
   a limited hack only.

Note, the end result will be the same - just using a different ABI.

So i really think the ABI itself should be closer related to the event code. 
What this "seccomp" code does is that it uses specific syscall events to 
restrict execution of certain event generating codepaths, such as system calls.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-12  7:48                       ` Ingo Molnar
  (?)
@ 2011-05-12  9:24                         ` Kees Cook
  -1 siblings, 0 replies; 406+ messages in thread
From: Kees Cook @ 2011-05-12  9:24 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Will Drewry, linux-kernel, Steven Rostedt, Frederic Weisbecker,
	Eric Paris, agl, jmorris, Peter Zijlstra, Serge E. Hallyn,
	Ingo Molnar, Andrew Morton, Tejun Heo, Michal Marek,
	Oleg Nesterov, Roland McGrath, Jiri Slaby, David Howells,
	Russell King, Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, Thomas Gleixner, H. Peter Anvin,
	x86, Peter Zijlstra, linux-arm-kernel, microblaze-uclinux,
	linux-mips, linuxppc-dev, linux-s390, linux-sh, sparclinux,
	Linus Torvalds

Hi,

On Thu, May 12, 2011 at 09:48:50AM +0200, Ingo Molnar wrote:
> 1) We already have a specific ABI for this: you can set filters for events via 
>    an event fd.
> 
>    Why not extend that mechanism instead and improve *both* your sandboxing
>    bits and the events code? This new seccomp code has a lot more
>    to do with trace event filters than the minimal old seccomp code ...

Would this require privileges to get the event fd to start with? If so,
I would prefer to avoid that, since using prctl() as shown in the patch
set won't require any privs.

-Kees

-- 
Kees Cook
Ubuntu Security Team

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-12  9:24                         ` Kees Cook
  0 siblings, 0 replies; 406+ messages in thread
From: Kees Cook @ 2011-05-12  9:24 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: linux-mips, linux-sh, Peter Zijlstra, Frederic Weisbecker,
	Heiko Carstens, Oleg Nesterov, David Howells, Paul Mackerras,
	Eric Paris, H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390,
	Russell King, x86, jmorris, Linus Torvalds, Ingo Molnar,
	linux-arm-kernel, Serge E. Hallyn, Peter Zijlstra,
	microblaze-uclinux, Steven Rostedt, Martin Schwidefsky,
	Thomas Gleixner, Roland McGrath, Michal Marek, Michal Simek,
	Will Drewry, linuxppc-dev, linux-kernel, Ralf Baechle,
	Paul Mundt, Tejun Heo, linux390, Andrew Morton, agl,
	David S. Miller

Hi,

On Thu, May 12, 2011 at 09:48:50AM +0200, Ingo Molnar wrote:
> 1) We already have a specific ABI for this: you can set filters for events via 
>    an event fd.
> 
>    Why not extend that mechanism instead and improve *both* your sandboxing
>    bits and the events code? This new seccomp code has a lot more
>    to do with trace event filters than the minimal old seccomp code ...

Would this require privileges to get the event fd to start with? If so,
I would prefer to avoid that, since using prctl() as shown in the patch
set won't require any privs.

-Kees

-- 
Kees Cook
Ubuntu Security Team

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-12  9:24                         ` Kees Cook
  0 siblings, 0 replies; 406+ messages in thread
From: Kees Cook @ 2011-05-12  9:24 UTC (permalink / raw)
  To: linux-arm-kernel

Hi,

On Thu, May 12, 2011 at 09:48:50AM +0200, Ingo Molnar wrote:
> 1) We already have a specific ABI for this: you can set filters for events via 
>    an event fd.
> 
>    Why not extend that mechanism instead and improve *both* your sandboxing
>    bits and the events code? This new seccomp code has a lot more
>    to do with trace event filters than the minimal old seccomp code ...

Would this require privileges to get the event fd to start with? If so,
I would prefer to avoid that, since using prctl() as shown in the patch
set won't require any privs.

-Kees

-- 
Kees Cook
Ubuntu Security Team

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-12  9:24                         ` Kees Cook
  (?)
@ 2011-05-12 10:49                           ` Ingo Molnar
  -1 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-12 10:49 UTC (permalink / raw)
  To: Kees Cook
  Cc: Will Drewry, linux-kernel, Steven Rostedt, Frederic Weisbecker,
	Eric Paris, agl, jmorris, Peter Zijlstra, Serge E. Hallyn,
	Ingo Molnar, Andrew Morton, Tejun Heo, Michal Marek,
	Oleg Nesterov, Roland McGrath, Jiri Slaby, David Howells,
	Russell King, Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, Thomas Gleixner, H. Peter Anvin,
	x86, Peter Zijlstra, linux-arm-kernel, microblaze-uclinux,
	linux-mips, linuxppc-dev, linux-s390, linux-sh, sparclinux,
	Linus Torvalds


* Kees Cook <kees.cook@canonical.com> wrote:

> Hi,
> 
> On Thu, May 12, 2011 at 09:48:50AM +0200, Ingo Molnar wrote:
> > 1) We already have a specific ABI for this: you can set filters for events via 
> >    an event fd.
> > 
> >    Why not extend that mechanism instead and improve *both* your sandboxing
> >    bits and the events code? This new seccomp code has a lot more
> >    to do with trace event filters than the minimal old seccomp code ...
> 
> Would this require privileges to get the event fd to start with? [...]

No special privileges with the default perf_events_paranoid value.

> [...] If so, I would prefer to avoid that, since using prctl() as shown in 
> the patch set won't require any privs.

and we could also explicitly allow syscall events without any privileges, 
regardless of the setting of 'perf_events_paranoid' config value.

Obviously a sandboxing host process wants to run with as low privileges as it 
can.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-12 10:49                           ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-12 10:49 UTC (permalink / raw)
  To: Kees Cook
  Cc: linux-mips, linux-sh, Peter Zijlstra, Frederic Weisbecker,
	Heiko Carstens, Oleg Nesterov, David Howells, Paul Mackerras,
	Eric Paris, H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390,
	Russell King, x86, jmorris, Linus Torvalds, Ingo Molnar,
	linux-arm-kernel, Serge E. Hallyn, Peter Zijlstra,
	microblaze-uclinux, Steven Rostedt, Martin Schwidefsky,
	Thomas Gleixner, Roland McGrath, Michal Marek, Michal Simek,
	Will Drewry, linuxppc-dev, linux-kernel, Ralf Baechle,
	Paul Mundt, Tejun Heo, linux390, Andrew Morton, agl,
	David S. Miller


* Kees Cook <kees.cook@canonical.com> wrote:

> Hi,
> 
> On Thu, May 12, 2011 at 09:48:50AM +0200, Ingo Molnar wrote:
> > 1) We already have a specific ABI for this: you can set filters for events via 
> >    an event fd.
> > 
> >    Why not extend that mechanism instead and improve *both* your sandboxing
> >    bits and the events code? This new seccomp code has a lot more
> >    to do with trace event filters than the minimal old seccomp code ...
> 
> Would this require privileges to get the event fd to start with? [...]

No special privileges with the default perf_events_paranoid value.

> [...] If so, I would prefer to avoid that, since using prctl() as shown in 
> the patch set won't require any privs.

and we could also explicitly allow syscall events without any privileges, 
regardless of the setting of 'perf_events_paranoid' config value.

Obviously a sandboxing host process wants to run with as low privileges as it 
can.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-12 10:49                           ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-12 10:49 UTC (permalink / raw)
  To: linux-arm-kernel


* Kees Cook <kees.cook@canonical.com> wrote:

> Hi,
> 
> On Thu, May 12, 2011 at 09:48:50AM +0200, Ingo Molnar wrote:
> > 1) We already have a specific ABI for this: you can set filters for events via 
> >    an event fd.
> > 
> >    Why not extend that mechanism instead and improve *both* your sandboxing
> >    bits and the events code? This new seccomp code has a lot more
> >    to do with trace event filters than the minimal old seccomp code ...
> 
> Would this require privileges to get the event fd to start with? [...]

No special privileges with the default perf_event_paranoid value.

> [...] If so, I would prefer to avoid that, since using prctl() as shown in 
> the patch set won't require any privs.

and we could also explicitly allow syscall events without any privileges, 
regardless of the setting of the 'perf_event_paranoid' config value.

Obviously a sandboxing host process wants to run with as low privileges as it 
can.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-12  3:02                     ` Will Drewry
  (?)
@ 2011-05-12 11:33                       ` James Morris
  -1 siblings, 0 replies; 406+ messages in thread
From: James Morris @ 2011-05-12 11:33 UTC (permalink / raw)
  To: Will Drewry
  Cc: linux-kernel, Steven Rostedt, Frederic Weisbecker, Eric Paris,
	Ingo Molnar, kees.cook, agl, Peter Zijlstra, Serge E. Hallyn,
	Ingo Molnar, Andrew Morton, Tejun Heo, Michal Marek,
	Oleg Nesterov, Roland McGrath, Jiri Slaby, David Howells,
	Russell King, Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, Thomas Gleixner, H. Peter Anvin,
	x86, Peter Zijlstra, linux-arm-kernel, microblaze-uclinux,
	linux-mips, linuxppc-dev, linux-s390, linux-sh, sparclinux

On Wed, 11 May 2011, Will Drewry wrote:

> +void seccomp_filter_log_failure(int syscall)
> +{
> +	printk(KERN_INFO
> +		"%s[%d]: system call %d (%s) blocked at ip:%lx\n",
> +		current->comm, task_pid_nr(current), syscall,
> +		syscall_nr_to_name(syscall), KSTK_EIP(current));
> +}

I think it'd be a good idea to utilize the audit facility here.


- James
-- 
James Morris
<jmorris@namei.org>

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-12 11:33                       ` James Morris
  0 siblings, 0 replies; 406+ messages in thread
From: James Morris @ 2011-05-12 11:33 UTC (permalink / raw)
  To: Will Drewry
  Cc: linux-mips, linux-sh, Peter Zijlstra, Frederic Weisbecker,
	Heiko Carstens, Oleg Nesterov, David Howells, Paul Mackerras,
	Eric Paris, H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390,
	Russell King, x86, Ingo Molnar, linux-arm-kernel, kees.cook,
	Serge E. Hallyn, Peter Zijlstra, microblaze-uclinux,
	Steven Rostedt, Martin Schwidefsky, Thomas Gleixner, Ingo Molnar,
	Roland McGrath, Michal Marek, Michal Simek, linuxppc-dev,
	linux-kernel, Ralf Baechle, Paul Mundt, Tejun Heo, linux390,
	Andrew Morton, agl, David S. Miller

On Wed, 11 May 2011, Will Drewry wrote:

> +void seccomp_filter_log_failure(int syscall)
> +{
> +	printk(KERN_INFO
> +		"%s[%d]: system call %d (%s) blocked at ip:%lx\n",
> +		current->comm, task_pid_nr(current), syscall,
> +		syscall_nr_to_name(syscall), KSTK_EIP(current));
> +}

I think it'd be a good idea to utilize the audit facility here.


- James
-- 
James Morris
<jmorris@namei.org>

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-12 11:33                       ` James Morris
  0 siblings, 0 replies; 406+ messages in thread
From: James Morris @ 2011-05-12 11:33 UTC (permalink / raw)
  To: linux-arm-kernel

On Wed, 11 May 2011, Will Drewry wrote:

> +void seccomp_filter_log_failure(int syscall)
> +{
> +	printk(KERN_INFO
> +		"%s[%d]: system call %d (%s) blocked at ip:%lx\n",
> +		current->comm, task_pid_nr(current), syscall,
> +		syscall_nr_to_name(syscall), KSTK_EIP(current));
> +}

I think it'd be a good idea to utilize the audit facility here.


- James
-- 
James Morris
<jmorris@namei.org>

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-12  7:48                       ` Ingo Molnar
  (?)
@ 2011-05-12 11:44                         ` James Morris
  -1 siblings, 0 replies; 406+ messages in thread
From: James Morris @ 2011-05-12 11:44 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Will Drewry, linux-kernel, Steven Rostedt, Frederic Weisbecker,
	Eric Paris, kees.cook, agl, Peter Zijlstra, Serge E. Hallyn,
	Ingo Molnar, Andrew Morton, Tejun Heo, Michal Marek,
	Oleg Nesterov, Roland McGrath, Jiri Slaby, David Howells,
	Russell King, Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, Thomas Gleixner, H. Peter Anvin,
	x86, Peter Zijlstra, linux-arm-kernel, microblaze-uclinux,
	linux-mips, linuxppc-dev, linux-s390, linux-sh, sparclinux,
	Linus Torvalds

On Thu, 12 May 2011, Ingo Molnar wrote:

> 
> 2) Why should this concept not be made available wider, to allow the 
>    restriction of not just system calls but other security relevant components 
>    of the kernel as well?

Because the aim of this is to reduce the attack surface of the syscall 
interface.

LSM is the correct level of abstraction for general security mediation, 
because it allows you to take into account all relevant security 
information in a race-free context.


>    This too, if you approach the problem via the events code, will be a natural 
>    end result, while if you approach it from the seccomp prctl angle it will be
>    a limited hack only.

I'd say it's a well-defined and readily understandable feature.


- James
-- 
James Morris
<jmorris@namei.org>

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-12 11:44                         ` James Morris
  0 siblings, 0 replies; 406+ messages in thread
From: James Morris @ 2011-05-12 11:44 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: linux-mips, linux-sh, Peter Zijlstra, Frederic Weisbecker,
	Heiko Carstens, Oleg Nesterov, David Howells, Paul Mackerras,
	Eric Paris, H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390,
	Russell King, x86, Linus Torvalds, Ingo Molnar, linux-arm-kernel,
	kees.cook, Serge E. Hallyn, Peter Zijlstra, microblaze-uclinux,
	Steven Rostedt, Martin Schwidefsky, Thomas Gleixner,
	Roland McGrath, Michal Marek, Michal Simek, Will Drewry,
	linuxppc-dev, linux-kernel, Ralf Baechle, Paul Mundt, Tejun Heo,
	linux390, Andrew Morton, agl, David S. Miller

On Thu, 12 May 2011, Ingo Molnar wrote:

> 
> 2) Why should this concept not be made available wider, to allow the 
>    restriction of not just system calls but other security relevant components 
>    of the kernel as well?

Because the aim of this is to reduce the attack surface of the syscall 
interface.

LSM is the correct level of abstraction for general security mediation, 
because it allows you to take into account all relevant security 
information in a race-free context.


>    This too, if you approach the problem via the events code, will be a natural 
>    end result, while if you approach it from the seccomp prctl angle it will be
>    a limited hack only.

I'd say it's a well-defined and readily understandable feature.


- James
-- 
James Morris
<jmorris@namei.org>

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-12 11:44                         ` James Morris
  0 siblings, 0 replies; 406+ messages in thread
From: James Morris @ 2011-05-12 11:44 UTC (permalink / raw)
  To: linux-arm-kernel

On Thu, 12 May 2011, Ingo Molnar wrote:

> 
> 2) Why should this concept not be made available wider, to allow the 
>    restriction of not just system calls but other security relevant components 
>    of the kernel as well?

Because the aim of this is to reduce the attack surface of the syscall 
interface.

LSM is the correct level of abstraction for general security mediation, 
because it allows you to take into account all relevant security 
information in a race-free context.


>    This too, if you approach the problem via the events code, will be a natural 
>    end result, while if you approach it from the seccomp prctl angle it will be
>    a limited hack only.

I'd say it's a well-defined and readily understandable feature.


- James
-- 
James Morris
<jmorris@namei.org>

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-12  7:48                       ` Ingo Molnar
  (?)
@ 2011-05-12 12:15                         ` Frederic Weisbecker
  -1 siblings, 0 replies; 406+ messages in thread
From: Frederic Weisbecker @ 2011-05-12 12:15 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Will Drewry, linux-kernel, Steven Rostedt, Eric Paris, kees.cook,
	agl, jmorris, Peter Zijlstra, Serge E. Hallyn, Ingo Molnar,
	Andrew Morton, Tejun Heo, Michal Marek, Oleg Nesterov,
	Roland McGrath, Jiri Slaby, David Howells, Russell King,
	Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, Thomas Gleixner, H. Peter Anvin,
	x86, Peter Zijlstra, linux-arm-kernel, microblaze-uclinux,
	linux-mips, linuxppc-dev, linux-s390, linux-sh, sparclinux,
	Linus Torvalds

On Thu, May 12, 2011 at 09:48:50AM +0200, Ingo Molnar wrote:
> To restrict execution to system calls.
> 
> Two observations:
> 
> 1) We already have a specific ABI for this: you can set filters for events via 
>    an event fd.
> 
>    Why not extend that mechanism instead and improve *both* your sandboxing
>    bits and the events code? This new seccomp code has a lot more
>    to do with trace event filters than the minimal old seccomp code ...
> 
>    kernel/trace/trace_event_filter.c is 2000 lines of tricky code that
>    interprets the ASCII filter expressions. kernel/seccomp.c is 86 lines of
>    mostly trivial code.
> 
> 2) Why should this concept not be made available wider, to allow the 
>    restriction of not just system calls but other security relevant components 
>    of the kernel as well?
> 
>    This too, if you approach the problem via the events code, will be a natural 
>    end result, while if you approach it from the seccomp prctl angle it will be
>    a limited hack only.
> 
> Note, the end result will be the same - just using a different ABI.
> 
> So i really think the ABI itself should be closer related to the event code. 
> What this "seccomp" code does is that it uses specific syscall events to 
> restrict execution of certain event generating codepaths, such as system calls.
> 
> Thanks,
> 
> 	Ingo

What's positive with that approach is that the code is all there already.
Create a perf event for a given trace event, attach a filter to it.

What needs to be added is an override of the effect of the filter. By default
it's dropping the event, but there may be different flavours, including sending
a signal. All in all, extending the current code to allow that looks trivial.

The negative points are that

* trace events are supposed to stay passive and not act on the system, except
doing some endpoint things like writing to a buffer. We can't call do_exit()
from a tracepoint for example, preemption is disabled there.

* Also, is it actually relevant to extend that seccomp filtering to events other
than syscalls? Exposing kernel events to filtering actually sounds to me like it
brings a new potential security issue. But with fine-grained restrictions this can
probably be dealt with, especially if by default only syscalls can be filtered.

* I think Peter did not want to give such "active" role to perf in the system.

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-12 12:15                         ` Frederic Weisbecker
  0 siblings, 0 replies; 406+ messages in thread
From: Frederic Weisbecker @ 2011-05-12 12:15 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: linux-mips, linux-sh, Peter Zijlstra, Heiko Carstens,
	Oleg Nesterov, David Howells, Paul Mackerras, Eric Paris,
	H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390, Russell King,
	x86, jmorris, Linus Torvalds, Ingo Molnar, linux-arm-kernel,
	kees.cook, Serge E. Hallyn, Peter Zijlstra, microblaze-uclinux,
	Steven Rostedt, Martin Schwidefsky, Thomas Gleixner,
	Roland McGrath, Michal Marek, Michal Simek, Will Drewry,
	linuxppc-dev, linux-kernel, Ralf Baechle, Paul Mundt, Tejun Heo,
	linux390, Andrew Morton, agl, David S. Miller

On Thu, May 12, 2011 at 09:48:50AM +0200, Ingo Molnar wrote:
> To restrict execution to system calls.
> 
> Two observations:
> 
> 1) We already have a specific ABI for this: you can set filters for events via 
>    an event fd.
> 
>    Why not extend that mechanism instead and improve *both* your sandboxing
>    bits and the events code? This new seccomp code has a lot more
>    to do with trace event filters than the minimal old seccomp code ...
> 
>    kernel/trace/trace_event_filter.c is 2000 lines of tricky code that
>    interprets the ASCII filter expressions. kernel/seccomp.c is 86 lines of
>    mostly trivial code.
> 
> 2) Why should this concept not be made available wider, to allow the 
>    restriction of not just system calls but other security relevant components 
>    of the kernel as well?
> 
>    This too, if you approach the problem via the events code, will be a natural 
>    end result, while if you approach it from the seccomp prctl angle it will be
>    a limited hack only.
> 
> Note, the end result will be the same - just using a different ABI.
> 
> So i really think the ABI itself should be closer related to the event code. 
> What this "seccomp" code does is that it uses specific syscall events to 
> restrict execution of certain event generating codepaths, such as system calls.
> 
> Thanks,
> 
> 	Ingo

What's positive with that approach is that the code is all there already.
Create a perf event for a given trace event, attach a filter to it.

What needs to be added is an override of the effect of the filter. By default
it's dropping the event, but there may be different flavours, including sending
a signal. All in all, extending the current code to allow that looks trivial.

The negative points are that

* trace events are supposed to stay passive and not act on the system, except
doing some endpoint things like writing to a buffer. We can't call do_exit()
from a tracepoint for example, preemption is disabled there.

* Also, is it actually relevant to extend that seccomp filtering to events other
than syscalls? Exposing kernel events to filtering actually sounds to me like it
brings a new potential security issue. But with fine-grained restrictions this can
probably be dealt with, especially if by default only syscalls can be filtered.

* I think Peter did not want to give such "active" role to perf in the system.

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-12 12:15                         ` Frederic Weisbecker
  0 siblings, 0 replies; 406+ messages in thread
From: Frederic Weisbecker @ 2011-05-12 12:15 UTC (permalink / raw)
  To: linux-arm-kernel

On Thu, May 12, 2011 at 09:48:50AM +0200, Ingo Molnar wrote:
> To restrict execution to system calls.
> 
> Two observations:
> 
> 1) We already have a specific ABI for this: you can set filters for events via 
>    an event fd.
> 
>    Why not extend that mechanism instead and improve *both* your sandboxing
>    bits and the events code? This new seccomp code has a lot more
>    to do with trace event filters than the minimal old seccomp code ...
> 
>    kernel/trace/trace_event_filter.c is 2000 lines of tricky code that
>    interprets the ASCII filter expressions. kernel/seccomp.c is 86 lines of
>    mostly trivial code.
> 
> 2) Why should this concept not be made available wider, to allow the 
>    restriction of not just system calls but other security relevant components 
>    of the kernel as well?
> 
>    This too, if you approach the problem via the events code, will be a natural 
>    end result, while if you approach it from the seccomp prctl angle it will be
>    a limited hack only.
> 
> Note, the end result will be the same - just using a different ABI.
> 
> So i really think the ABI itself should be closer related to the event code. 
> What this "seccomp" code does is that it uses specific syscall events to 
> restrict execution of certain event generating codepaths, such as system calls.
> 
> Thanks,
> 
> 	Ingo

What's positive with that approach is that the code is all there already.
Create a perf event for a given trace event, attach a filter to it.

What needs to be added is an override of the effect of the filter. By default
it's dropping the event, but there may be different flavours, including sending
a signal. All in all, extending the current code to allow that looks trivial.

The negative points are that

* trace events are supposed to stay passive and not act on the system, except
doing some endpoint things like writing to a buffer. We can't call do_exit()
from a tracepoint for example, preemption is disabled there.

* Also, is it actually relevant to extend that seccomp filtering to events other
than syscalls? Exposing kernel events to filtering actually sounds to me like it
brings a new potential security issue. But with fine-grained restrictions this can
probably be dealt with, especially if by default only syscalls can be filtered.

* I think Peter did not want to give such "active" role to perf in the system.

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-12 11:44                         ` James Morris
  (?)
@ 2011-05-12 13:01                           ` Ingo Molnar
  -1 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-12 13:01 UTC (permalink / raw)
  To: James Morris
  Cc: Will Drewry, linux-kernel, Steven Rostedt, Frederic Weisbecker,
	Eric Paris, kees.cook, agl, Peter Zijlstra, Serge E. Hallyn,
	Ingo Molnar, Andrew Morton, Tejun Heo, Michal Marek,
	Oleg Nesterov, Roland McGrath, Jiri Slaby, David Howells,
	Russell King, Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, Thomas Gleixner, H. Peter Anvin,
	x86, Peter Zijlstra, linux-arm-kernel, microblaze-uclinux,
	linux-mips, linuxppc-dev, linux-s390, linux-sh, sparclinux,
	Linus Torvalds


* James Morris <jmorris@namei.org> wrote:

> On Thu, 12 May 2011, Ingo Molnar wrote:
> 
> > 2) Why should this concept not be made available wider, to allow the 
> >    restriction of not just system calls but other security relevant components 
> >    of the kernel as well?
> 
> Because the aim of this is to reduce the attack surface of the syscall 
> interface.

What i suggest achieves the same, my argument is that we could aim it to be 
even more flexible and even more useful.

> LSM is the correct level of abstraction for general security mediation, 
> because it allows you to take into account all relevant security information 
> in a race-free context.

I don't care about LSM though, i find it poorly designed.

The approach implemented here, the ability for *unprivileged code* to define 
(the seeds of ...) flexible security policies, in a proper Linuxish way, which 
is inherited along the task parent/child hierarchy and which allows nesting 
etc. is a *lot* more flexible.

What Will implemented here is pretty huge in my opinion: it turns security from 
a root-only kind of weird hack into an essential component of its APIs, 
available to *any* app not just the select security policy/mechanism chosen by 
the distributor ...

If implemented properly this could replace LSM in the long run.

As a prctl() hack bound to seccomp (which, by all means, is a natural extension 
to the current seccomp ABI, so perfectly fine if we only want that scope), that 
is much less likely to happen.

And if we merge the seccomp interface prematurely then interest towards a more 
flexible approach will disappear, so either we do it properly now or it will 
take some time for someone to come around and do it ...

Also note that i do not consider the perf events ABI itself cast into stone - 
and we could very well add a new system call for this, independent of perf 
events. I just think that the seccomp scope itself is exciting but looks 
limited to what the real potential of this could be.

> >    This too, if you approach the problem via the events code, will be a natural 
> >    end result, while if you approach it from the seccomp prctl angle it will be
> >    a limited hack only.
> 
> I'd say it's a well-defined and readily understandable feature.

Note, it was me who suggested this very event-filter-engine design a year ago, 
when the first submission still used a crude bitmap of allowed seccomp 
syscalls:

  http://lwn.net/Articles/332974/

Funnily enough, back then you wrote this:

  " I'm concerned that we're seeing yet another security scheme being designed on 
    the fly, without a well-formed threat model, and without taking into account 
    lessons learned from the seemingly endless parade of similar, failed schemes. "

so when and how did your opinion of this scheme turn from it being an "endless 
parade of failed schemes" to it being a "well-defined and readily 
understandable feature"? :-)

The idea itself has not changed since last year, what happened is that the 
filter engine got a couple of new features and Will has separated it out and 
has implemented a working prototype for sandboxing.

What i do here is to suggest *further* steps down the same road, now that we 
see that this scheme can indeed be used to implement sandboxing ... I think 
it's a valid line of inquiry.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-12 13:01                           ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-12 13:01 UTC (permalink / raw)
  To: James Morris
  Cc: linux-mips, linux-sh, Peter Zijlstra, Frederic Weisbecker,
	Heiko Carstens, Oleg Nesterov, David Howells, Paul Mackerras,
	Eric Paris, H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390,
	Russell King, x86, Linus Torvalds, Ingo Molnar, linux-arm-kernel,
	kees.cook, Serge E. Hallyn, Peter Zijlstra, microblaze-uclinux,
	Steven Rostedt, Martin Schwidefsky, Thomas Gleixner,
	Roland McGrath, Michal Marek, Michal Simek, Will Drewry,
	linuxppc-dev, linux-kernel, Ralf Baechle, Paul Mundt, Tejun Heo,
	linux390, Andrew Morton, agl, David S. Miller


* James Morris <jmorris@namei.org> wrote:

> On Thu, 12 May 2011, Ingo Molnar wrote:
> 
> > 2) Why should this concept not be made available wider, to allow the 
> >    restriction of not just system calls but other security relevant components 
> >    of the kernel as well?
> 
> Because the aim of this is to reduce the attack surface of the syscall 
> interface.

What i suggest achieves the same, my argument is that we could aim it to be 
even more flexible and even more useful.

> LSM is the correct level of abstraction for general security mediation, 
> because it allows you to take into account all relevant security information 
> in a race-free context.

I don't care about LSM though, i find it poorly designed.

The approach implemented here, the ability for *unprivileged code* to define 
(the seeds of ...) flexible security policies, in a proper Linuxish way, which 
is inherited along the task parent/child hierarchy and which allows nesting 
etc. is a *lot* more flexible.

What Will implemented here is pretty huge in my opinion: it turns security from 
a root-only kind of weird hack into an essential component of its APIs, 
available to *any* app not just the select security policy/mechanism chosen by 
the distributor ...

If implemented properly this could replace LSM in the long run.

As a prctl() hack bound to seccomp (which, by all means, is a natural extension 
to the current seccomp ABI, so perfectly fine if we only want that scope), that 
is much less likely to happen.

And if we merge the seccomp interface prematurely then interest towards a more 
flexible approach will disappear, so either we do it properly now or it will 
take some time for someone to come around and do it ...

Also note that i do not consider the perf events ABI itself cast into stone - 
and we could very well add a new system call for this, independent of perf 
events. I just think that the seccomp scope itself is exciting but looks 
limited to what the real potential of this could be.

> >    This too, if you approach the problem via the events code, will be a natural 
> >    end result, while if you approach it from the seccomp prctl angle it will be
> >    a limited hack only.
> 
> I'd say it's a well-defined and readily understandable feature.

Note, it was me who suggested this very event-filter-engine design a year ago, 
when the first submission still used a crude bitmap of allowed seccomp 
syscalls:

  http://lwn.net/Articles/332974/

Funnily enough, back then you wrote this:

  " I'm concerned that we're seeing yet another security scheme being designed on 
    the fly, without a well-formed threat model, and without taking into account 
    lessons learned from the seemingly endless parade of similar, failed schemes. "

so when and how did your opinion of this scheme turn from it being an "endless 
parade of failed schemes" to it being a "well-defined and readily 
understandable feature"? :-)

The idea itself has not changed since last year, what happened is that the 
filter engine got a couple of new features and Will has separated it out and 
has implemented a working prototype for sandboxing.

What i do here is to suggest *further* steps down the same road, now that we 
see that this scheme can indeed be used to implement sandboxing ... I think 
it's a valid line of inquiry.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-12 13:01                           ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-12 13:01 UTC (permalink / raw)
  To: linux-arm-kernel


* James Morris <jmorris@namei.org> wrote:

> On Thu, 12 May 2011, Ingo Molnar wrote:
> 
> > 2) Why should this concept not be made available wider, to allow the 
> >    restriction of not just system calls but other security relevant components 
> >    of the kernel as well?
> 
> Because the aim of this is to reduce the attack surface of the syscall 
> interface.

What i suggest achieves the same, my argument is that we could aim it to be 
even more flexible and even more useful.

> LSM is the correct level of abstraction for general security mediation, 
> because it allows you to take into account all relevant security information 
> in a race-free context.

I don't care about LSM though, i find it poorly designed.

The approach implemented here, the ability for *unprivileged code* to define 
(the seeds of ...) flexible security policies, in a proper Linuxish way, which 
is inherited along the task parent/child hierarchy and which allows nesting 
etc. is a *lot* more flexible.

What Will implemented here is pretty huge in my opinion: it turns security from 
a root-only kind of weird hack into an essential component of its APIs, 
available to *any* app not just the select security policy/mechanism chosen by 
the distributor ...

If implemented properly this could replace LSM in the long run.

As a prctl() hack bound to seccomp (which, by all means, is a natural extension 
to the current seccomp ABI, so perfectly fine if we only want that scope), that 
is much less likely to happen.

And if we merge the seccomp interface prematurely then interest towards a more 
flexible approach will disappear, so either we do it properly now or it will 
take some time for someone to come around and do it ...

Also note that i do not consider the perf events ABI itself cast into stone - 
and we could very well add a new system call for this, independent of perf 
events. I just think that the seccomp scope itself is exciting but looks 
limited to what the real potential of this could be.

> >    This too, if you approach the problem via the events code, will be a natural 
> >    end result, while if you approach it from the seccomp prctl angle it will be
> >    a limited hack only.
> 
> I'd say it's a well-defined and readily understandable feature.

Note, it was me who suggested this very event-filter-engine design a year ago, 
when the first submission still used a crude bitmap of allowed seccomp 
syscalls:

  http://lwn.net/Articles/332974/

Funnily enough, back then you wrote this:

  " I'm concerned that we're seeing yet another security scheme being designed on 
    the fly, without a well-formed threat model, and without taking into account 
    lessons learned from the seemingly endless parade of similar, failed schemes. "

so when and how did your opinion of this scheme turn from it being an "endless 
parade of failed schemes" to it being a "well-defined and readily 
understandable feature"? :-)

The idea itself has not changed since last year, what happened is that the 
filter engine got a couple of new features and Will has separated it out and 
has implemented a working prototype for sandboxing.

What i do here is to suggest *further* steps down the same road, now that we 
see that this scheme can indeed be used to implement sandboxing ... I think 
it's a valid line of inquiry.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-12 13:01                           ` Ingo Molnar
  (?)
@ 2011-05-12 16:26                             ` Will Drewry
  -1 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-05-12 16:26 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: James Morris, linux-kernel, Steven Rostedt, Frederic Weisbecker,
	Eric Paris, kees.cook, agl, Peter Zijlstra, Serge E. Hallyn,
	Ingo Molnar, Andrew Morton, Tejun Heo, Michal Marek,
	Oleg Nesterov, Roland McGrath, Jiri Slaby, David Howells,
	Russell King, Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, Thomas Gleixner, H. Peter Anvin,
	x86, Peter Zijlstra, linux-arm-kernel, microblaze-uclinux,
	linux-mips, linuxppc-dev, linux-s390, linux-sh, sparclinux,
	Linus Torvalds

[Thanks to everyone for the continued feedback and insights - I appreciate it!]

On Thu, May 12, 2011 at 8:01 AM, Ingo Molnar <mingo@elte.hu> wrote:
>
> * James Morris <jmorris@namei.org> wrote:
>
>> On Thu, 12 May 2011, Ingo Molnar wrote:
>>
>> > 2) Why should this concept not be made available wider, to allow the
>> >    restriction of not just system calls but other security relevant components
>> >    of the kernel as well?
>>
>> Because the aim of this is to reduce the attack surface of the syscall
>> interface.
>
> What i suggest achieves the same, my argument is that we could aim it to be
> even more flexible and even more useful.
>
>> LSM is the correct level of abstraction for general security mediation,
>> because it allows you to take into account all relevant security information
>> in a race-free context.
>
> I don't care about LSM though, i find it poorly designed.
>
> The approach implemented here, the ability for *unprivileged code* to define
> (the seeds of ...) flexible security policies, in a proper Linuxish way, which
> is inherited along the task parent/child hierarchy and which allows nesting
> etc. is a *lot* more flexible.
>
> What Will implemented here is pretty huge in my opinion: it turns security from
> a root-only kind of weird hack into an essential component of its APIs,
> available to *any* app not just the select security policy/mechanism chosen by
> the distributor ...
>
> If implemented properly this could replace LSM in the long run.
>
> As a prctl() hack bound to seccomp (which, by all means, is a natural extension
> to the current seccomp ABI, so perfectly fine if we only want that scope), that
> is much less likely to happen.
>
> And if we merge the seccomp interface prematurely then interest towards a more
> flexible approach will disappear, so either we do it properly now or it will
> take some time for someone to come around and do it ...
>
> Also note that i do not consider the perf events ABI itself cast into stone -
> and we could very well add a new system call for this, independent of perf
> events. I just think that the seccomp scope itself is exciting but looks
> limited to what the real potential of this could be.

I agree with you on many of these points!  However, I don't think that
the views around LSMs, perf/ftrace infrastructure, or the current
seccomp filtering implementation are necessarily in conflict.  Here is
my understanding of how the different worlds fit together and where I
see this patchset living, along with where I could see future work
going.  Perhaps I'm being a trifle naive, but here goes anyway:

1. LSMs provide a global mechanism for hooking "security relevant"
events at a point where all the incoming user-sourced data has been
preprocessed and moved into userspace.  The hooks are called every
time one of those boundaries are crossed.
2. Perf and the ftrace infrastructure provide global function tracing
and system call hooks with direct access to the caller's registers
(and memory).
3. seccomp (as it exists today) provides a global system call entry
hook point with a binary per-process decision about whether to provide
"secure computing" behavior.

When I boil that down to abstractions, I see:
A. Globally scoped: LSMs, ftrace/perf
B. Locally/process scoped: seccomp

The result of that logical equivalence is that I see room for:
I. A per-process, locally scoped security event hooking interface (the
proposed changes in this patchset)
II. A globally scoped security event hooking interface _prior_ to
argument processing
III. A globally scoped security event hooking interface _post_
argument processing

II and III could be reduced further if I assume that ftrace/perf
provides (II) and a simple intermediary layer (hook entry/exit)
provides the argument processing steps that then call out a global
security policy system.

The driving motivation for this patchset is kernel attack surface
reduction, but that need arises because we lack a process-scoped
mechanism for making security decisions -- everything is global:
creds/DAC, containers, LSM, etc.   Adding ftrace filtering to agl's
original bitmask-seccomp proposal opens up the process-local security
world.  At present, it can limit the attack surface with simple binary
filters or apply limited security policy through the use of filter
strings.

Based on your mails, I see two main deficiencies in my proposed patchset:
a. Deep argument analysis: Any arguments that live in user memory
needs to be copied into the kernel, then checked, and substituted for
the actual system call, then have the original pointers restored (when
applicable) on system call exit.  There is a large overhead here and
the LSM hooks provide much of this support on a global level.
b. Lack of support for non-system call events.

For (a), if the long term view of ftrace/perf & LSMs is that LSM-like
functionality will live on top of the ftrace/perf infrastructure, then
adding support for the intermediary layer to analyze arguments will
come with time.  It's also likely that for process-local stuff (e.g.,)
a new predicate could be added to callback to a userspace supervisor,
or even a more generic ability for modules to register new
predicates/functions in the filtering engine itself -- like "fd == 1
&& check_path(path) == '/etc/safe.conf'" or "check_xattr(path,
expected)".  Of course, I'm just making stuff up right now :)

For (b), we could just add a field we don't use right now in the prctl
interface:
  prctl(PR_SET_SECCOMP_FILTER, int event_type, int
event_or_syscall_nr, char *filter)
[or something similar]

Then we can add process-local/scoped supported event types somewhere
down the road without an ABI change.

Tying it all together, it'd look like:
* Now -- add process-scoped security support: seccomp filter with
support for "future" event types
* Soon -- expand ftrace syscall hooks to hook more system calls
* Later -- expand ftrace filter language to support either deep
argument analysis and/or custom registered predicates
* Later, later -- implement a LSM-like hooking layer for "interesting"
event types on top of the ftrace hooks

That would yield process-scoped security controls and global security
controls and the ability to continue to create new and interesting
security modules.

All that said, I'm in over my head.  I've focused primarily on the
process-scoped security.  I think James, some of the LSM authors, and
out-of-tree security system maintainers would be good to help guide
direction toward the security view you have in mind to ensure the
flexibility desired exists.  And that's even assuming this sketch is
even vaguely interesting...

[snip]

> What i do here is to suggest *further* steps down the same road, now that we
> see that this scheme can indeed be used to implement sandboxing ... I think
> it's a valid line of inquiry.

I certainly agree that it's a valid line of inquiry, but I worry about
the massive scope expansion.  I know it hurts my head, but I'm hoping
the brain-dump above frames up how I think about this patch and your
line of inquiry.  ftrace hooking and the perf code certainly look a
lot like LSMs if I squint hard :)  But there is a substantial amount
of work to merge the worlds, and (thankfully) I don't think that
future directly impacts process-scoped security mechanisms even if
they can interact nicely.

thanks!
will

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-12 16:26                             ` Will Drewry
  0 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-05-12 16:26 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: linux-mips, linux-sh, Peter Zijlstra, Frederic Weisbecker,
	Heiko Carstens, Oleg Nesterov, David Howells, Paul Mackerras,
	Eric Paris, H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390,
	Russell King, x86, James Morris, Linus Torvalds, Ingo Molnar,
	linux-arm-kernel, kees.cook, Serge E. Hallyn, Peter Zijlstra,
	microblaze-uclinux, Steven Rostedt, Martin Schwidefsky,
	Thomas Gleixner, Roland McGrath, Michal Marek, Michal Simek,
	linuxppc-dev, linux-kernel, Ralf Baechle, Paul Mundt, Tejun Heo,
	linux390, Andrew Morton, agl, David S. Miller

[Thanks to everyone for the continued feedback and insights - I appreciate it!]

On Thu, May 12, 2011 at 8:01 AM, Ingo Molnar <mingo@elte.hu> wrote:
>
> * James Morris <jmorris@namei.org> wrote:
>
>> On Thu, 12 May 2011, Ingo Molnar wrote:
>>
>> > 2) Why should this concept not be made available wider, to allow the
>> >    restriction of not just system calls but other security relevant components
>> >    of the kernel as well?
>>
>> Because the aim of this is to reduce the attack surface of the syscall
>> interface.
>
> What i suggest achieves the same, my argument is that we could aim it to be
> even more flexible and even more useful.
>
>> LSM is the correct level of abstraction for general security mediation,
>> because it allows you to take into account all relevant security information
>> in a race-free context.
>
> I don't care about LSM though, i find it poorly designed.
>
> The approach implemented here, the ability for *unprivileged code* to define
> (the seeds of ...) flexible security policies, in a proper Linuxish way, which
> is inherited along the task parent/child hierarchy and which allows nesting
> etc. is a *lot* more flexible.
>
> What Will implemented here is pretty huge in my opinion: it turns security from
> a root-only kind of weird hack into an essential component of its APIs,
> available to *any* app not just the select security policy/mechanism chosen by
> the distributor ...
>
> If implemented properly this could replace LSM in the long run.
>
> As a prctl() hack bound to seccomp (which, by all means, is a natural extension
> to the current seccomp ABI, so perfectly fine if we only want that scope), that
> is much less likely to happen.
>
> And if we merge the seccomp interface prematurely then interest towards a more
> flexible approach will disappear, so either we do it properly now or it will
> take some time for someone to come around and do it ...
>
> Also note that i do not consider the perf events ABI itself cast into stone -
> and we could very well add a new system call for this, independent of perf
> events. I just think that the seccomp scope itself is exciting but looks
> limited to what the real potential of this could be.

I agree with you on many of these points!  However, I don't think that
the views around LSMs, perf/ftrace infrastructure, or the current
seccomp filtering implementation are necessarily in conflict.  Here is
my understanding of how the different worlds fit together and where I
see this patchset living, along with where I could see future work
going.  Perhaps I'm being a trifle naive, but here goes anyway:

1. LSMs provide a global mechanism for hooking "security relevant"
events at a point where all the incoming user-sourced data has been
preprocessed and moved into userspace.  The hooks are called every
time one of those boundaries are crossed.
2. Perf and the ftrace infrastructure provide global function tracing
and system call hooks with direct access to the caller's registers
(and memory).
3. seccomp (as it exists today) provides a global system call entry
hook point with a binary per-process decision about whether to provide
"secure computing" behavior.

When I boil that down to abstractions, I see:
A. Globally scoped: LSMs, ftrace/perf
B. Locally/process scoped: seccomp

The result of that logical equivalence is that I see room for:
I. A per-process, locally scoped security event hooking interface (the
proposed changes in this patchset)
II. A globally scoped security event hooking interface _prior_ to
argument processing
III. A globally scoped security event hooking interface _post_
argument processing

II and III could be reduced further if I assume that ftrace/perf
provides (II) and a simple intermediary layer (hook entry/exit)
provides the argument processing steps that then call out a global
security policy system.

The driving motivation for this patchset is kernel attack surface
reduction, but that need arises because we lack a process-scoped
mechanism for making security decisions -- everything is global:
creds/DAC, containers, LSM, etc.   Adding ftrace filtering to agl's
original bitmask-seccomp proposal opens up the process-local security
world.  At present, it can limit the attack surface with simple binary
filters or apply limited security policy through the use of filter
strings.

Based on your mails, I see two main deficiencies in my proposed patchset:
a. Deep argument analysis: Any arguments that live in user memory
needs to be copied into the kernel, then checked, and substituted for
the actual system call, then have the original pointers restored (when
applicable) on system call exit.  There is a large overhead here and
the LSM hooks provide much of this support on a global level.
b. Lack of support for non-system call events.

For (a), if the long term view of ftrace/perf & LSMs is that LSM-like
functionality will live on top of the ftrace/perf infrastructure, then
adding support for the intermediary layer to analyze arguments will
come with time.  It's also likely that for process-local stuff (e.g.,)
a new predicate could be added to callback to a userspace supervisor,
or even a more generic ability for modules to register new
predicates/functions in the filtering engine itself -- like "fd == 1
&& check_path(path) == '/etc/safe.conf'" or "check_xattr(path,
expected)".  Of course, I'm just making stuff up right now :)

For (b), we could just add a field we don't use right now in the prctl
interface:
  prctl(PR_SET_SECCOMP_FILTER, int event_type, int
event_or_syscall_nr, char *filter)
[or something similar]

Then we can add process-local/scoped supported event types somewhere
down the road without an ABI change.

Tying it all together, it'd look like:
* Now -- add process-scoped security support: seccomp filter with
support for "future" event types
* Soon -- expand ftrace syscall hooks to hook more system calls
* Later -- expand ftrace filter language to support either deep
argument analysis and/or custom registered predicates
* Later, later -- implement a LSM-like hooking layer for "interesting"
event types on top of the ftrace hooks

That would yield process-scoped security controls and global security
controls and the ability to continue to create new and interesting
security modules.

All that said, I'm in over my head.  I've focused primarily on the
process-scoped security.  I think James, some of the LSM authors, and
out-of-tree security system maintainers would be good to help guide
direction toward the security view you have in mind to ensure the
flexibility desired exists.  And that's even assuming this sketch is
even vaguely interesting...

[snip]

> What i do here is to suggest *further* steps down the same road, now that we
> see that this scheme can indeed be used to implement sandboxing ... I think
> it's a valid line of inquiry.

I certainly agree that it's a valid line of inquiry, but I worry about
the massive scope expansion.  I know it hurts my head, but I'm hoping
the brain-dump above frames up how I think about this patch and your
line of inquiry.  ftrace hooking and the perf code certainly look a
lot like LSMs if I squint hard :)  But there is a substantial amount
of work to merge the worlds, and (thankfully) I don't think that
future directly impacts process-scoped security mechanisms even if
they can interact nicely.

thanks!
will

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-12 16:26                             ` Will Drewry
  0 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-05-12 16:26 UTC (permalink / raw)
  To: linux-arm-kernel

[Thanks to everyone for the continued feedback and insights - I appreciate it!]

On Thu, May 12, 2011 at 8:01 AM, Ingo Molnar <mingo@elte.hu> wrote:
>
> * James Morris <jmorris@namei.org> wrote:
>
>> On Thu, 12 May 2011, Ingo Molnar wrote:
>>
>> > 2) Why should this concept not be made available wider, to allow the
>> >    restriction of not just system calls but other security relevant components
>> >    of the kernel as well?
>>
>> Because the aim of this is to reduce the attack surface of the syscall
>> interface.
>
> What i suggest achieves the same, my argument is that we could aim it to be
> even more flexible and even more useful.
>
>> LSM is the correct level of abstraction for general security mediation,
>> because it allows you to take into account all relevant security information
>> in a race-free context.
>
> I don't care about LSM though, i find it poorly designed.
>
> The approach implemented here, the ability for *unprivileged code* to define
> (the seeds of ...) flexible security policies, in a proper Linuxish way, which
> is inherited along the task parent/child hierarchy and which allows nesting
> etc. is a *lot* more flexible.
>
> What Will implemented here is pretty huge in my opinion: it turns security from
> a root-only kind of weird hack into an essential component of its APIs,
> available to *any* app not just the select security policy/mechanism chosen by
> the distributor ...
>
> If implemented properly this could replace LSM in the long run.
>
> As a prctl() hack bound to seccomp (which, by all means, is a natural extension
> to the current seccomp ABI, so perfectly fine if we only want that scope), that
> is much less likely to happen.
>
> And if we merge the seccomp interface prematurely then interest towards a more
> flexible approach will disappear, so either we do it properly now or it will
> take some time for someone to come around and do it ...
>
> Also note that i do not consider the perf events ABI itself cast into stone -
> and we could very well add a new system call for this, independent of perf
> events. I just think that the seccomp scope itself is exciting but looks
> limited to what the real potential of this could be.

I agree with you on many of these points!  However, I don't think that
the views around LSMs, perf/ftrace infrastructure, or the current
seccomp filtering implementation are necessarily in conflict.  Here is
my understanding of how the different worlds fit together and where I
see this patchset living, along with where I could see future work
going.  Perhaps I'm being a trifle naive, but here goes anyway:

1. LSMs provide a global mechanism for hooking "security relevant"
events at a point where all the incoming user-sourced data has been
preprocessed and moved into userspace.  The hooks are called every
time one of those boundaries are crossed.
2. Perf and the ftrace infrastructure provide global function tracing
and system call hooks with direct access to the caller's registers
(and memory).
3. seccomp (as it exists today) provides a global system call entry
hook point with a binary per-process decision about whether to provide
"secure computing" behavior.

When I boil that down to abstractions, I see:
A. Globally scoped: LSMs, ftrace/perf
B. Locally/process scoped: seccomp

The result of that logical equivalence is that I see room for:
I. A per-process, locally scoped security event hooking interface (the
proposed changes in this patchset)
II. A globally scoped security event hooking interface _prior_ to
argument processing
III. A globally scoped security event hooking interface _post_
argument processing

II and III could be reduced further if I assume that ftrace/perf
provides (II) and a simple intermediary layer (hook entry/exit)
provides the argument processing steps that then call out a global
security policy system.

The driving motivation for this patchset is kernel attack surface
reduction, but that need arises because we lack a process-scoped
mechanism for making security decisions -- everything is global:
creds/DAC, containers, LSM, etc.   Adding ftrace filtering to agl's
original bitmask-seccomp proposal opens up the process-local security
world.  At present, it can limit the attack surface with simple binary
filters or apply limited security policy through the use of filter
strings.

Based on your mails, I see two main deficiencies in my proposed patchset:
a. Deep argument analysis: Any arguments that live in user memory
needs to be copied into the kernel, then checked, and substituted for
the actual system call, then have the original pointers restored (when
applicable) on system call exit.  There is a large overhead here and
the LSM hooks provide much of this support on a global level.
b. Lack of support for non-system call events.

For (a), if the long term view of ftrace/perf & LSMs is that LSM-like
functionality will live on top of the ftrace/perf infrastructure, then
adding support for the intermediary layer to analyze arguments will
come with time.  It's also likely that for process-local stuff (e.g.,)
a new predicate could be added to callback to a userspace supervisor,
or even a more generic ability for modules to register new
predicates/functions in the filtering engine itself -- like "fd == 1
&& check_path(path) == '/etc/safe.conf'" or "check_xattr(path,
expected)".  Of course, I'm just making stuff up right now :)

For (b), we could just add a field we don't use right now in the prctl
interface:
  prctl(PR_SET_SECCOMP_FILTER, int event_type, int
event_or_syscall_nr, char *filter)
[or something similar]

Then we can add process-local/scoped supported event types somewhere
down the road without an ABI change.

Tying it all together, it'd look like:
* Now -- add process-scoped security support: seccomp filter with
support for "future" event types
* Soon -- expand ftrace syscall hooks to hook more system calls
* Later -- expand ftrace filter language to support either deep
argument analysis and/or custom registered predicates
* Later, later -- implement a LSM-like hooking layer for "interesting"
event types on top of the ftrace hooks

That would yield process-scoped security controls and global security
controls and the ability to continue to create new and interesting
security modules.

All that said, I'm in over my head.  I've focused primarily on the
process-scoped security.  I think James, some of the LSM authors, and
out-of-tree security system maintainers would be good to help guide
direction toward the security view you have in mind to ensure the
flexibility desired exists.  And that's even assuming this sketch is
even vaguely interesting...

[snip]

> What i do here is to suggest *further* steps down the same road, now that we
> see that this scheme can indeed be used to implement sandboxing ... I think
> it's a valid line of inquiry.

I certainly agree that it's a valid line of inquiry, but I worry about
the massive scope expansion.  I know it hurts my head, but I'm hoping
the brain-dump above frames up how I think about this patch and your
line of inquiry.  ftrace hooking and the perf code certainly look a
lot like LSMs if I squint hard :)  But there is a substantial amount
of work to merge the worlds, and (thankfully) I don't think that
future directly impacts process-scoped security mechanisms even if
they can interact nicely.

thanks!
will

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-12 13:01                           ` Ingo Molnar
  (?)
@ 2011-05-13  0:18                             ` James Morris
  -1 siblings, 0 replies; 406+ messages in thread
From: James Morris @ 2011-05-13  0:18 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Will Drewry, linux-kernel, Steven Rostedt, Frederic Weisbecker,
	Eric Paris, kees.cook, agl, Peter Zijlstra, Serge E. Hallyn,
	Ingo Molnar, Andrew Morton, Tejun Heo, Michal Marek,
	Oleg Nesterov, Roland McGrath, Jiri Slaby, David Howells,
	Russell King, Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, Thomas Gleixner, H. Peter Anvin,
	x86, Peter Zijlstra, linux-arm-kernel, microblaze-uclinux,
	linux-mips, linuxppc-dev, linux-s390, linux-sh, sparclinux,
	Linus Torvalds

On Thu, 12 May 2011, Ingo Molnar wrote:
> Funnily enough, back then you wrote this:
> 
>   " I'm concerned that we're seeing yet another security scheme being designed on 
>     the fly, without a well-formed threat model, and without taking into account 
>     lessons learned from the seemingly endless parade of similar, failed schemes. "
> 
> so when and how did your opinion of this scheme turn from it being an "endless 
> parade of failed schemes" to it being a "well-defined and readily 
> understandable feature"? :-)

When it was defined in a way which limited its purpose to reducing the 
attack surface of the syscall interface.


- James
-- 
James Morris
<jmorris@namei.org>

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-13  0:18                             ` James Morris
  0 siblings, 0 replies; 406+ messages in thread
From: James Morris @ 2011-05-13  0:18 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: linux-mips, linux-sh, Peter Zijlstra, Frederic Weisbecker,
	Heiko Carstens, Oleg Nesterov, David Howells, Paul Mackerras,
	Eric Paris, H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390,
	Russell King, x86, Linus Torvalds, Ingo Molnar, linux-arm-kernel,
	kees.cook, Serge E. Hallyn, Peter Zijlstra, microblaze-uclinux,
	Steven Rostedt, Martin Schwidefsky, Thomas Gleixner,
	Roland McGrath, Michal Marek, Michal Simek, Will Drewry,
	linuxppc-dev, linux-kernel, Ralf Baechle, Paul Mundt, Tejun Heo,
	linux390, Andrew Morton, agl, David S. Miller

On Thu, 12 May 2011, Ingo Molnar wrote:
> Funnily enough, back then you wrote this:
> 
>   " I'm concerned that we're seeing yet another security scheme being designed on 
>     the fly, without a well-formed threat model, and without taking into account 
>     lessons learned from the seemingly endless parade of similar, failed schemes. "
> 
> so when and how did your opinion of this scheme turn from it being an "endless 
> parade of failed schemes" to it being a "well-defined and readily 
> understandable feature"? :-)

When it was defined in a way which limited its purpose to reducing the 
attack surface of the syscall interface.


- James
-- 
James Morris
<jmorris@namei.org>

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-13  0:18                             ` James Morris
  0 siblings, 0 replies; 406+ messages in thread
From: James Morris @ 2011-05-13  0:18 UTC (permalink / raw)
  To: linux-arm-kernel

On Thu, 12 May 2011, Ingo Molnar wrote:
> Funnily enough, back then you wrote this:
> 
>   " I'm concerned that we're seeing yet another security scheme being designed on 
>     the fly, without a well-formed threat model, and without taking into account 
>     lessons learned from the seemingly endless parade of similar, failed schemes. "
> 
> so when and how did your opinion of this scheme turn from it being an "endless 
> parade of failed schemes" to it being a "well-defined and readily 
> understandable feature"? :-)

When it was defined in a way which limited its purpose to reducing the 
attack surface of the syscall interface.


- James
-- 
James Morris
<jmorris@namei.org>

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-13  0:18                             ` James Morris
  (?)
@ 2011-05-13 12:10                               ` Ingo Molnar
  -1 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-13 12:10 UTC (permalink / raw)
  To: James Morris
  Cc: Will Drewry, linux-kernel, Steven Rostedt, Frederic Weisbecker,
	Eric Paris, kees.cook, agl, Peter Zijlstra, Serge E. Hallyn,
	Ingo Molnar, Andrew Morton, Tejun Heo, Michal Marek,
	Oleg Nesterov, Roland McGrath, Jiri Slaby, David Howells,
	Russell King, Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, Thomas Gleixner, H. Peter Anvin,
	x86, Peter Zijlstra, linux-arm-kernel, microblaze-uclinux,
	linux-mips, linuxppc-dev, linux-s390, linux-sh, sparclinux,
	Linus Torvalds


* James Morris <jmorris@namei.org> wrote:

> On Thu, 12 May 2011, Ingo Molnar wrote:
> > Funnily enough, back then you wrote this:
> > 
> >   " I'm concerned that we're seeing yet another security scheme being designed on 
> >     the fly, without a well-formed threat model, and without taking into account 
> >     lessons learned from the seemingly endless parade of similar, failed schemes. "
> > 
> > so when and how did your opinion of this scheme turn from it being an 
> > "endless parade of failed schemes" to it being a "well-defined and readily 
> > understandable feature"? :-)
> 
> When it was defined in a way which limited its purpose to reducing the attack 
> surface of the syscall interface.

Let me outline a simple example of a new filter expression based security 
feature that could be implemented outside the narrow system call boundary you 
find acceptable, and please tell what is bad about it.

Say i'm a user-space sandbox developer who wants to enforce that sandboxed code 
should only be allowed to open files in /home/sandbox/, /lib/ and /usr/lib/.

It is a simple and sensible security feature, agreed? It allows most code to 
run well and link to countless libraries - but no access to other files is 
allowed.

I would also like my sandbox app to be able to install this policy without 
having to be root. I do not want the sandbox app to have permission to create 
labels on /lib and /usr/lib and what not.

Firstly, using the filter code i deny the various link creation syscalls so 
that sandboxed code cannot escape for example by creating a symlink to outside 
the permitted VFS namespace. (Note: we opt-in to syscalls, that way new 
syscalls added by new kernels are denied by default. The current symlink 
creation syscalls are not opted in to.)

But the next step, actually checking filenames, poses a big hurdle: i cannot 
implement the filename checking at the sys_open() syscall level in a secure 
way: because the pathname is passed to sys_open() by pointer, and if i check it 
at the generic sys_open() syscall level, another thread in the sandbox might 
modify the underlying filename *after* i've checked it.

But if i had a VFS event at the fs/namei.c::getname() level, i would have 
access to a central point where the VFS string becomes stable to the kernel and 
can be checked (and denied if necessary).

A sidenote, and not surprisingly, the audit subsystem already has an event 
callback there:

        audit_getname(result);

Unfortunately this audit callback cannot be used for my purposes, because the 
event is single-purpose for auditd and because it allows no feedback (no 
deny/accept discretion for the security policy).

But if i had this simple event there:

	err = event_vfs_getname(result);

I could implement this new filename based sandboxing policy, using a filter 
like this installed on the vfs::getname event and inherited by all sandboxed 
tasks (which cannot uninstall the filter, obviously):

  "
	if (strstr(name, ".."))
		return -EACCES;

	if (strncmp(name, "/home/sandbox/", 14) &&
	    strncmp(name, "/lib/", 5) &&
	    strncmp(name, "/usr/lib/", 9))
		return -EACCES;

  "

  #
  # Note1: Obviously the filter engine would be extended to allow such simple string
  #        match functions.
  #
  # Note2: ".." is disallowed so that sandboxed code cannot escape the restrictions
  #         using "/..".
  #

This kind of flexible and dynamic sandboxing would allow a wide range of file 
ops within the sandbox, while still isolating it from files not included in the 
specified VFS namespace.

( Note that there are tons of other examples as well, for useful security features
  that are best done using events outside the syscall boundary. )

The security event filters code tied to seccomp and syscalls at the moment is 
useful, but limited in its future potential.

So i argue that it should go slightly further and should become:

 - unprivileged:  application-definable, allowing the embedding of security 
                  policy in *apps* as well, not just the system

 - flexible:      can be added/removed runtime unprivileged, and cheaply so

 - transparent:   does not impact executing code that meets the policy

 - nestable:      it is inherited by child tasks and is fundamentally stackable,
                  multiple policies will have the combined effect and they
                  are transparent to each other. So if a child task within a
                  sandbox adds *more* checks then those add to the already
                  existing set of checks. We only narrow permissions, never
                  extend them.

 - generic:       allowing observation and (safe) control of security relevant
                  parameters not just at the system call boundary but at other
                  relevant places of kernel execution as well: which 
                  points/callbacks could also be used for other types of event 
                  extraction such as perf. It could even be shared with audit ...

I argue that this is the LSM and audit subsystems designed right: in the long 
run it could allow everything that LSM does at the moment - and so much more 
...

And you argue that allowing this would be bad, if it was extended like that 
then you'd consider it a failed scheme? Why?

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-13 12:10                               ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-13 12:10 UTC (permalink / raw)
  To: James Morris
  Cc: linux-mips, linux-sh, Peter Zijlstra, Frederic Weisbecker,
	Heiko Carstens, Oleg Nesterov, David Howells, Paul Mackerras,
	Eric Paris, H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390,
	Russell King, x86, Linus Torvalds, Ingo Molnar, linux-arm-kernel,
	kees.cook, Serge E. Hallyn, Peter Zijlstra, microblaze-uclinux,
	Steven Rostedt, Martin Schwidefsky, Thomas Gleixner,
	Roland McGrath, Michal Marek, Michal Simek, Will Drewry,
	linuxppc-dev, linux-kernel, Ralf Baechle, Paul Mundt, Tejun Heo,
	linux390, Andrew Morton, agl, David S. Miller


* James Morris <jmorris@namei.org> wrote:

> On Thu, 12 May 2011, Ingo Molnar wrote:
> > Funnily enough, back then you wrote this:
> > 
> >   " I'm concerned that we're seeing yet another security scheme being designed on 
> >     the fly, without a well-formed threat model, and without taking into account 
> >     lessons learned from the seemingly endless parade of similar, failed schemes. "
> > 
> > so when and how did your opinion of this scheme turn from it being an 
> > "endless parade of failed schemes" to it being a "well-defined and readily 
> > understandable feature"? :-)
> 
> When it was defined in a way which limited its purpose to reducing the attack 
> surface of the syscall interface.

Let me outline a simple example of a new filter expression based security 
feature that could be implemented outside the narrow system call boundary you 
find acceptable, and please tell what is bad about it.

Say i'm a user-space sandbox developer who wants to enforce that sandboxed code 
should only be allowed to open files in /home/sandbox/, /lib/ and /usr/lib/.

It is a simple and sensible security feature, agreed? It allows most code to 
run well and link to countless libraries - but no access to other files is 
allowed.

I would also like my sandbox app to be able to install this policy without 
having to be root. I do not want the sandbox app to have permission to create 
labels on /lib and /usr/lib and what not.

Firstly, using the filter code i deny the various link creation syscalls so 
that sandboxed code cannot escape for example by creating a symlink to outside 
the permitted VFS namespace. (Note: we opt-in to syscalls, that way new 
syscalls added by new kernels are denied by default. The current symlink 
creation syscalls are not opted in to.)

But the next step, actually checking filenames, poses a big hurdle: i cannot 
implement the filename checking at the sys_open() syscall level in a secure 
way: because the pathname is passed to sys_open() by pointer, and if i check it 
at the generic sys_open() syscall level, another thread in the sandbox might 
modify the underlying filename *after* i've checked it.

But if i had a VFS event at the fs/namei.c::getname() level, i would have 
access to a central point where the VFS string becomes stable to the kernel and 
can be checked (and denied if necessary).

A sidenote, and not surprisingly, the audit subsystem already has an event 
callback there:

        audit_getname(result);

Unfortunately this audit callback cannot be used for my purposes, because the 
event is single-purpose for auditd and because it allows no feedback (no 
deny/accept discretion for the security policy).

But if i had this simple event there:

	err = event_vfs_getname(result);

I could implement this new filename based sandboxing policy, using a filter 
like this installed on the vfs::getname event and inherited by all sandboxed 
tasks (which cannot uninstall the filter, obviously):

  "
	if (strstr(name, ".."))
		return -EACCES;

	if (strncmp(name, "/home/sandbox/", 14) &&
	    strncmp(name, "/lib/", 5) &&
	    strncmp(name, "/usr/lib/", 9))
		return -EACCES;

  "

  #
  # Note1: Obviously the filter engine would be extended to allow such simple string
  #        match functions.
  #
  # Note2: ".." is disallowed so that sandboxed code cannot escape the restrictions
  #         using "/..".
  #

This kind of flexible and dynamic sandboxing would allow a wide range of file 
ops within the sandbox, while still isolating it from files not included in the 
specified VFS namespace.

( Note that there are tons of other examples as well, for useful security features
  that are best done using events outside the syscall boundary. )

The security event filters code tied to seccomp and syscalls at the moment is 
useful, but limited in its future potential.

So i argue that it should go slightly further and should become:

 - unprivileged:  application-definable, allowing the embedding of security 
                  policy in *apps* as well, not just the system

 - flexible:      can be added/removed runtime unprivileged, and cheaply so

 - transparent:   does not impact executing code that meets the policy

 - nestable:      it is inherited by child tasks and is fundamentally stackable,
                  multiple policies will have the combined effect and they
                  are transparent to each other. So if a child task within a
                  sandbox adds *more* checks then those add to the already
                  existing set of checks. We only narrow permissions, never
                  extend them.

 - generic:       allowing observation and (safe) control of security relevant
                  parameters not just at the system call boundary but at other
                  relevant places of kernel execution as well: which 
                  points/callbacks could also be used for other types of event 
                  extraction such as perf. It could even be shared with audit ...

I argue that this is the LSM and audit subsystems designed right: in the long 
run it could allow everything that LSM does at the moment - and so much more 
...

And you argue that allowing this would be bad, if it was extended like that 
then you'd consider it a failed scheme? Why?

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-13 12:10                               ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-13 12:10 UTC (permalink / raw)
  To: linux-arm-kernel


* James Morris <jmorris@namei.org> wrote:

> On Thu, 12 May 2011, Ingo Molnar wrote:
> > Funnily enough, back then you wrote this:
> > 
> >   " I'm concerned that we're seeing yet another security scheme being designed on 
> >     the fly, without a well-formed threat model, and without taking into account 
> >     lessons learned from the seemingly endless parade of similar, failed schemes. "
> > 
> > so when and how did your opinion of this scheme turn from it being an 
> > "endless parade of failed schemes" to it being a "well-defined and readily 
> > understandable feature"? :-)
> 
> When it was defined in a way which limited its purpose to reducing the attack 
> surface of the syscall interface.

Let me outline a simple example of a new filter expression based security 
feature that could be implemented outside the narrow system call boundary you 
find acceptable, and please tell what is bad about it.

Say i'm a user-space sandbox developer who wants to enforce that sandboxed code 
should only be allowed to open files in /home/sandbox/, /lib/ and /usr/lib/.

It is a simple and sensible security feature, agreed? It allows most code to 
run well and link to countless libraries - but no access to other files is 
allowed.

I would also like my sandbox app to be able to install this policy without 
having to be root. I do not want the sandbox app to have permission to create 
labels on /lib and /usr/lib and what not.

Firstly, using the filter code i deny the various link creation syscalls so 
that sandboxed code cannot escape for example by creating a symlink to outside 
the permitted VFS namespace. (Note: we opt-in to syscalls, that way new 
syscalls added by new kernels are denied by default. The current symlink 
creation syscalls are not opted in to.)

But the next step, actually checking filenames, poses a big hurdle: i cannot 
implement the filename checking at the sys_open() syscall level in a secure 
way: because the pathname is passed to sys_open() by pointer, and if i check it 
at the generic sys_open() syscall level, another thread in the sandbox might 
modify the underlying filename *after* i've checked it.

But if i had a VFS event at the fs/namei.c::getname() level, i would have 
access to a central point where the VFS string becomes stable to the kernel and 
can be checked (and denied if necessary).

A sidenote, and not surprisingly, the audit subsystem already has an event 
callback there:

        audit_getname(result);

Unfortunately this audit callback cannot be used for my purposes, because the 
event is single-purpose for auditd and because it allows no feedback (no 
deny/accept discretion for the security policy).

But if i had this simple event there:

	err = event_vfs_getname(result);

I could implement this new filename based sandboxing policy, using a filter 
like this installed on the vfs::getname event and inherited by all sandboxed 
tasks (which cannot uninstall the filter, obviously):

  "
	if (strstr(name, ".."))
		return -EACCES;

	if (strncmp(name, "/home/sandbox/", 14) &&
	    strncmp(name, "/lib/", 5) &&
	    strncmp(name, "/usr/lib/", 9))
		return -EACCES;

  "

  #
  # Note1: Obviously the filter engine would be extended to allow such simple string
  #        match functions.
  #
  # Note2: ".." is disallowed so that sandboxed code cannot escape the restrictions
  #         using "/..".
  #

This kind of flexible and dynamic sandboxing would allow a wide range of file 
ops within the sandbox, while still isolating it from files not included in the 
specified VFS namespace.

( Note that there are tons of other examples as well, for useful security features
  that are best done using events outside the syscall boundary. )

The security event filters code tied to seccomp and syscalls at the moment is 
useful, but limited in its future potential.

So i argue that it should go slightly further and should become:

 - unprivileged:  application-definable, allowing the embedding of security 
                  policy in *apps* as well, not just the system

 - flexible:      can be added/removed runtime unprivileged, and cheaply so

 - transparent:   does not impact executing code that meets the policy

 - nestable:      it is inherited by child tasks and is fundamentally stackable,
                  multiple policies will have the combined effect and they
                  are transparent to each other. So if a child task within a
                  sandbox adds *more* checks then those add to the already
                  existing set of checks. We only narrow permissions, never
                  extend them.

 - generic:       allowing observation and (safe) control of security relevant
                  parameters not just at the system call boundary but at other
                  relevant places of kernel execution as well: which 
                  points/callbacks could also be used for other types of event 
                  extraction such as perf. It could even be shared with audit ...

I argue that this is the LSM and audit subsystems designed right: in the long 
run it could allow everything that LSM does at the moment - and so much more 
...

And you argue that allowing this would be bad, if it was extended like that 
then you'd consider it a failed scheme? Why?

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-13 12:10                               ` Ingo Molnar
  (?)
@ 2011-05-13 12:19                                 ` Peter Zijlstra
  -1 siblings, 0 replies; 406+ messages in thread
From: Peter Zijlstra @ 2011-05-13 12:19 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: James Morris, Will Drewry, linux-kernel, Steven Rostedt,
	Frederic Weisbecker, Eric Paris, kees.cook, agl, Serge E. Hallyn,
	Ingo Molnar, Andrew Morton, Tejun Heo, Michal Marek,
	Oleg Nesterov, Roland McGrath, Jiri Slaby, David Howells,
	Russell King, Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, Thomas Gleixner, H. Peter Anvin,
	x86, linux-arm-kernel, microblaze-uclinux, linux-mips,
	linuxppc-dev, linux-s390, linux-sh, sparclinux, Linus Torvalds

On Fri, 2011-05-13 at 14:10 +0200, Ingo Molnar wrote:
>         err = event_vfs_getname(result);

I really think we should not do this. Events like we have them should be
inactive, totally passive entities, only observe but not affect
execution (other than the bare minimal time delay introduced by
observance).

If you want another entity that is more active, please invent a new name
for it and create a new subsystem for them, now you could have these
active entities also have an (automatic) passive event side, but that's
some detail.

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-13 12:19                                 ` Peter Zijlstra
  0 siblings, 0 replies; 406+ messages in thread
From: Peter Zijlstra @ 2011-05-13 12:19 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: linux-mips, linux-sh, Frederic Weisbecker, Heiko Carstens,
	Oleg Nesterov, David Howells, Paul Mackerras, Eric Paris,
	H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390, Russell King,
	x86, James Morris, Linus Torvalds, Ingo Molnar, linux-arm-kernel,
	kees.cook, Serge E. Hallyn, microblaze-uclinux, Steven Rostedt,
	Martin Schwidefsky, Thomas Gleixner, Roland McGrath,
	Michal Marek, Michal Simek, Will Drewry, linuxppc-dev,
	linux-kernel, Ralf Baechle, Paul Mundt, Tejun Heo, linux390,
	Andrew Morton, agl, David S. Miller

On Fri, 2011-05-13 at 14:10 +0200, Ingo Molnar wrote:
>         err = event_vfs_getname(result);

I really think we should not do this. Events like we have them should be
inactive, totally passive entities, only observe but not affect
execution (other than the bare minimal time delay introduced by
observance).

If you want another entity that is more active, please invent a new name
for it and create a new subsystem for them, now you could have these
active entities also have an (automatic) passive event side, but that's
some detail.

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-13 12:19                                 ` Peter Zijlstra
  0 siblings, 0 replies; 406+ messages in thread
From: Peter Zijlstra @ 2011-05-13 12:19 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, 2011-05-13 at 14:10 +0200, Ingo Molnar wrote:
>         err = event_vfs_getname(result);

I really think we should not do this. Events like we have them should be
inactive, totally passive entities, only observe but not affect
execution (other than the bare minimal time delay introduced by
observance).

If you want another entity that is more active, please invent a new name
for it and create a new subsystem for them, now you could have these
active entities also have an (automatic) passive event side, but that's
some detail.

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-13 12:19                                 ` Peter Zijlstra
  (?)
@ 2011-05-13 12:26                                   ` Ingo Molnar
  -1 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-13 12:26 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: James Morris, Will Drewry, linux-kernel, Steven Rostedt,
	Frederic Weisbecker, Eric Paris, kees.cook, agl, Serge E. Hallyn,
	Ingo Molnar, Andrew Morton, Tejun Heo, Michal Marek,
	Oleg Nesterov, Roland McGrath, Jiri Slaby, David Howells,
	Russell King, Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, Thomas Gleixner, H. Peter Anvin,
	x86, linux-arm-kernel, microblaze-uclinux, linux-mips,
	linuxppc-dev, linux-s390, linux-sh, sparclinux, Linus Torvalds


* Peter Zijlstra <peterz@infradead.org> wrote:

> On Fri, 2011-05-13 at 14:10 +0200, Ingo Molnar wrote:
> >         err = event_vfs_getname(result);
> 
> I really think we should not do this. Events like we have them should be 
> inactive, totally passive entities, only observe but not affect execution 
> (other than the bare minimal time delay introduced by observance).

Well, this patchset already demonstrates that we can use a single event 
callback for a rather useful purpose.

Either it makes sense to do, in which case we should share facilities as much 
as possible, or it makes no sense, in which case we should not merge it at all.

> If you want another entity that is more active, please invent a new name for 
> it and create a new subsystem for them, now you could have these active 
> entities also have an (automatic) passive event side, but that's some detail.

Why should we have two callbacks next to each other:

	event_vfs_getname(result);
	result = check_event_vfs_getname(result);

if one could do it all?

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-13 12:26                                   ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-13 12:26 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-mips, linux-sh, Frederic Weisbecker, Heiko Carstens,
	Oleg Nesterov, David Howells, Paul Mackerras, Eric Paris,
	H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390, Russell King,
	x86, James Morris, Linus Torvalds, Ingo Molnar, linux-arm-kernel,
	kees.cook, Serge E. Hallyn, microblaze-uclinux, Steven Rostedt,
	Martin Schwidefsky, Thomas Gleixner, Roland McGrath,
	Michal Marek, Michal Simek, Will Drewry, linuxppc-dev,
	linux-kernel, Ralf Baechle, Paul Mundt, Tejun Heo, linux390,
	Andrew Morton, agl, David S. Miller


* Peter Zijlstra <peterz@infradead.org> wrote:

> On Fri, 2011-05-13 at 14:10 +0200, Ingo Molnar wrote:
> >         err = event_vfs_getname(result);
> 
> I really think we should not do this. Events like we have them should be 
> inactive, totally passive entities, only observe but not affect execution 
> (other than the bare minimal time delay introduced by observance).

Well, this patchset already demonstrates that we can use a single event 
callback for a rather useful purpose.

Either it makes sense to do, in which case we should share facilities as much 
as possible, or it makes no sense, in which case we should not merge it at all.

> If you want another entity that is more active, please invent a new name for 
> it and create a new subsystem for them, now you could have these active 
> entities also have an (automatic) passive event side, but that's some detail.

Why should we have two callbacks next to each other:

	event_vfs_getname(result);
	result = check_event_vfs_getname(result);

if one could do it all?

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-13 12:26                                   ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-13 12:26 UTC (permalink / raw)
  To: linux-arm-kernel


* Peter Zijlstra <peterz@infradead.org> wrote:

> On Fri, 2011-05-13 at 14:10 +0200, Ingo Molnar wrote:
> >         err = event_vfs_getname(result);
> 
> I really think we should not do this. Events like we have them should be 
> inactive, totally passive entities, only observe but not affect execution 
> (other than the bare minimal time delay introduced by observance).

Well, this patchset already demonstrates that we can use a single event 
callback for a rather useful purpose.

Either it makes sense to do, in which case we should share facilities as much 
as possible, or it makes no sense, in which case we should not merge it at all.

> If you want another entity that is more active, please invent a new name for 
> it and create a new subsystem for them, now you could have these active 
> entities also have an (automatic) passive event side, but that's some detail.

Why should we have two callbacks next to each other:

	event_vfs_getname(result);
	result = check_event_vfs_getname(result);

if one could do it all?

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-13 12:26                                   ` Ingo Molnar
  (?)
@ 2011-05-13 12:39                                     ` Peter Zijlstra
  -1 siblings, 0 replies; 406+ messages in thread
From: Peter Zijlstra @ 2011-05-13 12:39 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: James Morris, Will Drewry, linux-kernel, Steven Rostedt,
	Frederic Weisbecker, Eric Paris, kees.cook, agl, Serge E. Hallyn,
	Ingo Molnar, Andrew Morton, Tejun Heo, Michal Marek,
	Oleg Nesterov, Roland McGrath, Jiri Slaby, David Howells,
	Russell King, Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, Thomas Gleixner, H. Peter Anvin,
	x86, linux-arm-kernel, microblaze-uclinux, linux-mips,
	linuxppc-dev, linux-s390, linux-sh, sparclinux, Linus Torvalds

On Fri, 2011-05-13 at 14:26 +0200, Ingo Molnar wrote:
> * Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > On Fri, 2011-05-13 at 14:10 +0200, Ingo Molnar wrote:
> > >         err = event_vfs_getname(result);
> > 
> > I really think we should not do this. Events like we have them should be 
> > inactive, totally passive entities, only observe but not affect execution 
> > (other than the bare minimal time delay introduced by observance).
> 
> Well, this patchset already demonstrates that we can use a single event 
> callback for a rather useful purpose.

Can and should are two distinct things.

> Either it makes sense to do, in which case we should share facilities as much 
> as possible, or it makes no sense, in which case we should not merge it at all.

And I'm arguing we should _not_. Observing is radically different from
Affecting, at the very least the two things should have different
permission schemes. We should not confuse these two matters.

> > If you want another entity that is more active, please invent a new name for 
> > it and create a new subsystem for them, now you could have these active 
> > entities also have an (automatic) passive event side, but that's some detail.
> 
> Why should we have two callbacks next to each other:
> 
> 	event_vfs_getname(result);
> 	result = check_event_vfs_getname(result);
> 
> if one could do it all?

Did you actually read the bit where I said that check_event_* (although
I still think that name sucks) could imply a matching event_*?

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-13 12:39                                     ` Peter Zijlstra
  0 siblings, 0 replies; 406+ messages in thread
From: Peter Zijlstra @ 2011-05-13 12:39 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: linux-mips, linux-sh, Frederic Weisbecker, Heiko Carstens,
	Oleg Nesterov, David Howells, Paul Mackerras, Eric Paris,
	H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390, Russell King,
	x86, James Morris, Linus Torvalds, Ingo Molnar, linux-arm-kernel,
	kees.cook, Serge E. Hallyn, microblaze-uclinux, Steven Rostedt,
	Martin Schwidefsky, Thomas Gleixner, Roland McGrath,
	Michal Marek, Michal Simek, Will Drewry, linuxppc-dev,
	linux-kernel, Ralf Baechle, Paul Mundt, Tejun Heo, linux390,
	Andrew Morton, agl, David S. Miller

On Fri, 2011-05-13 at 14:26 +0200, Ingo Molnar wrote:
> * Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > On Fri, 2011-05-13 at 14:10 +0200, Ingo Molnar wrote:
> > >         err = event_vfs_getname(result);
> > 
> > I really think we should not do this. Events like we have them should be 
> > inactive, totally passive entities, only observe but not affect execution 
> > (other than the bare minimal time delay introduced by observance).
> 
> Well, this patchset already demonstrates that we can use a single event 
> callback for a rather useful purpose.

Can and should are two distinct things.

> Either it makes sense to do, in which case we should share facilities as much 
> as possible, or it makes no sense, in which case we should not merge it at all.

And I'm arguing we should _not_. Observing is radically different from
Affecting, at the very least the two things should have different
permission schemes. We should not confuse these two matters.

> > If you want another entity that is more active, please invent a new name for 
> > it and create a new subsystem for them, now you could have these active 
> > entities also have an (automatic) passive event side, but that's some detail.
> 
> Why should we have two callbacks next to each other:
> 
> 	event_vfs_getname(result);
> 	result = check_event_vfs_getname(result);
> 
> if one could do it all?

Did you actually read the bit where I said that check_event_* (although
I still think that name sucks) could imply a matching event_*?

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-13 12:39                                     ` Peter Zijlstra
  0 siblings, 0 replies; 406+ messages in thread
From: Peter Zijlstra @ 2011-05-13 12:39 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, 2011-05-13 at 14:26 +0200, Ingo Molnar wrote:
> * Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > On Fri, 2011-05-13 at 14:10 +0200, Ingo Molnar wrote:
> > >         err = event_vfs_getname(result);
> > 
> > I really think we should not do this. Events like we have them should be 
> > inactive, totally passive entities, only observe but not affect execution 
> > (other than the bare minimal time delay introduced by observance).
> 
> Well, this patchset already demonstrates that we can use a single event 
> callback for a rather useful purpose.

Can and should are two distinct things.

> Either it makes sense to do, in which case we should share facilities as much 
> as possible, or it makes no sense, in which case we should not merge it at all.

And I'm arguing we should _not_. Observing is radically different from
Affecting, at the very least the two things should have different
permission schemes. We should not confuse these two matters.

> > If you want another entity that is more active, please invent a new name for 
> > it and create a new subsystem for them, now you could have these active 
> > entities also have an (automatic) passive event side, but that's some detail.
> 
> Why should we have two callbacks next to each other:
> 
> 	event_vfs_getname(result);
> 	result = check_event_vfs_getname(result);
> 
> if one could do it all?

Did you actually read the bit where I said that check_event_* (although
I still think that name sucks) could imply a matching event_*?

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-13 12:39                                     ` Peter Zijlstra
  (?)
@ 2011-05-13 12:43                                       ` Peter Zijlstra
  -1 siblings, 0 replies; 406+ messages in thread
From: Peter Zijlstra @ 2011-05-13 12:43 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: James Morris, Will Drewry, linux-kernel, Steven Rostedt,
	Frederic Weisbecker, Eric Paris, kees.cook, agl, Serge E. Hallyn,
	Ingo Molnar, Andrew Morton, Tejun Heo, Michal Marek,
	Oleg Nesterov, Roland McGrath, Jiri Slaby, David Howells,
	Russell King, Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, Thomas Gleixner, H. Peter Anvin,
	x86, linux-arm-kernel, microblaze-uclinux, linux-mips,
	linuxppc-dev, linux-s390, linux-sh, sparclinux, Linus Torvalds

On Fri, 2011-05-13 at 14:39 +0200, Peter Zijlstra wrote:
> 
> >       event_vfs_getname(result);
> >       result = check_event_vfs_getname(result); 

Another fundamental difference is how to treat the callback chains for
these two.

Observers won't have a return value and are assumed to never fail,
therefore we can always call every entry on the callback list.

Active things otoh do have a return value, and thus we need to have
semantics that define what to do with that during callback iteration,
when to continue and when to break. Thus for active elements its
impossible to guarantee all entries will indeed be called.

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-13 12:43                                       ` Peter Zijlstra
  0 siblings, 0 replies; 406+ messages in thread
From: Peter Zijlstra @ 2011-05-13 12:43 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: linux-mips, linux-sh, Frederic Weisbecker, Heiko Carstens,
	Oleg Nesterov, David Howells, Paul Mackerras, Eric Paris,
	H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390, Russell King,
	x86, James Morris, Linus Torvalds, Ingo Molnar, linux-arm-kernel,
	kees.cook, Serge E. Hallyn, microblaze-uclinux, Steven Rostedt,
	Martin Schwidefsky, Thomas Gleixner, Roland McGrath,
	Michal Marek, Michal Simek, Will Drewry, linuxppc-dev,
	linux-kernel, Ralf Baechle, Paul Mundt, Tejun Heo, linux390,
	Andrew Morton, agl, David S. Miller

On Fri, 2011-05-13 at 14:39 +0200, Peter Zijlstra wrote:
> 
> >       event_vfs_getname(result);
> >       result = check_event_vfs_getname(result); 

Another fundamental difference is how to treat the callback chains for
these two.

Observers won't have a return value and are assumed to never fail,
therefore we can always call every entry on the callback list.

Active things otoh do have a return value, and thus we need to have
semantics that define what to do with that during callback iteration,
when to continue and when to break. Thus for active elements its
impossible to guarantee all entries will indeed be called.

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-13 12:43                                       ` Peter Zijlstra
  0 siblings, 0 replies; 406+ messages in thread
From: Peter Zijlstra @ 2011-05-13 12:43 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, 2011-05-13 at 14:39 +0200, Peter Zijlstra wrote:
> 
> >       event_vfs_getname(result);
> >       result = check_event_vfs_getname(result); 

Another fundamental difference is how to treat the callback chains for
these two.

Observers won't have a return value and are assumed to never fail,
therefore we can always call every entry on the callback list.

Active things otoh do have a return value, and thus we need to have
semantics that define what to do with that during callback iteration,
when to continue and when to break. Thus for active elements its
impossible to guarantee all entries will indeed be called.

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-13 12:39                                     ` Peter Zijlstra
  (?)
@ 2011-05-13 12:49                                       ` Ingo Molnar
  -1 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-13 12:49 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: James Morris, Will Drewry, linux-kernel, Steven Rostedt,
	Frederic Weisbecker, Eric Paris, kees.cook, agl, Serge E. Hallyn,
	Ingo Molnar, Andrew Morton, Tejun Heo, Michal Marek,
	Oleg Nesterov, Roland McGrath, Jiri Slaby, David Howells,
	Russell King, Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, Thomas Gleixner, H. Peter Anvin,
	x86, linux-arm-kernel, microblaze-uclinux, linux-mips,
	linuxppc-dev, linux-s390, linux-sh, sparclinux, Linus Torvalds


* Peter Zijlstra <peterz@infradead.org> wrote:

> > Why should we have two callbacks next to each other:
> > 
> > 	event_vfs_getname(result);
> > 	result = check_event_vfs_getname(result);
> > 
> > if one could do it all?
> 
> Did you actually read the bit where I said that check_event_* (although
> I still think that name sucks) could imply a matching event_*?

No, did not notice that - and yes that solves this particular problem.

So given that by your own admission it makes sense to share the facilities at 
the low level, i also argue that it makes sense to share as high up as 
possible.

Are you perhaps arguing for a ->observe flag that would make 100% sure that the 
default behavior for events is observe-only? That would make sense indeed.

Otherwise both cases really want to use all the same facilities for event 
discovery, setup, control and potential extraction of events.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-13 12:49                                       ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-13 12:49 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-mips, linux-sh, Frederic Weisbecker, Heiko Carstens,
	Oleg Nesterov, David Howells, Paul Mackerras, Eric Paris,
	H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390, Russell King,
	x86, James Morris, Linus Torvalds, Ingo Molnar, linux-arm-kernel,
	kees.cook, Serge E. Hallyn, microblaze-uclinux, Steven Rostedt,
	Martin Schwidefsky, Thomas Gleixner, Roland McGrath,
	Michal Marek, Michal Simek, Will Drewry, linuxppc-dev,
	linux-kernel, Ralf Baechle, Paul Mundt, Tejun Heo, linux390,
	Andrew Morton, agl, David S. Miller


* Peter Zijlstra <peterz@infradead.org> wrote:

> > Why should we have two callbacks next to each other:
> > 
> > 	event_vfs_getname(result);
> > 	result = check_event_vfs_getname(result);
> > 
> > if one could do it all?
> 
> Did you actually read the bit where I said that check_event_* (although
> I still think that name sucks) could imply a matching event_*?

No, did not notice that - and yes that solves this particular problem.

So given that by your own admission it makes sense to share the facilities at 
the low level, i also argue that it makes sense to share as high up as 
possible.

Are you perhaps arguing for a ->observe flag that would make 100% sure that the 
default behavior for events is observe-only? That would make sense indeed.

Otherwise both cases really want to use all the same facilities for event 
discovery, setup, control and potential extraction of events.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-13 12:49                                       ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-13 12:49 UTC (permalink / raw)
  To: linux-arm-kernel


* Peter Zijlstra <peterz@infradead.org> wrote:

> > Why should we have two callbacks next to each other:
> > 
> > 	event_vfs_getname(result);
> > 	result = check_event_vfs_getname(result);
> > 
> > if one could do it all?
> 
> Did you actually read the bit where I said that check_event_* (although
> I still think that name sucks) could imply a matching event_*?

No, did not notice that - and yes that solves this particular problem.

So given that by your own admission it makes sense to share the facilities at 
the low level, i also argue that it makes sense to share as high up as 
possible.

Are you perhaps arguing for a ->observe flag that would make 100% sure that the 
default behavior for events is observe-only? That would make sense indeed.

Otherwise both cases really want to use all the same facilities for event 
discovery, setup, control and potential extraction of events.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-13 12:43                                       ` Peter Zijlstra
  (?)
@ 2011-05-13 12:54                                         ` Ingo Molnar
  -1 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-13 12:54 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: James Morris, Will Drewry, linux-kernel, Steven Rostedt,
	Frederic Weisbecker, Eric Paris, kees.cook, agl, Serge E. Hallyn,
	Ingo Molnar, Andrew Morton, Tejun Heo, Michal Marek,
	Oleg Nesterov, Roland McGrath, Jiri Slaby, David Howells,
	Russell King, Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, Thomas Gleixner, H. Peter Anvin,
	x86, linux-arm-kernel, microblaze-uclinux, linux-mips,
	linuxppc-dev, linux-s390, linux-sh, sparclinux, Linus Torvalds


* Peter Zijlstra <peterz@infradead.org> wrote:

> On Fri, 2011-05-13 at 14:39 +0200, Peter Zijlstra wrote:
> > 
> > >       event_vfs_getname(result);
> > >       result = check_event_vfs_getname(result); 
> 
> Another fundamental difference is how to treat the callback chains for
> these two.
> 
> Observers won't have a return value and are assumed to never fail,
> therefore we can always call every entry on the callback list.
> 
> Active things otoh do have a return value, and thus we need to have
> semantics that define what to do with that during callback iteration,
> when to continue and when to break. Thus for active elements its
> impossible to guarantee all entries will indeed be called.

I think the sanest semantics is to run all active callbacks as well.

For example if this is used for three stacked security policies - as if 3 LSM 
modules were stacked at once. We'd call all three, and we'd determine that at 
least one failed - and we'd return a failure.

Even if the first one failed already we'd still want to trigger *all* the 
failures, because security policies like to know when they have triggered a 
failure (regardless of other active policies) and want to see that failure 
event (if they are logging such events).

So to me this looks pretty similar to observer callbacks as well, it's the 
natural extension to an observer callback chain.

Observer callbacks are simply constant functions (to the caller), those which 
never return failure and which never modify any of the parameters.

It's as if you argued that there should be separate syscalls/facilities for 
handling readonly files versus handling read/write files.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-13 12:54                                         ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-13 12:54 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-mips, linux-sh, Frederic Weisbecker, Heiko Carstens,
	Oleg Nesterov, David Howells, Paul Mackerras, Eric Paris,
	H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390, Russell King,
	x86, James Morris, Linus Torvalds, Ingo Molnar, linux-arm-kernel,
	kees.cook, Serge E. Hallyn, microblaze-uclinux, Steven Rostedt,
	Martin Schwidefsky, Thomas Gleixner, Roland McGrath,
	Michal Marek, Michal Simek, Will Drewry, linuxppc-dev,
	linux-kernel, Ralf Baechle, Paul Mundt, Tejun Heo, linux390,
	Andrew Morton, agl, David S. Miller


* Peter Zijlstra <peterz@infradead.org> wrote:

> On Fri, 2011-05-13 at 14:39 +0200, Peter Zijlstra wrote:
> > 
> > >       event_vfs_getname(result);
> > >       result = check_event_vfs_getname(result); 
> 
> Another fundamental difference is how to treat the callback chains for
> these two.
> 
> Observers won't have a return value and are assumed to never fail,
> therefore we can always call every entry on the callback list.
> 
> Active things otoh do have a return value, and thus we need to have
> semantics that define what to do with that during callback iteration,
> when to continue and when to break. Thus for active elements its
> impossible to guarantee all entries will indeed be called.

I think the sanest semantics is to run all active callbacks as well.

For example if this is used for three stacked security policies - as if 3 LSM 
modules were stacked at once. We'd call all three, and we'd determine that at 
least one failed - and we'd return a failure.

Even if the first one failed already we'd still want to trigger *all* the 
failures, because security policies like to know when they have triggered a 
failure (regardless of other active policies) and want to see that failure 
event (if they are logging such events).

So to me this looks pretty similar to observer callbacks as well, it's the 
natural extension to an observer callback chain.

Observer callbacks are simply constant functions (to the caller), those which 
never return failure and which never modify any of the parameters.

It's as if you argued that there should be separate syscalls/facilities for 
handling readonly files versus handling read/write files.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-13 12:54                                         ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-13 12:54 UTC (permalink / raw)
  To: linux-arm-kernel


* Peter Zijlstra <peterz@infradead.org> wrote:

> On Fri, 2011-05-13 at 14:39 +0200, Peter Zijlstra wrote:
> > 
> > >       event_vfs_getname(result);
> > >       result = check_event_vfs_getname(result); 
> 
> Another fundamental difference is how to treat the callback chains for
> these two.
> 
> Observers won't have a return value and are assumed to never fail,
> therefore we can always call every entry on the callback list.
> 
> Active things otoh do have a return value, and thus we need to have
> semantics that define what to do with that during callback iteration,
> when to continue and when to break. Thus for active elements its
> impossible to guarantee all entries will indeed be called.

I think the sanest semantics is to run all active callbacks as well.

For example if this is used for three stacked security policies - as if 3 LSM 
modules were stacked at once. We'd call all three, and we'd determine that at 
least one failed - and we'd return a failure.

Even if the first one failed already we'd still want to trigger *all* the 
failures, because security policies like to know when they have triggered a 
failure (regardless of other active policies) and want to see that failure 
event (if they are logging such events).

So to me this looks pretty similar to observer callbacks as well, it's the 
natural extension to an observer callback chain.

Observer callbacks are simply constant functions (to the caller), those which 
never return failure and which never modify any of the parameters.

It's as if you argued that there should be separate syscalls/facilities for 
handling readonly files versus handling read/write files.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-13 12:54                                         ` Ingo Molnar
  (?)
@ 2011-05-13 13:08                                           ` Peter Zijlstra
  -1 siblings, 0 replies; 406+ messages in thread
From: Peter Zijlstra @ 2011-05-13 13:08 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: James Morris, Will Drewry, linux-kernel, Steven Rostedt,
	Frederic Weisbecker, Eric Paris, kees.cook, agl, Serge E. Hallyn,
	Ingo Molnar, Andrew Morton, Tejun Heo, Michal Marek,
	Oleg Nesterov, Roland McGrath, Jiri Slaby, David Howells,
	Russell King, Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, Thomas Gleixner, H. Peter Anvin,
	x86, linux-arm-kernel, microblaze-uclinux, linux-mips,
	linuxppc-dev, linux-s390, linux-sh, sparclinux, Linus Torvalds

On Fri, 2011-05-13 at 14:54 +0200, Ingo Molnar wrote:
> I think the sanest semantics is to run all active callbacks as well.
> 
> For example if this is used for three stacked security policies - as if 3 LSM 
> modules were stacked at once. We'd call all three, and we'd determine that at 
> least one failed - and we'd return a failure. 

But that only works for boolean functions where you can return the
multi-bit-or of the result. What if you need to return the specific
error code.

Also, there's bound to be other cases where people will want to employ
this, look at all the various notifier chain muck we've got, it already
deals with much of this -- simply because users need it.

Then there's the whole indirection argument, if you don't need
indirection, its often better to not use it, I myself much prefer code
to look like:

   foo1(bar);
   foo2(bar);
   foo3(bar);

Than:

   foo_notifier(bar);

Simply because its much clearer who all are involved without me having
to grep around to see who registers for foo_notifier and wth they do
with it. It also makes it much harder to sneak in another user, whereas
its nearly impossible to find new notifier users.

Its also much faster, no extra memory accesses, no indirect function
calls, no other muck.

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-13 13:08                                           ` Peter Zijlstra
  0 siblings, 0 replies; 406+ messages in thread
From: Peter Zijlstra @ 2011-05-13 13:08 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: linux-mips, linux-sh, Frederic Weisbecker, Heiko Carstens,
	Oleg Nesterov, David Howells, Paul Mackerras, Eric Paris,
	H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390, Russell King,
	x86, James Morris, Linus Torvalds, Ingo Molnar, linux-arm-kernel,
	kees.cook, Serge E. Hallyn, microblaze-uclinux, Steven Rostedt,
	Martin Schwidefsky, Thomas Gleixner, Roland McGrath,
	Michal Marek, Michal Simek, Will Drewry, linuxppc-dev,
	linux-kernel, Ralf Baechle, Paul Mundt, Tejun Heo, linux390,
	Andrew Morton, agl, David S. Miller

On Fri, 2011-05-13 at 14:54 +0200, Ingo Molnar wrote:
> I think the sanest semantics is to run all active callbacks as well.
> 
> For example if this is used for three stacked security policies - as if 3 LSM 
> modules were stacked at once. We'd call all three, and we'd determine that at 
> least one failed - and we'd return a failure. 

But that only works for boolean functions where you can return the
multi-bit-or of the result. What if you need to return the specific
error code.

Also, there's bound to be other cases where people will want to employ
this, look at all the various notifier chain muck we've got, it already
deals with much of this -- simply because users need it.

Then there's the whole indirection argument, if you don't need
indirection, its often better to not use it, I myself much prefer code
to look like:

   foo1(bar);
   foo2(bar);
   foo3(bar);

Than:

   foo_notifier(bar);

Simply because its much clearer who all are involved without me having
to grep around to see who registers for foo_notifier and wth they do
with it. It also makes it much harder to sneak in another user, whereas
its nearly impossible to find new notifier users.

Its also much faster, no extra memory accesses, no indirect function
calls, no other muck.

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-13 13:08                                           ` Peter Zijlstra
  0 siblings, 0 replies; 406+ messages in thread
From: Peter Zijlstra @ 2011-05-13 13:08 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, 2011-05-13 at 14:54 +0200, Ingo Molnar wrote:
> I think the sanest semantics is to run all active callbacks as well.
> 
> For example if this is used for three stacked security policies - as if 3 LSM 
> modules were stacked at once. We'd call all three, and we'd determine that at 
> least one failed - and we'd return a failure. 

But that only works for boolean functions where you can return the
multi-bit-or of the result. What if you need to return the specific
error code.

Also, there's bound to be other cases where people will want to employ
this, look at all the various notifier chain muck we've got, it already
deals with much of this -- simply because users need it.

Then there's the whole indirection argument, if you don't need
indirection, its often better to not use it, I myself much prefer code
to look like:

   foo1(bar);
   foo2(bar);
   foo3(bar);

Than:

   foo_notifier(bar);

Simply because its much clearer who all are involved without me having
to grep around to see who registers for foo_notifier and wth they do
with it. It also makes it much harder to sneak in another user, whereas
its nearly impossible to find new notifier users.

Its also much faster, no extra memory accesses, no indirect function
calls, no other muck.

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-13 13:08                                           ` Peter Zijlstra
  (?)
@ 2011-05-13 13:18                                             ` Ingo Molnar
  -1 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-13 13:18 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: James Morris, Will Drewry, linux-kernel, Steven Rostedt,
	Frederic Weisbecker, Eric Paris, kees.cook, agl, Serge E. Hallyn,
	Ingo Molnar, Andrew Morton, Tejun Heo, Michal Marek,
	Oleg Nesterov, Roland McGrath, Jiri Slaby, David Howells,
	Russell King, Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, Thomas Gleixner, H. Peter Anvin,
	x86, linux-arm-kernel, microblaze-uclinux, linux-mips,
	linuxppc-dev, linux-s390, linux-sh, sparclinux, Linus Torvalds


* Peter Zijlstra <peterz@infradead.org> wrote:

> On Fri, 2011-05-13 at 14:54 +0200, Ingo Molnar wrote:
> > I think the sanest semantics is to run all active callbacks as well.
> > 
> > For example if this is used for three stacked security policies - as if 3 LSM 
> > modules were stacked at once. We'd call all three, and we'd determine that at 
> > least one failed - and we'd return a failure. 
> 
> But that only works for boolean functions where you can return the
> multi-bit-or of the result. What if you need to return the specific
> error code.

Do you mean that one filter returns -EINVAL while the other -EACCES?

Seems like a non-problem to me, we'd return the first nonzero value.

> Also, there's bound to be other cases where people will want to employ
> this, look at all the various notifier chain muck we've got, it already
> deals with much of this -- simply because users need it.

Do you mean it would be easy to abuse it? What kind of abuse are you most 
worried about?

> Then there's the whole indirection argument, if you don't need
> indirection, its often better to not use it, I myself much prefer code
> to look like:
> 
>    foo1(bar);
>    foo2(bar);
>    foo3(bar);
> 
> Than:
> 
>    foo_notifier(bar);
> 
> Simply because its much clearer who all are involved without me having
> to grep around to see who registers for foo_notifier and wth they do
> with it. It also makes it much harder to sneak in another user, whereas
> its nearly impossible to find new notifier users.
> 
> Its also much faster, no extra memory accesses, no indirect function
> calls, no other muck.

But i suspect this question has been settled, given the fact that even pure 
observer events need and already process a chain of events? Am i missing 
something about your argument?

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-13 13:18                                             ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-13 13:18 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-mips, linux-sh, Frederic Weisbecker, Heiko Carstens,
	Oleg Nesterov, David Howells, Paul Mackerras, Eric Paris,
	H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390, Russell King,
	x86, James Morris, Linus Torvalds, Ingo Molnar, linux-arm-kernel,
	kees.cook, Serge E. Hallyn, microblaze-uclinux, Steven Rostedt,
	Martin Schwidefsky, Thomas Gleixner, Roland McGrath,
	Michal Marek, Michal Simek, Will Drewry, linuxppc-dev,
	linux-kernel, Ralf Baechle, Paul Mundt, Tejun Heo, linux390,
	Andrew Morton, agl, David S. Miller


* Peter Zijlstra <peterz@infradead.org> wrote:

> On Fri, 2011-05-13 at 14:54 +0200, Ingo Molnar wrote:
> > I think the sanest semantics is to run all active callbacks as well.
> > 
> > For example if this is used for three stacked security policies - as if 3 LSM 
> > modules were stacked at once. We'd call all three, and we'd determine that at 
> > least one failed - and we'd return a failure. 
> 
> But that only works for boolean functions where you can return the
> multi-bit-or of the result. What if you need to return the specific
> error code.

Do you mean that one filter returns -EINVAL while the other -EACCES?

Seems like a non-problem to me, we'd return the first nonzero value.

> Also, there's bound to be other cases where people will want to employ
> this, look at all the various notifier chain muck we've got, it already
> deals with much of this -- simply because users need it.

Do you mean it would be easy to abuse it? What kind of abuse are you most 
worried about?

> Then there's the whole indirection argument, if you don't need
> indirection, its often better to not use it, I myself much prefer code
> to look like:
> 
>    foo1(bar);
>    foo2(bar);
>    foo3(bar);
> 
> Than:
> 
>    foo_notifier(bar);
> 
> Simply because its much clearer who all are involved without me having
> to grep around to see who registers for foo_notifier and wth they do
> with it. It also makes it much harder to sneak in another user, whereas
> its nearly impossible to find new notifier users.
> 
> Its also much faster, no extra memory accesses, no indirect function
> calls, no other muck.

But i suspect this question has been settled, given the fact that even pure 
observer events need and already process a chain of events? Am i missing 
something about your argument?

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-13 13:18                                             ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-13 13:18 UTC (permalink / raw)
  To: linux-arm-kernel


* Peter Zijlstra <peterz@infradead.org> wrote:

> On Fri, 2011-05-13 at 14:54 +0200, Ingo Molnar wrote:
> > I think the sanest semantics is to run all active callbacks as well.
> > 
> > For example if this is used for three stacked security policies - as if 3 LSM 
> > modules were stacked at once. We'd call all three, and we'd determine that at 
> > least one failed - and we'd return a failure. 
> 
> But that only works for boolean functions where you can return the
> multi-bit-or of the result. What if you need to return the specific
> error code.

Do you mean that one filter returns -EINVAL while the other -EACCES?

Seems like a non-problem to me, we'd return the first nonzero value.

> Also, there's bound to be other cases where people will want to employ
> this, look at all the various notifier chain muck we've got, it already
> deals with much of this -- simply because users need it.

Do you mean it would be easy to abuse it? What kind of abuse are you most 
worried about?

> Then there's the whole indirection argument, if you don't need
> indirection, its often better to not use it, I myself much prefer code
> to look like:
> 
>    foo1(bar);
>    foo2(bar);
>    foo3(bar);
> 
> Than:
> 
>    foo_notifier(bar);
> 
> Simply because its much clearer who all are involved without me having
> to grep around to see who registers for foo_notifier and wth they do
> with it. It also makes it much harder to sneak in another user, whereas
> its nearly impossible to find new notifier users.
> 
> Its also much faster, no extra memory accesses, no indirect function
> calls, no other muck.

But i suspect this question has been settled, given the fact that even pure 
observer events need and already process a chain of events? Am i missing 
something about your argument?

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-13 13:18                                             ` Ingo Molnar
  (?)
@ 2011-05-13 13:55                                               ` Peter Zijlstra
  -1 siblings, 0 replies; 406+ messages in thread
From: Peter Zijlstra @ 2011-05-13 13:55 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: James Morris, Will Drewry, linux-kernel, Steven Rostedt,
	Frederic Weisbecker, Eric Paris, kees.cook, agl, Serge E. Hallyn,
	Ingo Molnar, Andrew Morton, Tejun Heo, Michal Marek,
	Oleg Nesterov, Roland McGrath, Jiri Slaby, David Howells,
	Russell King, Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, Thomas Gleixner, H. Peter Anvin,
	x86, linux-arm-kernel, linux-mips, linuxppc-dev, linux-s390,
	linux-sh, sparclinux, Linus Torvalds

Cut the microblaze list since it's bouncy.

On Fri, 2011-05-13 at 15:18 +0200, Ingo Molnar wrote:
> * Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > On Fri, 2011-05-13 at 14:54 +0200, Ingo Molnar wrote:
> > > I think the sanest semantics is to run all active callbacks as well.
> > > 
> > > For example if this is used for three stacked security policies - as if 3 LSM 
> > > modules were stacked at once. We'd call all three, and we'd determine that at 
> > > least one failed - and we'd return a failure. 
> > 
> > But that only works for boolean functions where you can return the
> > multi-bit-or of the result. What if you need to return the specific
> > error code.
> 
> Do you mean that one filter returns -EINVAL while the other -EACCES?
> 
> Seems like a non-problem to me, we'd return the first nonzero value.

Assuming the first is -EINVAL, what then is the value in computing the
-EACCES? Sounds like a massive waste of time to me.

> > Also, there's bound to be other cases where people will want to employ
> > this, look at all the various notifier chain muck we've got, it already
> > deals with much of this -- simply because users need it.
> 
> Do you mean it would be easy to abuse it? What kind of abuse are you most 
> worried about?

I'm not worried about abuse, I'm saying that going by the existing
notifier pattern always visiting all entries on the callback list is
undesired.

> > Then there's the whole indirection argument, if you don't need
> > indirection, its often better to not use it, I myself much prefer code
> > to look like:
> > 
> >    foo1(bar);
> >    foo2(bar);
> >    foo3(bar);
> > 
> > Than:
> > 
> >    foo_notifier(bar);
> > 
> > Simply because its much clearer who all are involved without me having
> > to grep around to see who registers for foo_notifier and wth they do
> > with it. It also makes it much harder to sneak in another user, whereas
> > its nearly impossible to find new notifier users.
> > 
> > Its also much faster, no extra memory accesses, no indirect function
> > calls, no other muck.
> 
> But i suspect this question has been settled, given the fact that even pure 
> observer events need and already process a chain of events? Am i missing 
> something about your argument?

I'm saying that there's reasons to not use notifiers passive or active.

Mostly the whole notifier/indirection muck comes up once you want
modules to make use of the thing, because then you need dynamic
management of the callback list.

(Then again, I'm fairly glad we don't have explicit callbacks in
kernel/cpu.c for all the cpu-hotplug callbacks :-)

Anyway, I oppose for the existing events to gain an active role.

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-13 13:55                                               ` Peter Zijlstra
  0 siblings, 0 replies; 406+ messages in thread
From: Peter Zijlstra @ 2011-05-13 13:55 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: linux-mips, linux-sh, Frederic Weisbecker, Heiko Carstens,
	Oleg Nesterov, David Howells, Paul Mackerras, Eric Paris,
	H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390, Russell King,
	x86, James Morris, Linus Torvalds, Ingo Molnar, linux-arm-kernel,
	kees.cook, Serge E. Hallyn, Steven Rostedt, Martin Schwidefsky,
	Thomas Gleixner, Roland McGrath, Michal Marek, Michal Simek,
	Will Drewry, linuxppc-dev, linux-kernel, Ralf Baechle,
	Paul Mundt, Tejun Heo, linux390, Andrew Morton, agl,
	David S. Miller

Cut the microblaze list since it's bouncy.

On Fri, 2011-05-13 at 15:18 +0200, Ingo Molnar wrote:
> * Peter Zijlstra <peterz@infradead.org> wrote:
>
> > On Fri, 2011-05-13 at 14:54 +0200, Ingo Molnar wrote:
> > > I think the sanest semantics is to run all active callbacks as well.
> > >
> > > For example if this is used for three stacked security policies - as if 3 LSM
> > > modules were stacked at once. We'd call all three, and we'd determine that at
> > > least one failed - and we'd return a failure.
> >
> > But that only works for boolean functions where you can return the
> > multi-bit-or of the result. What if you need to return the specific
> > error code.
>
> Do you mean that one filter returns -EINVAL while the other -EACCES?
>
> Seems like a non-problem to me, we'd return the first nonzero value.

Assuming the first is -EINVAL, what then is the value in computing the
-EACCES? Sounds like a massive waste of time to me.

> > Also, there's bound to be other cases where people will want to employ
> > this, look at all the various notifier chain muck we've got, it already
> > deals with much of this -- simply because users need it.
>
> Do you mean it would be easy to abuse it? What kind of abuse are you most
> worried about?

I'm not worried about abuse, I'm saying that going by the existing
notifier pattern always visiting all entries on the callback list is
undesired.

> > Then there's the whole indirection argument, if you don't need
> > indirection, its often better to not use it, I myself much prefer code
> > to look like:
> >
> >    foo1(bar);
> >    foo2(bar);
> >    foo3(bar);
> >
> > Than:
> >
> >    foo_notifier(bar);
> >
> > Simply because its much clearer who all are involved without me having
> > to grep around to see who registers for foo_notifier and wth they do
> > with it. It also makes it much harder to sneak in another user, whereas
> > its nearly impossible to find new notifier users.
> >
> > Its also much faster, no extra memory accesses, no indirect function
> > calls, no other muck.
>
> But i suspect this question has been settled, given the fact that even pure
> observer events need and already process a chain of events? Am i missing
> something about your argument?

I'm saying that there's reasons to not use notifiers passive or active.

Mostly the whole notifier/indirection muck comes up once you want
modules to make use of the thing, because then you need dynamic
management of the callback list.

(Then again, I'm fairly glad we don't have explicit callbacks in
kernel/cpu.c for all the cpu-hotplug callbacks :-)

Anyway, I oppose for the existing events to gain an active role.

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-13 13:55                                               ` Peter Zijlstra
  0 siblings, 0 replies; 406+ messages in thread
From: Peter Zijlstra @ 2011-05-13 13:55 UTC (permalink / raw)
  To: linux-arm-kernel

Cut the microblaze list since it's bouncy.

On Fri, 2011-05-13 at 15:18 +0200, Ingo Molnar wrote:
> * Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > On Fri, 2011-05-13 at 14:54 +0200, Ingo Molnar wrote:
> > > I think the sanest semantics is to run all active callbacks as well.
> > > 
> > > For example if this is used for three stacked security policies - as if 3 LSM 
> > > modules were stacked at once. We'd call all three, and we'd determine that at 
> > > least one failed - and we'd return a failure. 
> > 
> > But that only works for boolean functions where you can return the
> > multi-bit-or of the result. What if you need to return the specific
> > error code.
> 
> Do you mean that one filter returns -EINVAL while the other -EACCES?
> 
> Seems like a non-problem to me, we'd return the first nonzero value.

Assuming the first is -EINVAL, what then is the value in computing the
-EACCES? Sounds like a massive waste of time to me.

> > Also, there's bound to be other cases where people will want to employ
> > this, look at all the various notifier chain muck we've got, it already
> > deals with much of this -- simply because users need it.
> 
> Do you mean it would be easy to abuse it? What kind of abuse are you most 
> worried about?

I'm not worried about abuse, I'm saying that going by the existing
notifier pattern always visiting all entries on the callback list is
undesired.

> > Then there's the whole indirection argument, if you don't need
> > indirection, its often better to not use it, I myself much prefer code
> > to look like:
> > 
> >    foo1(bar);
> >    foo2(bar);
> >    foo3(bar);
> > 
> > Than:
> > 
> >    foo_notifier(bar);
> > 
> > Simply because its much clearer who all are involved without me having
> > to grep around to see who registers for foo_notifier and wth they do
> > with it. It also makes it much harder to sneak in another user, whereas
> > its nearly impossible to find new notifier users.
> > 
> > Its also much faster, no extra memory accesses, no indirect function
> > calls, no other muck.
> 
> But i suspect this question has been settled, given the fact that even pure 
> observer events need and already process a chain of events? Am i missing 
> something about your argument?

I'm saying that there's reasons to not use notifiers passive or active.

Mostly the whole notifier/indirection muck comes up once you want
modules to make use of the thing, because then you need dynamic
management of the callback list.

(Then again, I'm fairly glad we don't have explicit callbacks in
kernel/cpu.c for all the cpu-hotplug callbacks :-)

Anyway, I oppose for the existing events to gain an active role.

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-13 12:49                                       ` Ingo Molnar
  (?)
@ 2011-05-13 13:55                                         ` Peter Zijlstra
  -1 siblings, 0 replies; 406+ messages in thread
From: Peter Zijlstra @ 2011-05-13 13:55 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: James Morris, Will Drewry, linux-kernel, Steven Rostedt,
	Frederic Weisbecker, Eric Paris, kees.cook, agl, Serge E. Hallyn,
	Ingo Molnar, Andrew Morton, Tejun Heo, Michal Marek,
	Oleg Nesterov, Roland McGrath, Jiri Slaby, David Howells,
	Russell King, Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, Thomas Gleixner, H. Peter Anvin,
	x86, linux-arm-kernel, microblaze-uclinux, linux-mips,
	linuxppc-dev, linux-s390, linux-sh, sparclinux, Linus Torvalds

On Fri, 2011-05-13 at 14:49 +0200, Ingo Molnar wrote:
> 
> So given that by your own admission it makes sense to share the facilities at 
> the low level, i also argue that it makes sense to share as high up as 
> possible. 

I'm not saying any such thing, I'm saying that it might make sense to
observe active objects and auto-create these observation points. That
doesn't make them similar or make them share anything.

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-13 13:55                                         ` Peter Zijlstra
  0 siblings, 0 replies; 406+ messages in thread
From: Peter Zijlstra @ 2011-05-13 13:55 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: linux-mips, linux-sh, Frederic Weisbecker, Heiko Carstens,
	Oleg Nesterov, David Howells, Paul Mackerras, Eric Paris,
	H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390, Russell King,
	x86, James Morris, Linus Torvalds, Ingo Molnar, linux-arm-kernel,
	kees.cook, Serge E. Hallyn, microblaze-uclinux, Steven Rostedt,
	Martin Schwidefsky, Thomas Gleixner, Roland McGrath,
	Michal Marek, Michal Simek, Will Drewry, linuxppc-dev,
	linux-kernel, Ralf Baechle, Paul Mundt, Tejun Heo, linux390,
	Andrew Morton, agl, David S. Miller

On Fri, 2011-05-13 at 14:49 +0200, Ingo Molnar wrote:
>
> So given that by your own admission it makes sense to share the facilities at
> the low level, i also argue that it makes sense to share as high up as
> possible.

I'm not saying any such thing, I'm saying that it might make sense to
observe active objects and auto-create these observation points. That
doesn't make them similar or make them share anything.

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-13 13:55                                         ` Peter Zijlstra
  0 siblings, 0 replies; 406+ messages in thread
From: Peter Zijlstra @ 2011-05-13 13:55 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, 2011-05-13 at 14:49 +0200, Ingo Molnar wrote:
> 
> So given that by your own admission it makes sense to share the facilities at 
> the low level, i also argue that it makes sense to share as high up as 
> possible. 

I'm not saying any such thing, I'm saying that it might make sense to
observe active objects and auto-create these observation points. That
doesn't make them similar or make them share anything.

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-13 13:55                                               ` Peter Zijlstra
  (?)
@ 2011-05-13 14:57                                                 ` Ingo Molnar
  -1 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-13 14:57 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: James Morris, Will Drewry, linux-kernel, Steven Rostedt,
	Frederic Weisbecker, Eric Paris, kees.cook, agl, Serge E. Hallyn,
	Ingo Molnar, Andrew Morton, Tejun Heo, Michal Marek,
	Oleg Nesterov, Roland McGrath, Jiri Slaby, David Howells,
	Russell King, Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, Thomas Gleixner, H. Peter Anvin,
	x86, linux-arm-kernel, linux-mips, linuxppc-dev, linux-s390,
	linux-sh, sparclinux, Linus Torvalds


* Peter Zijlstra <peterz@infradead.org> wrote:

> Cut the microblaze list since its bouncy.
> 
> On Fri, 2011-05-13 at 15:18 +0200, Ingo Molnar wrote:
> > * Peter Zijlstra <peterz@infradead.org> wrote:
> > 
> > > On Fri, 2011-05-13 at 14:54 +0200, Ingo Molnar wrote:
> > > > I think the sanest semantics is to run all active callbacks as well.
> > > > 
> > > > For example if this is used for three stacked security policies - as if 3 LSM 
> > > > modules were stacked at once. We'd call all three, and we'd determine that at 
> > > > least one failed - and we'd return a failure. 
> > > 
> > > But that only works for boolean functions where you can return the
> > > multi-bit-or of the result. What if you need to return the specific
> > > error code.
> > 
> > Do you mean that one filter returns -EINVAL while the other -EACCES?
> > 
> > Seems like a non-problem to me, we'd return the first nonzero value.
> 
> Assuming the first is -EINVAL, what then is the value in computing the
> -EACCESS? Sounds like a massive waste of time to me.

No, because the common case is no rejection - this is a security mechanism. So 
in the normal case we would execute all 3 anyway, just to determine that all 
return 0.

Are you really worried about the abnormal case of one of them returning an 
error and us calculating all 3 return values?

> > > Also, there's bound to be other cases where people will want to employ
> > > this, look at all the various notifier chain muck we've got, it already
> > > deals with much of this -- simply because users need it.
> > 
> > Do you mean it would be easy to abuse it? What kind of abuse are you most 
> > worried about?
> 
> I'm not worried about abuse, I'm saying that going by the existing
> notifier pattern always visiting all entries on the callback list is
> undesired.

That is because many notifier chains are used in an 'event consuming' manner - 
they are responding to things like hardware events and are called in an 
interrupt-handler alike fashion most of the time.

> > > Then there's the whole indirection argument, if you don't need
> > > indirection, its often better to not use it, I myself much prefer code
> > > to look like:
> > > 
> > >    foo1(bar);
> > >    foo2(bar);
> > >    foo3(bar);
> > > 
> > > Than:
> > > 
> > >    foo_notifier(bar);
> > > 
> > > Simply because its much clearer who all are involved without me having
> > > to grep around to see who registers for foo_notifier and wth they do
> > > with it. It also makes it much harder to sneak in another user, whereas
> > > its nearly impossible to find new notifier users.
> > > 
> > > Its also much faster, no extra memory accesses, no indirect function
> > > calls, no other muck.
> > 
> > But i suspect this question has been settled, given the fact that even pure 
> > observer events need and already process a chain of events? Am i missing 
> > something about your argument?
> 
> I'm saying that there's reasons to not use notifiers passive or active.
> 
> Mostly the whole notifier/indirection muck comes up once you want
> modules to make use of the thing, because then you need dynamic
> management of the callback list.

But your argument assumes that we'd have a chain of functions to call, like 
regular notifiers.

While the natural model here would be to have a list of registered event 
structs for that point, with different filters but basically the same callback 
mechanism (a call into the filter engine in essence).

Also note that the common case would be no event registered - and we'd 
automatically optimize that case via the existing jump labels optimization.

> (Then again, I'm fairly glad we don't have explicit callbacks in kernel/cpu.c 
> for all the cpu-hotplug callbacks :-)
> 
> Anyway, I oppose for the existing events to gain an active role.

Why if 'being active' is optional and useful?

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-13 14:57                                                 ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-13 14:57 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-mips, linux-sh, Frederic Weisbecker, Heiko Carstens,
	Oleg Nesterov, David Howells, Paul Mackerras, Eric Paris,
	H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390, Russell King,
	x86, James Morris, Linus Torvalds, Ingo Molnar, linux-arm-kernel,
	kees.cook, Serge E. Hallyn, Steven Rostedt, Martin Schwidefsky,
	Thomas Gleixner, Roland McGrath, Michal Marek, Michal Simek,
	Will Drewry, linuxppc-dev, linux-kernel, Ralf Baechle,
	Paul Mundt, Tejun Heo, linux390, Andrew Morton, agl,
	David S. Miller


* Peter Zijlstra <peterz@infradead.org> wrote:

> Cut the microblaze list since its bouncy.
> 
> On Fri, 2011-05-13 at 15:18 +0200, Ingo Molnar wrote:
> > * Peter Zijlstra <peterz@infradead.org> wrote:
> > 
> > > On Fri, 2011-05-13 at 14:54 +0200, Ingo Molnar wrote:
> > > > I think the sanest semantics is to run all active callbacks as well.
> > > > 
> > > > For example if this is used for three stacked security policies - as if 3 LSM 
> > > > modules were stacked at once. We'd call all three, and we'd determine that at 
> > > > least one failed - and we'd return a failure. 
> > > 
> > > But that only works for boolean functions where you can return the
> > > multi-bit-or of the result. What if you need to return the specific
> > > error code.
> > 
> > Do you mean that one filter returns -EINVAL while the other -EACCES?
> > 
> > Seems like a non-problem to me, we'd return the first nonzero value.
> 
> Assuming the first is -EINVAL, what then is the value in computing the
> -EACCESS? Sounds like a massive waste of time to me.

No, because the common case is no rejection - this is a security mechanism. So 
in the normal case we would execute all 3 anyway, just to determine that all 
return 0.

Are you really worried about the abnormal case of one of them returning an 
error and us calculating all 3 return values?

> > > Also, there's bound to be other cases where people will want to employ
> > > this, look at all the various notifier chain muck we've got, it already
> > > deals with much of this -- simply because users need it.
> > 
> > Do you mean it would be easy to abuse it? What kind of abuse are you most 
> > worried about?
> 
> I'm not worried about abuse, I'm saying that going by the existing
> notifier pattern always visiting all entries on the callback list is
> undesired.

That is because many notifier chains are used in an 'event consuming' manner - 
they are responding to things like hardware events and are called in an 
interrupt-handler alike fashion most of the time.

> > > Then there's the whole indirection argument, if you don't need
> > > indirection, its often better to not use it, I myself much prefer code
> > > to look like:
> > > 
> > >    foo1(bar);
> > >    foo2(bar);
> > >    foo3(bar);
> > > 
> > > Than:
> > > 
> > >    foo_notifier(bar);
> > > 
> > > Simply because its much clearer who all are involved without me having
> > > to grep around to see who registers for foo_notifier and wth they do
> > > with it. It also makes it much harder to sneak in another user, whereas
> > > its nearly impossible to find new notifier users.
> > > 
> > > Its also much faster, no extra memory accesses, no indirect function
> > > calls, no other muck.
> > 
> > But i suspect this question has been settled, given the fact that even pure 
> > observer events need and already process a chain of events? Am i missing 
> > something about your argument?
> 
> I'm saying that there's reasons to not use notifiers passive or active.
> 
> Mostly the whole notifier/indirection muck comes up once you want
> modules to make use of the thing, because then you need dynamic
> management of the callback list.

But your argument assumes that we'd have a chain of functions to call, like 
regular notifiers.

While the natural model here would be to have a list of registered event 
structs for that point, with different filters but basically the same callback 
mechanism (a call into the filter engine in essence).

Also note that the common case would be no event registered - and we'd 
automatically optimize that case via the existing jump labels optimization.

> (Then again, I'm fairly glad we don't have explicit callbacks in kernel/cpu.c 
> for all the cpu-hotplug callbacks :-)
> 
> Anyway, I oppose for the existing events to gain an active role.

Why if 'being active' is optional and useful?

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-13 14:57                                                 ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-13 14:57 UTC (permalink / raw)
  To: linux-arm-kernel


* Peter Zijlstra <peterz@infradead.org> wrote:

> Cut the microblaze list since its bouncy.
> 
> On Fri, 2011-05-13 at 15:18 +0200, Ingo Molnar wrote:
> > * Peter Zijlstra <peterz@infradead.org> wrote:
> > 
> > > On Fri, 2011-05-13 at 14:54 +0200, Ingo Molnar wrote:
> > > > I think the sanest semantics is to run all active callbacks as well.
> > > > 
> > > > For example if this is used for three stacked security policies - as if 3 LSM 
> > > > modules were stacked at once. We'd call all three, and we'd determine that at 
> > > > least one failed - and we'd return a failure. 
> > > 
> > > But that only works for boolean functions where you can return the
> > > multi-bit-or of the result. What if you need to return the specific
> > > error code.
> > 
> > Do you mean that one filter returns -EINVAL while the other -EACCES?
> > 
> > Seems like a non-problem to me, we'd return the first nonzero value.
> 
> Assuming the first is -EINVAL, what then is the value in computing the
> -EACCESS? Sounds like a massive waste of time to me.

No, because the common case is no rejection - this is a security mechanism. So 
in the normal case we would execute all 3 anyway, just to determine that all 
return 0.

Are you really worried about the abnormal case of one of them returning an 
error and us calculating all 3 return values?

> > > Also, there's bound to be other cases where people will want to employ
> > > this, look at all the various notifier chain muck we've got, it already
> > > deals with much of this -- simply because users need it.
> > 
> > Do you mean it would be easy to abuse it? What kind of abuse are you most 
> > worried about?
> 
> I'm not worried about abuse, I'm saying that going by the existing
> notifier pattern always visiting all entries on the callback list is
> undesired.

That is because many notifier chains are used in an 'event consuming' manner - 
they are responding to things like hardware events and are called in an 
interrupt-handler alike fashion most of the time.

> > > Then there's the whole indirection argument, if you don't need
> > > indirection, its often better to not use it, I myself much prefer code
> > > to look like:
> > > 
> > >    foo1(bar);
> > >    foo2(bar);
> > >    foo3(bar);
> > > 
> > > Than:
> > > 
> > >    foo_notifier(bar);
> > > 
> > > Simply because its much clearer who all are involved without me having
> > > to grep around to see who registers for foo_notifier and wth they do
> > > with it. It also makes it much harder to sneak in another user, whereas
> > > its nearly impossible to find new notifier users.
> > > 
> > > Its also much faster, no extra memory accesses, no indirect function
> > > calls, no other muck.
> > 
> > But i suspect this question has been settled, given the fact that even pure 
> > observer events need and already process a chain of events? Am i missing 
> > something about your argument?
> 
> I'm saying that there's reasons to not use notifiers passive or active.
> 
> Mostly the whole notifier/indirection muck comes up once you want
> modules to make use of the thing, because then you need dynamic
> management of the callback list.

But your argument assumes that we'd have a chain of functions to call, like 
regular notifiers.

While the natural model here would be to have a list of registered event 
structs for that point, with different filters but basically the same callback 
mechanism (a call into the filter engine in essence).

Also note that the common case would be no event registered - and we'd 
automatically optimize that case via the existing jump labels optimization.

> (Then again, I'm fairly glad we don't have explicit callbacks in kernel/cpu.c 
> for all the cpu-hotplug callbacks :-)
> 
> Anyway, I oppose for the existing events to gain an active role.

Why if 'being active' is optional and useful?

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-13 13:55                                         ` Peter Zijlstra
  (?)
@ 2011-05-13 15:02                                           ` Ingo Molnar
  -1 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-13 15:02 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: James Morris, Will Drewry, linux-kernel, Steven Rostedt,
	Frederic Weisbecker, Eric Paris, kees.cook, agl, Serge E. Hallyn,
	Ingo Molnar, Andrew Morton, Tejun Heo, Michal Marek,
	Oleg Nesterov, Roland McGrath, Jiri Slaby, David Howells,
	Russell King, Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, Thomas Gleixner, H. Peter Anvin,
	x86, linux-arm-kernel, microblaze-uclinux, linux-mips,
	linuxppc-dev, linux-s390, linux-sh, sparclinux, Linus Torvalds


* Peter Zijlstra <peterz@infradead.org> wrote:

> On Fri, 2011-05-13 at 14:49 +0200, Ingo Molnar wrote:
> > 
> > So given that by your own admission it makes sense to share the facilities at 
> > the low level, i also argue that it makes sense to share as high up as 
> > possible. 
> 
> I'm not saying any such thing, I'm saying that it might make sense to
> observe active objects and auto-create these observation points. That
> doesn't make them similar or make them share anything.

Well, they would share the lowest level call site:

	result = check_event_vfs_getname(result);

You call it 'auto-generated call site', i call it a shared (single line) call 
site. The same thing as far as the lowest level goes.

Now (the way i understood it) you'd want to stop the sharing right after that. 
I argue that it should go all the way up.

Note: i fully agree that there should be events where filters can have no 
effect whatsoever. For example if this was written as:

	check_event_vfs_getname(result);

Then it would have no effect. This is decided by the subsystem developers, 
obviously. So whether an event is 'active' or 'passive' can be enforced at the 
subsystem level as well.

As far as the event facilities go, 'no effect observation' is a special-case of 
'active observation' - just like read-only files are a special case of 
read-write files.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-13 15:02                                           ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-13 15:02 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-mips, linux-sh, Frederic Weisbecker, Heiko Carstens,
	Oleg Nesterov, David Howells, Paul Mackerras, Eric Paris,
	H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390, Russell King,
	x86, James Morris, Linus Torvalds, Ingo Molnar, linux-arm-kernel,
	kees.cook, Serge E. Hallyn, microblaze-uclinux, Steven Rostedt,
	Martin Schwidefsky, Thomas Gleixner, Roland McGrath,
	Michal Marek, Michal Simek, Will Drewry, linuxppc-dev,
	linux-kernel, Ralf Baechle, Paul Mundt, Tejun Heo, linux390,
	Andrew Morton, agl, David S. Miller


* Peter Zijlstra <peterz@infradead.org> wrote:

> On Fri, 2011-05-13 at 14:49 +0200, Ingo Molnar wrote:
> > 
> > So given that by your own admission it makes sense to share the facilities at 
> > the low level, i also argue that it makes sense to share as high up as 
> > possible. 
> 
> I'm not saying any such thing, I'm saying that it might make sense to
> observe active objects and auto-create these observation points. That
> doesn't make them similar or make them share anything.

Well, they would share the lowest level call site:

	result = check_event_vfs_getname(result);

You call it 'auto-generated call site', i call it a shared (single line) call 
site. The same thing as far as the lowest level goes.

Now (the way i understood it) you'd want to stop the sharing right after that. 
I argue that it should go all the way up.

Note: i fully agree that there should be events where filters can have no 
effect whatsoever. For example if this was written as:

	check_event_vfs_getname(result);

Then it would have no effect. This is decided by the subsystem developers, 
obviously. So whether an event is 'active' or 'passive' can be enforced at the 
subsystem level as well.

As far as the event facilities go, 'no effect observation' is a special-case of 
'active observation' - just like read-only files are a special case of 
read-write files.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-13 15:02                                           ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-13 15:02 UTC (permalink / raw)
  To: linux-arm-kernel


* Peter Zijlstra <peterz@infradead.org> wrote:

> On Fri, 2011-05-13 at 14:49 +0200, Ingo Molnar wrote:
> > 
> > So given that by your own admission it makes sense to share the facilities at 
> > the low level, i also argue that it makes sense to share as high up as 
> > possible. 
> 
> I'm not saying any such thing, I'm saying that it might make sense to
> observe active objects and auto-create these observation points. That
> doesn't make them similar or make them share anything.

Well, they would share the lowest level call site:

	result = check_event_vfs_getname(result);

You call it 'auto-generated call site', i call it a shared (single line) call 
site. The same thing as far as the lowest level goes.

Now (the way i understood it) you'd want to stop the sharing right after that. 
I argue that it should go all the way up.

Note: i fully agree that there should be events where filters can have no 
effect whatsoever. For example if this was written as:

	check_event_vfs_getname(result);

Then it would have no effect. This is decided by the subsystem developers, 
obviously. So whether an event is 'active' or 'passive' can be enforced at the 
subsystem level as well.

As far as the event facilities go, 'no effect observation' is a special-case of 
'active observation' - just like read-only files are a special case of 
read-write files.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-13 12:10                               ` Ingo Molnar
  (?)
@ 2011-05-13 15:10                                 ` Eric Paris
  -1 siblings, 0 replies; 406+ messages in thread
From: Eric Paris @ 2011-05-13 15:10 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: James Morris, Will Drewry, linux-kernel, Steven Rostedt,
	Frederic Weisbecker, kees.cook, agl, Peter Zijlstra,
	Serge E. Hallyn, Ingo Molnar, Andrew Morton, Tejun Heo,
	Michal Marek, Oleg Nesterov, Jiri Slaby, David Howells,
	Russell King, Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, Thomas Gleixner, H. Peter Anvin,
	x86, Peter Zijlstra, linux-arm-kernel, linux-mips, linuxppc-dev,
	linux-s390, linux-sh, sparclinux, Linus Torvalds

[dropping microblaze and roland]

On Fri, 2011-05-13 at 14:10 +0200, Ingo Molnar wrote:
> * James Morris <jmorris@namei.org> wrote:

> It is a simple and sensible security feature, agreed? It allows most code to 
> run well and link to countless libraries - but no access to other files is 
> allowed.

It's simple enough and sounds reasonable, but you can read all the
discussion about AppArmor to see why many people don't really think
it's the best.  Still, I'll agree it's a lot better than nothing.

> But if i had a VFS event at the fs/namei.c::getname() level, i would have 
> access to a central point where the VFS string becomes stable to the kernel and 
> can be checked (and denied if necessary).
> 
> A sidenote, and not surprisingly, the audit subsystem already has an event 
> callback there:
> 
>         audit_getname(result);
> 
> Unfortunately this audit callback cannot be used for my purposes, because the 
> event is single-purpose for auditd and because it allows no feedback (no 
> deny/accept discretion for the security policy).
> 
> But if i had this simple event there:
> 
> 	err = event_vfs_getname(result);

Wow it sounds so easy.  Now let's keep extending your train of thought
until we can actually provide the security provided by SELinux.  What do
we end up with?  We end up with an event hook right next to every LSM
hook.  You know, the LSM hooks were placed where they are for a reason.
Because those were the locations inside the kernel where you actually
have information about the task doing an operation and the objects
(files, sockets, directories, other tasks, etc) they are doing an
operation on.

Honestly all you are talking about is remaking the LSM with 2 sets of
hooks instead of 1.  Why?  It seems much easier, if you want the
language of the filter engine, to just make a new LSM that uses
the filter engine for its policy language rather than the language
created by SELinux or SMACK or name your LSM implementation.

>  - unprivileged:  application-definable, allowing the embedding of security 
>                   policy in *apps* as well, not just the system
> 
>  - flexible:      can be added/removed runtime unprivileged, and cheaply so
> 
>  - transparent:   does not impact executing code that meets the policy
> 
>  - nestable:      it is inherited by child tasks and is fundamentally stackable,
>                   multiple policies will have the combined effect and they
>                   are transparent to each other. So if a child task within a
>                   sandbox adds *more* checks then those add to the already
>                   existing set of checks. We only narrow permissions, never
>                   extend them.
> 
>  - generic:       allowing observation and (safe) control of security relevant
>                   parameters not just at the system call boundary but at other
>                   relevant places of kernel execution as well: which 
>                   points/callbacks could also be used for other types of event 
>                   extraction such as perf. It could even be shared with audit ...

I'm not arguing that any of these things are bad things.  What you
describe is a new LSM that uses a discretionary access control model but
with the granularity and flexibility that has traditionally only existed
in the mandatory access control security modules previously implemented
in the kernel.

I won't argue that's a bad idea, there's no reason in my mind that a
process shouldn't be allowed to control its own access decisions in a
more flexible way than rwx bits.  Then again, I certainly don't see a
reason that this syscall hardening patch should be held up while a whole
new concept in computer security is contemplated...

-Eric

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-13 15:10                                 ` Eric Paris
  0 siblings, 0 replies; 406+ messages in thread
From: Eric Paris @ 2011-05-13 15:10 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: linux-mips, linux-sh, Peter Zijlstra, Frederic Weisbecker,
	Heiko Carstens, Oleg Nesterov, David Howells, Paul Mackerras,
	H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390, Russell King,
	x86, James Morris, Linus Torvalds, Ingo Molnar, kees.cook,
	Serge E. Hallyn, Peter Zijlstra, Steven Rostedt, Tejun Heo,
	Thomas Gleixner, linux-arm-kernel, Michal Marek, Michal Simek,
	Will Drewry, linuxppc-dev, linux-kernel, Ralf Baechle,
	Paul Mundt, Martin Schwidefsky, linux390, Andrew Morton, agl,
	David S. Miller

[dropping microblaze and roland]

On Fri, 2011-05-13 at 14:10 +0200, Ingo Molnar wrote:
> * James Morris <jmorris@namei.org> wrote:

> It is a simple and sensible security feature, agreed? It allows most code to 
> run well and link to countless libraries - but no access to other files is 
> allowed.

It's simple enough and sounds reasonable, but you can read all the
discussion about AppArmor to see why many people don't really think
it's the best.  Still, I'll agree it's a lot better than nothing.

> But if i had a VFS event at the fs/namei.c::getname() level, i would have 
> access to a central point where the VFS string becomes stable to the kernel and 
> can be checked (and denied if necessary).
> 
> A sidenote, and not surprisingly, the audit subsystem already has an event 
> callback there:
> 
>         audit_getname(result);
> 
> Unfortunately this audit callback cannot be used for my purposes, because the 
> event is single-purpose for auditd and because it allows no feedback (no 
> deny/accept discretion for the security policy).
> 
> But if i had this simple event there:
> 
> 	err = event_vfs_getname(result);

Wow it sounds so easy.  Now let's keep extending your train of thought
until we can actually provide the security provided by SELinux.  What do
we end up with?  We end up with an event hook right next to every LSM
hook.  You know, the LSM hooks were placed where they are for a reason.
Because those were the locations inside the kernel where you actually
have information about the task doing an operation and the objects
(files, sockets, directories, other tasks, etc) they are doing an
operation on.

Honestly all you are talking about is remaking the LSM with 2 sets of
hooks instead of 1.  Why?  It seems much easier, if you want the
language of the filter engine, to just make a new LSM that uses
the filter engine for its policy language rather than the language
created by SELinux or SMACK or name your LSM implementation.

>  - unprivileged:  application-definable, allowing the embedding of security 
>                   policy in *apps* as well, not just the system
> 
>  - flexible:      can be added/removed runtime unprivileged, and cheaply so
> 
>  - transparent:   does not impact executing code that meets the policy
> 
>  - nestable:      it is inherited by child tasks and is fundamentally stackable,
>                   multiple policies will have the combined effect and they
>                   are transparent to each other. So if a child task within a
>                   sandbox adds *more* checks then those add to the already
>                   existing set of checks. We only narrow permissions, never
>                   extend them.
> 
>  - generic:       allowing observation and (safe) control of security relevant
>                   parameters not just at the system call boundary but at other
>                   relevant places of kernel execution as well: which 
>                   points/callbacks could also be used for other types of event 
>                   extraction such as perf. It could even be shared with audit ...

I'm not arguing that any of these things are bad things.  What you
describe is a new LSM that uses a discretionary access control model but
with the granularity and flexibility that has traditionally only existed
in the mandatory access control security modules previously implemented
in the kernel.

I won't argue that's a bad idea, there's no reason in my mind that a
process shouldn't be allowed to control its own access decisions in a
more flexible way than rwx bits.  Then again, I certainly don't see a
reason that this syscall hardening patch should be held up while a whole
new concept in computer security is contemplated...

-Eric

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-13 15:10                                 ` Eric Paris
  0 siblings, 0 replies; 406+ messages in thread
From: Eric Paris @ 2011-05-13 15:10 UTC (permalink / raw)
  To: linux-arm-kernel

[dropping microblaze and roland]

On Fri, 2011-05-13 at 14:10 +0200, Ingo Molnar wrote:
> * James Morris <jmorris@namei.org> wrote:

> It is a simple and sensible security feature, agreed? It allows most code to 
> run well and link to countless libraries - but no access to other files is 
> allowed.

It's simple enough and sounds reasonable, but you can read all the
discussion about AppArmor to see why many people don't really think
it's the best.  Still, I'll agree it's a lot better than nothing.

> But if i had a VFS event at the fs/namei.c::getname() level, i would have 
> access to a central point where the VFS string becomes stable to the kernel and 
> can be checked (and denied if necessary).
> 
> A sidenote, and not surprisingly, the audit subsystem already has an event 
> callback there:
> 
>         audit_getname(result);
> 
> Unfortunately this audit callback cannot be used for my purposes, because the 
> event is single-purpose for auditd and because it allows no feedback (no 
> deny/accept discretion for the security policy).
> 
> But if i had this simple event there:
> 
> 	err = event_vfs_getname(result);

Wow it sounds so easy.  Now let's keep extending your train of thought
until we can actually provide the security provided by SELinux.  What do
we end up with?  We end up with an event hook right next to every LSM
hook.  You know, the LSM hooks were placed where they are for a reason.
Because those were the locations inside the kernel where you actually
have information about the task doing an operation and the objects
(files, sockets, directories, other tasks, etc) they are doing an
operation on.

Honestly all you are talking about is remaking the LSM with 2 sets of
hooks instead of 1.  Why?  It seems much easier, if you want the
language of the filter engine, to just make a new LSM that uses
the filter engine for its policy language rather than the language
created by SELinux or SMACK or name your LSM implementation.

>  - unprivileged:  application-definable, allowing the embedding of security 
>                   policy in *apps* as well, not just the system
> 
>  - flexible:      can be added/removed runtime unprivileged, and cheaply so
> 
>  - transparent:   does not impact executing code that meets the policy
> 
>  - nestable:      it is inherited by child tasks and is fundamentally stackable,
>                   multiple policies will have the combined effect and they
>                   are transparent to each other. So if a child task within a
>                   sandbox adds *more* checks then those add to the already
>                   existing set of checks. We only narrow permissions, never
>                   extend them.
> 
>  - generic:       allowing observation and (safe) control of security relevant
>                   parameters not just at the system call boundary but at other
>                   relevant places of kernel execution as well: which 
>                   points/callbacks could also be used for other types of event 
>                   extraction such as perf. It could even be shared with audit ...

I'm not arguing that any of these things are bad things.  What you
describe is a new LSM that uses a discretionary access control model but
with the granularity and flexibility that has traditionally only existed
in the mandatory access control security modules previously implemented
in the kernel.

I won't argue that's a bad idea, there's no reason in my mind that a
process shouldn't be allowed to control its own access decisions in a
more flexible way than rwx bits.  Then again, I certainly don't see a
reason that this syscall hardening patch should be held up while a whole
new concept in computer security is contemplated...

-Eric

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-13 13:18                                             ` Ingo Molnar
  (?)
@ 2011-05-13 15:17                                               ` Eric Paris
  -1 siblings, 0 replies; 406+ messages in thread
From: Eric Paris @ 2011-05-13 15:17 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Peter Zijlstra, James Morris, Will Drewry, linux-kernel,
	Steven Rostedt, Frederic Weisbecker, kees.cook, agl,
	Serge E. Hallyn, Ingo Molnar, Andrew Morton, Tejun Heo,
	Michal Marek, Oleg Nesterov, Jiri Slaby, David Howells,
	Russell King, Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, Thomas Gleixner, H. Peter Anvin,
	x86, linux-arm-kernel, linux-mips, linuxppc-dev, linux-s390,
	linux-sh, sparclinux, Linus Torvalds

[dropping microblaze and roland]

On Fri, 2011-05-13 at 15:18 +0200, Ingo Molnar wrote:
> * Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > On Fri, 2011-05-13 at 14:54 +0200, Ingo Molnar wrote:
> > > I think the sanest semantics is to run all active callbacks as well.
> > > 
> > > For example if this is used for three stacked security policies - as if 3 LSM 
> > > modules were stacked at once. We'd call all three, and we'd determine that at 
> > > least one failed - and we'd return a failure. 
> > 
> > But that only works for boolean functions where you can return the
> > multi-bit-or of the result. What if you need to return the specific
> > error code.
> 
> Do you mean that one filter returns -EINVAL while the other -EACCES?
> 
> Seems like a non-problem to me, we'd return the first nonzero value.

Sounds so easy!  Why haven't LSMs stacked already?  Because what happens
if one of these hooks did something stateful?  Let's say on open, hook #1
returns EPERM.  hook #2 allocates memory.  The open is going to fail and
hook #2 is never going to get the close() which should have freed the
allocation.  If you can be completely stateless it's easier, but there's
a reason that stacking security modules is hard.  Serge has tried in the
past and both dhowells and casey schaufler are working on it right now.
Stacking is never as easy as it sounds   :)

-Eric

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-13 15:17                                               ` Eric Paris
  0 siblings, 0 replies; 406+ messages in thread
From: Eric Paris @ 2011-05-13 15:17 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: linux-mips, linux-sh, Peter Zijlstra, Frederic Weisbecker,
	Heiko Carstens, Oleg Nesterov, David Howells, Paul Mackerras,
	H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390, Russell King,
	x86, James Morris, Linus Torvalds, Ingo Molnar, kees.cook,
	Serge E. Hallyn, Steven Rostedt, Tejun Heo, Thomas Gleixner,
	linux-arm-kernel, Michal Marek, Michal Simek, Will Drewry,
	linuxppc-dev, linux-kernel, Ralf Baechle, Paul Mundt,
	Martin Schwidefsky, linux390, Andrew Morton, agl,
	David S. Miller

[dropping microblaze and roland]

On Fri, 2011-05-13 at 15:18 +0200, Ingo Molnar wrote:
> * Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > On Fri, 2011-05-13 at 14:54 +0200, Ingo Molnar wrote:
> > > I think the sanest semantics is to run all active callbacks as well.
> > > 
> > > For example if this is used for three stacked security policies - as if 3 LSM 
> > > modules were stacked at once. We'd call all three, and we'd determine that at 
> > > least one failed - and we'd return a failure. 
> > 
> > But that only works for boolean functions where you can return the
> > multi-bit-or of the result. What if you need to return the specific
> > error code.
> 
> Do you mean that one filter returns -EINVAL while the other -EACCES?
> 
> Seems like a non-problem to me, we'd return the first nonzero value.

Sounds so easy!  Why haven't LSMs stacked already?  Because what happens
if one of these hooks did something stateful?  Let's say on open, hook #1
returns EPERM.  hook #2 allocates memory.  The open is going to fail and
hook #2 is never going to get the close() which should have freed the
allocation.  If you can be completely stateless it's easier, but there's
a reason that stacking security modules is hard.  Serge has tried in the
past and both dhowells and casey schaufler are working on it right now.
Stacking is never as easy as it sounds   :)

-Eric

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-13 15:17                                               ` Eric Paris
  0 siblings, 0 replies; 406+ messages in thread
From: Eric Paris @ 2011-05-13 15:17 UTC (permalink / raw)
  To: linux-arm-kernel

[dropping microblaze and roland]

On Fri, 2011-05-13 at 15:18 +0200, Ingo Molnar wrote:
> * Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > On Fri, 2011-05-13 at 14:54 +0200, Ingo Molnar wrote:
> > > I think the sanest semantics is to run all active callbacks as well.
> > > 
> > > For example if this is used for three stacked security policies - as if 3 LSM 
> > > modules were stacked at once. We'd call all three, and we'd determine that at 
> > > least one failed - and we'd return a failure. 
> > 
> > But that only works for boolean functions where you can return the
> > multi-bit-or of the result. What if you need to return the specific
> > error code.
> 
> Do you mean that one filter returns -EINVAL while the other -EACCES?
> 
> Seems like a non-problem to me, we'd return the first nonzero value.

Sounds so easy!  Why haven't LSMs stacked already?  Because what happens
if one of these hooks did something stateful?  Let's say on open, hook #1
returns EPERM.  hook #2 allocates memory.  The open is going to fail and
hook #2 is never going to get the close() which should have freed the
allocation.  If you can be completely stateless it's easier, but there's
a reason that stacking security modules is hard.  Serge has tried in the
past and both dhowells and casey schaufler are working on it right now.
Stacking is never as easy as it sounds   :)

-Eric

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-13 15:10                                 ` Eric Paris
  (?)
@ 2011-05-13 15:23                                   ` Peter Zijlstra
  -1 siblings, 0 replies; 406+ messages in thread
From: Peter Zijlstra @ 2011-05-13 15:23 UTC (permalink / raw)
  To: Eric Paris
  Cc: Ingo Molnar, James Morris, Will Drewry, linux-kernel,
	Steven Rostedt, Frederic Weisbecker, kees.cook, agl,
	Serge E. Hallyn, Ingo Molnar, Andrew Morton, Tejun Heo,
	Michal Marek, Oleg Nesterov, Jiri Slaby, David Howells,
	Russell King, Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, Thomas Gleixner, H. Peter Anvin,
	x86, linux-arm-kernel, linux-mips, linuxppc-dev, linux-s390,
	linux-sh, sparclinux, Linus Torvalds

On Fri, 2011-05-13 at 11:10 -0400, Eric Paris wrote:
> Then again, I certainly don't see a
> reason that this syscall hardening patch should be held up while a whole
> new concept in computer security is contemplated... 

Which makes me wonder why this syscall hardening stuff is done outside
of LSM? Why isn't it part of the LSM so that say SELinux can have a
syscall bitmask per security context?

Making it part of the LSM also avoids having to add this prctl().

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-13 15:23                                   ` Peter Zijlstra
  0 siblings, 0 replies; 406+ messages in thread
From: Peter Zijlstra @ 2011-05-13 15:23 UTC (permalink / raw)
  To: Eric Paris
  Cc: linux-mips, linux-sh, Frederic Weisbecker, Heiko Carstens,
	Oleg Nesterov, David Howells, Paul Mackerras, H. Peter Anvin,
	sparclinux, Jiri Slaby, linux-s390, Russell King, x86,
	James Morris, Linus Torvalds, Ingo Molnar, Ingo Molnar,
	Serge E. Hallyn, Steven Rostedt, Tejun Heo, Thomas Gleixner,
	kees.cook, linux-arm-kernel, Michal Marek, Michal Simek,
	Will Drewry, linuxppc-dev, linux-kernel, Ralf Baechle,
	Paul Mundt, Martin Schwidefsky, linux390, Andrew Morton, agl,
	David S. Miller

On Fri, 2011-05-13 at 11:10 -0400, Eric Paris wrote:
> Then again, I certainly don't see a
> reason that this syscall hardening patch should be held up while a whole
> new concept in computer security is contemplated... 

Which makes me wonder why this syscall hardening stuff is done outside
of LSM? Why isn't it part of the LSM so that say SELinux can have a
syscall bitmask per security context?

Making it part of the LSM also avoids having to add this prctl().

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-13 15:23                                   ` Peter Zijlstra
  0 siblings, 0 replies; 406+ messages in thread
From: Peter Zijlstra @ 2011-05-13 15:23 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, 2011-05-13 at 11:10 -0400, Eric Paris wrote:
> Then again, I certainly don't see a
> reason that this syscall hardening patch should be held up while a whole
> new concept in computer security is contemplated... 

Which makes me wonder why this syscall hardening stuff is done outside
of LSM? Why isn't it part of the LSM so that say SELinux can have a
syscall bitmask per security context?

Making it part of the LSM also avoids having to add this prctl().

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-13 14:57                                                 ` Ingo Molnar
  (?)
@ 2011-05-13 15:27                                                   ` Peter Zijlstra
  -1 siblings, 0 replies; 406+ messages in thread
From: Peter Zijlstra @ 2011-05-13 15:27 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: James Morris, Will Drewry, linux-kernel, Steven Rostedt,
	Frederic Weisbecker, Eric Paris, kees.cook, agl, Serge E. Hallyn,
	Ingo Molnar, Andrew Morton, Tejun Heo, Michal Marek,
	Oleg Nesterov, Jiri Slaby, David Howells, Russell King,
	Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, Thomas Gleixner, H. Peter Anvin,
	x86, linux-arm-kernel, linux-mips, linuxppc-dev, linux-s390,
	linux-sh, sparclinux, Linus Torvalds

On Fri, 2011-05-13 at 16:57 +0200, Ingo Molnar wrote:
> this is a security mechanism

Who says? And why would you want to unify two separate concepts only to
then limit it to security? That just doesn't make sense.

Either you provide a full on replacement for notifier chain like things
or you don't, only extending trace events in this fashion for security
is like way weird.

Plus see the arguments Eric made about stacking stuff, not only security
schemes will have those problems.

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-13 15:27                                                   ` Peter Zijlstra
  0 siblings, 0 replies; 406+ messages in thread
From: Peter Zijlstra @ 2011-05-13 15:27 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: linux-mips, linux-sh, Frederic Weisbecker, Heiko Carstens,
	Oleg Nesterov, David Howells, Paul Mackerras, Eric Paris,
	H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390, Russell King,
	x86, James Morris, Linus Torvalds, Ingo Molnar, kees.cook,
	Serge E. Hallyn, Steven Rostedt, Tejun Heo, Thomas Gleixner,
	linux-arm-kernel, Michal Marek, Michal Simek, Will Drewry,
	linuxppc-dev, linux-kernel, Ralf Baechle, Paul Mundt,
	Martin Schwidefsky, linux390, Andrew Morton, agl,
	David S. Miller

On Fri, 2011-05-13 at 16:57 +0200, Ingo Molnar wrote:
> this is a security mechanism

Who says? And why would you want to unify two separate concepts only to
then limit it to security? That just doesn't make sense.

Either you provide a full on replacement for notifier chain like things
or you don't, only extending trace events in this fashion for security
is like way weird.

Plus see the arguments Eric made about stacking stuff, not only security
schemes will have those problems.

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-13 15:27                                                   ` Peter Zijlstra
  0 siblings, 0 replies; 406+ messages in thread
From: Peter Zijlstra @ 2011-05-13 15:27 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, 2011-05-13 at 16:57 +0200, Ingo Molnar wrote:
> this is a security mechanism

Who says? And why would you want to unify two separate concepts only to
then limit it to security? That just doesn't make sense.

Either you provide a full on replacement for notifier chain like things
or you don't, only extending trace events in this fashion for security
is like way weird.

Plus see the arguments Eric made about stacking stuff, not only security
schemes will have those problems.

^ permalink raw reply	[flat|nested] 406+ messages in thread

* RE: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-13 15:29                                                 ` David Laight
  0 siblings, 0 replies; 406+ messages in thread
From: David Laight @ 2011-05-13 15:29 UTC (permalink / raw)
  To: Eric Paris, Ingo Molnar
  Cc: linux-mips, linux-sh, Peter Zijlstra, Frederic Weisbecker,
	Heiko Carstens, Oleg Nesterov, David Howells, Paul Mackerras,
	H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390, Russell King,
	x86, James Morris, Linus Torvalds, Ingo Molnar, kees.cook,
	Serge E. Hallyn, Steven Rostedt, Tejun Heo, Thomas Gleixner,
	linux-arm-kernel, Michal Marek, Michal Simek, Will Drewry,
	linuxppc-dev, linux-kernel, Ralf Baechle, Paul Mundt,
	Martin Schwidefsky, linux390, Andrew Morton, agl,
	David S. Miller

> ... If you can be completely stateless its easier, but there's
> a reason that stacking security modules is hard.  Serge has tried in the
> past and both dhowells and casey schaufler are working on it right now.
> Stacking is never as easy as it sounds   :)

For a bad example of trying to allow alternate security models
look at NetBSD's kauth code :-)

NetBSD also had issues where some 'system call trace' code
was being used to (try to) apply security - unfortunately
it worked by looking at the user-space buffers on system
call entry - and a multithreaded program can easily arrange
to update them after the initial check!
For trace/event type activities this wouldn't really matter,
for security policy it does.
(I've not looked directly at these event points in linux)

	David

^ permalink raw reply	[flat|nested] 406+ messages in thread

* RE: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-13 15:29                                                 ` David Laight
  0 siblings, 0 replies; 406+ messages in thread
From: David Laight @ 2011-05-13 15:29 UTC (permalink / raw)
  To: Eric Paris, Ingo Molnar
  Cc: linux-mips, linux-sh, Peter Zijlstra, Frederic Weisbecker,
	Heiko Carstens, Oleg Nesterov, David Howells, Paul Mackerras,
	H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390, Russell King,
	x86, James Morris, Linus Torvalds, Ingo Molnar, kees.cook,
	Serge E. Hallyn, Steven Rostedt, Tejun Heo, Thomas Gleixner,
	linux-arm-kernel, Michal Marek, Michal Simek, Will Drewry,
	linuxppc-dev, linux-kernel, Ralf Baechle, Paul Mundt,
	Martin Schwidefsky, linux390, Andrew Morton, agl,
	David S. Miller

> ... If you can be completely stateless its easier, but there's
> a reason that stacking security modules is hard.  Serge has tried in the
> past and both dhowells and casey schaufler are working on it right now.
> Stacking is never as easy as it sounds   :)

For a bad example of trying to allow alternate security models
look at NetBSD's kauth code :-)

NetBSD also had issues where some 'system call trace' code
was being used to (try to) apply security - unfortunately
it worked by looking at the user-space buffers on system
call entry - and a multithreaded program can easily arrange
to update them after the initial check!
For trace/event type activities this wouldn't really matter,
for security policy it does.
(I've not looked directly at these event points in linux)

	David

^ permalink raw reply	[flat|nested] 406+ messages in thread

* RE: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-13 15:29                                                 ` David Laight
  0 siblings, 0 replies; 406+ messages in thread
From: David Laight @ 2011-05-13 15:29 UTC (permalink / raw)
  To: Eric Paris, Ingo Molnar
  Cc: linux-mips, linux-sh, Peter Zijlstra, Frederic Weisbecker,
	Heiko Carstens, linux-kernel, David Howells, Paul Mackerras,
	H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390, Russell King,
	x86, James Morris, Ingo Molnar, kees.cook, Serge E. Hallyn,
	Steven Rostedt, Martin Schwidefsky, Thomas Gleixner, agl,
	linux-arm-kernel, Michal Marek, Michal Simek, Will Drewry,
	linuxppc-dev, Oleg Nesterov, Ralf Baechle, Paul Mundt, Tejun Heo,
	linux390, Andrew Morton, Linus Torvalds, David S. Miller

> ... If you can be completely stateless its easier, but there's
> a reason that stacking security modules is hard.  Serge has tried in the
> past and both dhowells and casey schaufler are working on it right now.
> Stacking is never as easy as it sounds   :)

For a bad example of trying to allow alternate security models
look at NetBSD's kauth code :-)

NetBSD also had issues where some 'system call trace' code
was being used to (try to) apply security - unfortunately
it worked by looking at the user-space buffers on system
call entry - and a multithreaded program can easily arrange
to update them after the initial check!
For trace/event type activities this wouldn't really matter,
for security policy it does.
(I've not looked directly at these event points in linux)

	David

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-13 15:29                                                 ` David Laight
  0 siblings, 0 replies; 406+ messages in thread
From: David Laight @ 2011-05-13 15:29 UTC (permalink / raw)
  To: linux-arm-kernel

> ... If you can be completely stateless its easier, but there's
> a reason that stacking security modules is hard.  Serge has tried in the
> past and both dhowells and casey schaufler are working on it right now.
> Stacking is never as easy as it sounds   :)

For a bad example of trying to allow alternate security models
look at NetBSD's kauth code :-)

NetBSD also had issues where some 'system call trace' code
was being used to (try to) apply security - unfortunately
it worked by looking at the user-space buffers on system
call entry - and a multithreaded program can easily arrange
to update them after the initial check!
For trace/event type activities this wouldn't really matter,
for security policy it does.
(I've not looked directly at these event points in linux)

	David

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-13 15:23                                   ` Peter Zijlstra
  (?)
@ 2011-05-13 15:55                                     ` Eric Paris
  -1 siblings, 0 replies; 406+ messages in thread
From: Eric Paris @ 2011-05-13 15:55 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, James Morris, Will Drewry, linux-kernel,
	Steven Rostedt, Frederic Weisbecker, kees.cook, agl,
	Serge E. Hallyn, Ingo Molnar, Andrew Morton, Tejun Heo,
	Michal Marek, Oleg Nesterov, Jiri Slaby, David Howells,
	Russell King, Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, Thomas Gleixner, H. Peter Anvin,
	x86, linux-arm-kernel, linux-mips, linuxppc-dev, linux-s390,
	linux-sh, sparclinux, Linus Torvalds

On Fri, 2011-05-13 at 17:23 +0200, Peter Zijlstra wrote:
> On Fri, 2011-05-13 at 11:10 -0400, Eric Paris wrote:
> > Then again, I certainly don't see a
> > reason that this syscall hardening patch should be held up while a whole
> > new concept in computer security is contemplated... 
> 
> Which makes me wonder why this syscall hardening stuff is done outside
> of LSM? Why isn't is part of the LSM so that say SELinux can have a
> syscall bitmask per security context?

I could do that, but I like Will's approach better.  From the PoV of
meeting security goals of information flow, data confidentiality,
integrity, least priv, etc limiting on the syscall boundary doesn't make
a lot of sense.  You just don't know enough there to enforce these
things.  These are the types of goals that SELinux and other LSMs have
previously tried to enforce.  From the PoV of making the kernel more
resistant to attacks and making a process more resistant to misbehavior
I think that the syscall boundary is appropriate.  Although I could do
it in SELinux it don't really want to do it there.

In case people are interested or confused let me give my definition of
two words I've used a bit in these conversations: discretionary and
mandatory.  Any time I talk about a 'discretionary' security decision it
is a security decisions that a process imposed upon itself.  Aka the
choice to use seccomp is discretionary.  The choice to mark our own file
u-wx is discretionary.  This isn't the best definition but it's one that
works well in this discussion.  Mandatory security is one enforce by a
global policy.  It's what selinux is all about.  SELinux doesn't give
hoot what a process wants to do, it enforces a global policy from the
top down.  You take over a process, well, too bad, you still have no
choice but to follow the mandatory policy.

The LSM does NOT enforce a mandatory access control model, it's just how
it's been used in the past.  Ingo appears to me (please correct me if
I'm wrong) to really be a fan of exposing the flexibility of the LSM to
a discretionary access control model.  That doesn't seem like a bad
idea.  And maybe using the filter engine to define the language to do
this isn't a bad idea either.  But I think that's a 'down the road'
project, not something to hold up a better seccomp.

> Making it part of the LSM also avoids having to add this prctl().

Well, it would mean exposing some new language construct to every LSM
(instead of a single prctl construct) and it would mean anyone wanting
to use the interface would have to rely on the LSM implementing those
hooks the way they need it.  Honestly chrome can already get all of the
benefits of this patch (given a perfectly coded kernel) and a whole lot
more using SELinux, but (surprise surprise) not everyone uses SELinux.
I think it's a good idea to expose a simple interface which will be
widely enough adopted that many userspace applications can rely on it
for hardening.

The existence of the LSM and the fact that there exists multiple
security modules that may or may not be enabled really leads application
developers to be unable to rely on LSM for security.  If linux had a
single security model which everyone could rely on we wouldn't really
have as big of an issue but that's not possible.  So I'm advocating for
this series which will provide a single useful change which applications
can rely upon across distros and platforms to enhance the properties and
abilities of the linux kernel.

-Eric

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-13 15:55                                     ` Eric Paris
  0 siblings, 0 replies; 406+ messages in thread
From: Eric Paris @ 2011-05-13 15:55 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-mips, linux-sh, Frederic Weisbecker, Heiko Carstens,
	Oleg Nesterov, David Howells, Paul Mackerras, H. Peter Anvin,
	sparclinux, Jiri Slaby, linux-s390, Russell King, x86,
	James Morris, Linus Torvalds, Ingo Molnar, Ingo Molnar,
	Serge E. Hallyn, Steven Rostedt, Tejun Heo, Thomas Gleixner,
	kees.cook, linux-arm-kernel, Michal Marek, Michal Simek,
	Will Drewry, linuxppc-dev, linux-kernel, Ralf Baechle,
	Paul Mundt, Martin Schwidefsky, linux390, Andrew Morton, agl,
	David S. Miller

On Fri, 2011-05-13 at 17:23 +0200, Peter Zijlstra wrote:
> On Fri, 2011-05-13 at 11:10 -0400, Eric Paris wrote:
> > Then again, I certainly don't see a
> > reason that this syscall hardening patch should be held up while a whole
> > new concept in computer security is contemplated... 
> 
> Which makes me wonder why this syscall hardening stuff is done outside
> of LSM? Why isn't is part of the LSM so that say SELinux can have a
> syscall bitmask per security context?

I could do that, but I like Will's approach better.  From the PoV of
meeting security goals of information flow, data confidentiality,
integrity, least priv, etc limiting on the syscall boundary doesn't make
a lot of sense.  You just don't know enough there to enforce these
things.  These are the types of goals that SELinux and other LSMs have
previously tried to enforce.  From the PoV of making the kernel more
resistant to attacks and making a process more resistant to misbehavior
I think that the syscall boundary is appropriate.  Although I could do
it in SELinux it don't really want to do it there.

In case people are interested or confused let me give my definition of
two words I've used a bit in these conversations: discretionary and
mandatory.  Any time I talk about a 'discretionary' security decision it
is a security decisions that a process imposed upon itself.  Aka the
choice to use seccomp is discretionary.  The choice to mark our own file
u-wx is discretionary.  This isn't the best definition but it's one that
works well in this discussion.  Mandatory security is one enforce by a
global policy.  It's what selinux is all about.  SELinux doesn't give
hoot what a process wants to do, it enforces a global policy from the
top down.  You take over a process, well, too bad, you still have no
choice but to follow the mandatory policy.

The LSM does NOT enforce a mandatory access control model, it's just how
it's been used in the past.  Ingo appears to me (please correct me if
I'm wrong) to really be a fan of exposing the flexibility of the LSM to
a discretionary access control model.  That doesn't seem like a bad
idea.  And maybe using the filter engine to define the language to do
this isn't a bad idea either.  But I think that's a 'down the road'
project, not something to hold up a better seccomp.

> Making it part of the LSM also avoids having to add this prctl().

Well, it would mean exposing some new language construct to every LSM
(instead of a single prctl construct) and it would mean anyone wanting
to use the interface would have to rely on the LSM implementing those
hooks the way they need it.  Honestly chrome can already get all of the
benefits of this patch (given a perfectly coded kernel) and a whole lot
more using SELinux, but (surprise surprise) not everyone uses SELinux.
I think it's a good idea to expose a simple interface which will be
widely enough adopted that many userspace applications can rely on it
for hardening.

The existence of the LSM and the fact that there exists multiple
security modules that may or may not be enabled really leads application
developers to be unable to rely on LSM for security.  If linux had a
single security model which everyone could rely on we wouldn't really
have as big of an issue but that's not possible.  So I'm advocating for
this series which will provide a single useful change which applications
can rely upon across distros and platforms to enhance the properties and
abilities of the linux kernel.

-Eric

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-13 15:55                                     ` Eric Paris
  0 siblings, 0 replies; 406+ messages in thread
From: Eric Paris @ 2011-05-13 15:55 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, 2011-05-13 at 17:23 +0200, Peter Zijlstra wrote:
> On Fri, 2011-05-13 at 11:10 -0400, Eric Paris wrote:
> > Then again, I certainly don't see a
> > reason that this syscall hardening patch should be held up while a whole
> > new concept in computer security is contemplated... 
> 
> Which makes me wonder why this syscall hardening stuff is done outside
> of LSM? Why isn't is part of the LSM so that say SELinux can have a
> syscall bitmask per security context?

I could do that, but I like Will's approach better.  From the PoV of
meeting security goals of information flow, data confidentiality,
integrity, least priv, etc limiting on the syscall boundary doesn't make
a lot of sense.  You just don't know enough there to enforce these
things.  These are the types of goals that SELinux and other LSMs have
previously tried to enforce.  From the PoV of making the kernel more
resistant to attacks and making a process more resistant to misbehavior
I think that the syscall boundary is appropriate.  Although I could do
it in SELinux it don't really want to do it there.

In case people are interested or confused let me give my definition of
two words I've used a bit in these conversations: discretionary and
mandatory.  Any time I talk about a 'discretionary' security decision it
is a security decisions that a process imposed upon itself.  Aka the
choice to use seccomp is discretionary.  The choice to mark our own file
u-wx is discretionary.  This isn't the best definition but it's one that
works well in this discussion.  Mandatory security is one enforce by a
global policy.  It's what selinux is all about.  SELinux doesn't give
hoot what a process wants to do, it enforces a global policy from the
top down.  You take over a process, well, too bad, you still have no
choice but to follow the mandatory policy.

The LSM does NOT enforce a mandatory access control model, it's just how
it's been used in the past.  Ingo appears to me (please correct me if
I'm wrong) to really be a fan of exposing the flexibility of the LSM to
a discretionary access control model.  That doesn't seem like a bad
idea.  And maybe using the filter engine to define the language to do
this isn't a bad idea either.  But I think that's a 'down the road'
project, not something to hold up a better seccomp.

> Making it part of the LSM also avoids having to add this prctl().

Well, it would mean exposing some new language construct to every LSM
(instead of a single prctl construct) and it would mean anyone wanting
to use the interface would have to rely on the LSM implementing those
hooks the way they need it.  Honestly chrome can already get all of the
benefits of this patch (given a perfectly coded kernel) and a whole lot
more using SELinux, but (surprise surprise) not everyone uses SELinux.
I think it's a good idea to expose a simple interface which will be
widely enough adopted that many userspace applications can rely on it
for hardening.

The existence of the LSM and the fact that there exists multiple
security modules that may or may not be enabled really leads application
developers to be unable to rely on LSM for security.  If linux had a
single security model which everyone could rely on we wouldn't really
have as big of an issue but that's not possible.  So I'm advocating for
this series which will provide a single useful change which applications
can rely upon across distros and platforms to enhance the properties and
abilities of the linux kernel.

-Eric

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-13 15:55                                     ` Eric Paris
  (?)
@ 2011-05-13 16:29                                       ` Will Drewry
  -1 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-05-13 16:29 UTC (permalink / raw)
  To: Eric Paris
  Cc: Peter Zijlstra, Ingo Molnar, James Morris, linux-kernel,
	Steven Rostedt, Frederic Weisbecker, kees.cook, agl,
	Serge E. Hallyn, Ingo Molnar, Andrew Morton, Tejun Heo,
	Michal Marek, Oleg Nesterov, Jiri Slaby, David Howells,
	Russell King, Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, Thomas Gleixner, H. Peter Anvin,
	x86, linux-arm-kernel, linux-mips, linuxppc-dev, linux-s390,
	linux-sh, sparclinux, Linus Torvalds

On Fri, May 13, 2011 at 10:55 AM, Eric Paris <eparis@redhat.com> wrote:
> On Fri, 2011-05-13 at 17:23 +0200, Peter Zijlstra wrote:
>> On Fri, 2011-05-13 at 11:10 -0400, Eric Paris wrote:
>> > Then again, I certainly don't see a
>> > reason that this syscall hardening patch should be held up while a whole
>> > new concept in computer security is contemplated...
>>
>> Which makes me wonder why this syscall hardening stuff is done outside
>> of LSM? Why isn't is part of the LSM so that say SELinux can have a
>> syscall bitmask per security context?
>
> I could do that, but I like Will's approach better.  From the PoV of
> meeting security goals of information flow, data confidentiality,
> integrity, least priv, etc limiting on the syscall boundary doesn't make
> a lot of sense.  You just don't know enough there to enforce these
> things.  These are the types of goals that SELinux and other LSMs have
> previously tried to enforce.  From the PoV of making the kernel more
> resistant to attacks and making a process more resistant to misbehavior
> I think that the syscall boundary is appropriate.  Although I could do
> it in SELinux it don't really want to do it there.

There's also the problem that there are no hooks per-system call for
LSMs, only logical hooks that sometimes mirror system call names and
are called after user data has been parsed.  If system call enter
hooks, like seccomp's, were added for LSMs, it would allow the lsm
bitmask approach, but it still wouldn't satisfy the issues you raise
below (and I wholeheartedly agree with).

> In case people are interested or confused let me give my definition of
> two words I've used a bit in these conversations: discretionary and
> mandatory.  Any time I talk about a 'discretionary' security decision it
> is a security decisions that a process imposed upon itself.  Aka the
> choice to use seccomp is discretionary.  The choice to mark our own file
> u-wx is discretionary.  This isn't the best definition but it's one that
> works well in this discussion.  Mandatory security is one enforce by a
> global policy.  It's what selinux is all about.  SELinux doesn't give
> hoot what a process wants to do, it enforces a global policy from the
> top down.  You take over a process, well, too bad, you still have no
> choice but to follow the mandatory policy.
>
> The LSM does NOT enforce a mandatory access control model, it's just how
> it's been used in the past.  Ingo appears to me (please correct me if
> I'm wrong) to really be a fan of exposing the flexibility of the LSM to
> a discretionary access control model.  That doesn't seem like a bad
> idea.  And maybe using the filter engine to define the language to do
> this isn't a bad idea either.  But I think that's a 'down the road'
> project, not something to hold up a better seccomp.
>
>> Making it part of the LSM also avoids having to add this prctl().
>
> Well, it would mean exposing some new language construct to every LSM
> (instead of a single prctl construct) and it would mean anyone wanting
> to use the interface would have to rely on the LSM implementing those
> hooks the way they need it.  Honestly chrome can already get all of the
> benefits of this patch (given a perfectly coded kernel) and a whole lot
> more using SELinux, but (surprise surprise) not everyone uses SELinux.
> I think it's a good idea to expose a simple interface which will be
> widely enough adopted that many userspace applications can rely on it
> for hardening.
>
> The existence of the LSM and the fact that there exists multiple
> security modules that may or may not be enabled really leads application
> developers to be unable to rely on LSM for security.  If linux had a
> single security model which everyone could rely on we wouldn't really
> have as big of an issue but that's not possible.  So I'm advocating for
> this series which will provide a single useful change which applications
> can rely upon across distros and platforms to enhance the properties and
> abilities of the linux kernel.
>
> -Eric
>
>

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-13 16:29                                       ` Will Drewry
  0 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-05-13 16:29 UTC (permalink / raw)
  To: Eric Paris
  Cc: linux-mips, linux-sh, Peter Zijlstra, Frederic Weisbecker,
	Heiko Carstens, Oleg Nesterov, David Howells, Paul Mackerras,
	H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390, Russell King,
	x86, James Morris, Linus Torvalds, Ingo Molnar, Ingo Molnar,
	Serge E. Hallyn, Steven Rostedt, Tejun Heo, Thomas Gleixner,
	kees.cook, linux-arm-kernel, Michal Marek, Michal Simek,
	linuxppc-dev, linux-kernel, Ralf Baechle, Paul Mundt,
	Martin Schwidefsky, linux390, Andrew Morton, agl,
	David S. Miller

On Fri, May 13, 2011 at 10:55 AM, Eric Paris <eparis@redhat.com> wrote:
> On Fri, 2011-05-13 at 17:23 +0200, Peter Zijlstra wrote:
>> On Fri, 2011-05-13 at 11:10 -0400, Eric Paris wrote:
>> > Then again, I certainly don't see a
>> > reason that this syscall hardening patch should be held up while a whole
>> > new concept in computer security is contemplated...
>>
>> Which makes me wonder why this syscall hardening stuff is done outside
>> of LSM? Why isn't is part of the LSM so that say SELinux can have a
>> syscall bitmask per security context?
>
> I could do that, but I like Will's approach better.  From the PoV of
> meeting security goals of information flow, data confidentiality,
> integrity, least priv, etc limiting on the syscall boundary doesn't make
> a lot of sense.  You just don't know enough there to enforce these
> things.  These are the types of goals that SELinux and other LSMs have
> previously tried to enforce.  From the PoV of making the kernel more
> resistant to attacks and making a process more resistant to misbehavior
> I think that the syscall boundary is appropriate.  Although I could do
> it in SELinux it don't really want to do it there.

There's also the problem that there are no hooks per-system call for
LSMs, only logical hooks that sometimes mirror system call names and
are called after user data has been parsed.  If system call enter
hooks, like seccomp's, were added for LSMs, it would allow the lsm
bitmask approach, but it still wouldn't satisfy the issues you raise
below (and I wholeheartedly agree with).

> In case people are interested or confused let me give my definition of
> two words I've used a bit in these conversations: discretionary and
> mandatory.  Any time I talk about a 'discretionary' security decision it
> is a security decisions that a process imposed upon itself.  Aka the
> choice to use seccomp is discretionary.  The choice to mark our own file
> u-wx is discretionary.  This isn't the best definition but it's one that
> works well in this discussion.  Mandatory security is one enforce by a
> global policy.  It's what selinux is all about.  SELinux doesn't give
> hoot what a process wants to do, it enforces a global policy from the
> top down.  You take over a process, well, too bad, you still have no
> choice but to follow the mandatory policy.
>
> The LSM does NOT enforce a mandatory access control model, it's just how
> it's been used in the past.  Ingo appears to me (please correct me if
> I'm wrong) to really be a fan of exposing the flexibility of the LSM to
> a discretionary access control model.  That doesn't seem like a bad
> idea.  And maybe using the filter engine to define the language to do
> this isn't a bad idea either.  But I think that's a 'down the road'
> project, not something to hold up a better seccomp.
>
>> Making it part of the LSM also avoids having to add this prctl().
>
> Well, it would mean exposing some new language construct to every LSM
> (instead of a single prctl construct) and it would mean anyone wanting
> to use the interface would have to rely on the LSM implementing those
> hooks the way they need it.  Honestly chrome can already get all of the
> benefits of this patch (given a perfectly coded kernel) and a whole lot
> more using SELinux, but (surprise surprise) not everyone uses SELinux.
> I think it's a good idea to expose a simple interface which will be
> widely enough adopted that many userspace applications can rely on it
> for hardening.
>
> The existence of the LSM and the fact that there exists multiple
> security modules that may or may not be enabled really leads application
> developers to be unable to rely on LSM for security.  If linux had a
> single security model which everyone could rely on we wouldn't really
> have as big of an issue but that's not possible.  So I'm advocating for
> this series which will provide a single useful change which applications
> can rely upon across distros and platforms to enhance the properties and
> abilities of the linux kernel.
>
> -Eric
>
>

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-13 16:29                                       ` Will Drewry
  0 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-05-13 16:29 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, May 13, 2011 at 10:55 AM, Eric Paris <eparis@redhat.com> wrote:
> On Fri, 2011-05-13 at 17:23 +0200, Peter Zijlstra wrote:
>> On Fri, 2011-05-13 at 11:10 -0400, Eric Paris wrote:
>> > Then again, I certainly don't see a
>> > reason that this syscall hardening patch should be held up while a whole
>> > new concept in computer security is contemplated...
>>
>> Which makes me wonder why this syscall hardening stuff is done outside
>> of LSM? Why isn't is part of the LSM so that say SELinux can have a
>> syscall bitmask per security context?
>
> I could do that, but I like Will's approach better.  From the PoV of
> meeting security goals of information flow, data confidentiality,
> integrity, least priv, etc limiting on the syscall boundary doesn't make
> a lot of sense.  You just don't know enough there to enforce these
> things.  These are the types of goals that SELinux and other LSMs have
> previously tried to enforce.  From the PoV of making the kernel more
> resistant to attacks and making a process more resistant to misbehavior
> I think that the syscall boundary is appropriate.  Although I could do
> it in SELinux it don't really want to do it there.

There's also the problem that there are no hooks per-system call for
LSMs, only logical hooks that sometimes mirror system call names and
are called after user data has been parsed.  If system call enter
hooks, like seccomp's, were added for LSMs, it would allow the lsm
bitmask approach, but it still wouldn't satisfy the issues you raise
below (and I wholeheartedly agree with).

> In case people are interested or confused let me give my definition of
> two words I've used a bit in these conversations: discretionary and
> mandatory.  Any time I talk about a 'discretionary' security decision it
> is a security decisions that a process imposed upon itself.  Aka the
> choice to use seccomp is discretionary.  The choice to mark our own file
> u-wx is discretionary.  This isn't the best definition but it's one that
> works well in this discussion.  Mandatory security is one enforce by a
> global policy.  It's what selinux is all about.  SELinux doesn't give
> hoot what a process wants to do, it enforces a global policy from the
> top down.  You take over a process, well, too bad, you still have no
> choice but to follow the mandatory policy.
>
> The LSM does NOT enforce a mandatory access control model, it's just how
> it's been used in the past.  Ingo appears to me (please correct me if
> I'm wrong) to really be a fan of exposing the flexibility of the LSM to
> a discretionary access control model.  That doesn't seem like a bad
> idea.  And maybe using the filter engine to define the language to do
> this isn't a bad idea either.  But I think that's a 'down the road'
> project, not something to hold up a better seccomp.
>
>> Making it part of the LSM also avoids having to add this prctl().
>
> Well, it would mean exposing some new language construct to every LSM
> (instead of a single prctl construct) and it would mean anyone wanting
> to use the interface would have to rely on the LSM implementing those
> hooks the way they need it.  Honestly chrome can already get all of the
> benefits of this patch (given a perfectly coded kernel) and a whole lot
> more using SELinux, but (surprise surprise) not everyone uses SELinux.
> I think it's a good idea to expose a simple interface which will be
> widely enough adopted that many userspace applications can rely on it
> for hardening.
>
> The existence of the LSM and the fact that there exists multiple
> security modules that may or may not be enabled really leads application
> developers to be unable to rely on LSM for security.  If linux had a
> single security model which everyone could rely on we wouldn't really
> have as big of an issue but that's not possible.  So I'm advocating for
> this series which will provide a single useful change which applications
> can rely upon across distros and platforms to enhance the properties and
> abilities of the linux kernel.
>
> -Eric
>
>

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-12  3:02                     ` Will Drewry
  (?)
@ 2011-05-13 19:35                       ` Arnd Bergmann
  -1 siblings, 0 replies; 406+ messages in thread
From: Arnd Bergmann @ 2011-05-13 19:35 UTC (permalink / raw)
  To: linux-arm-kernel
  Cc: Will Drewry, linux-kernel, linux-mips, linux-sh, Peter Zijlstra,
	Frederic Weisbecker, Heiko Carstens, David Howells,
	Paul Mackerras, Eric Paris, H. Peter Anvin, sparclinux,
	Jiri Slaby, linux-s390, Russell King, x86, jmorris, Ingo Molnar,
	Benjamin Herrenschmidt, Ingo Molnar, Serge E. Hallyn,
	Peter Zijlstra, microblaze-uclinux, Steven Rostedt,
	Martin Schwidefsky, Thomas Gleixner, kees.cook, Roland McGrath,
	Michal Marek, Michal Simek, linuxppc-dev, Oleg Nesterov,
	Ralf Baechle, Paul Mundt, Tejun Heo, linux390, Andrew Morton,
	agl, David S. Miller

On Thursday 12 May 2011, Will Drewry wrote:
> This change adds a new seccomp mode based on the work by
> agl@chromium.org in [1]. This new mode, "filter mode", provides a hash
> table of seccomp_filter objects.  When in the new mode (2), all system
> calls are checked against the filters - first by system call number,
> then by a filter string.  If an entry exists for a given system call and
> all filter predicates evaluate to true, then the task may proceed.
> Otherwise, the task is killed (as per seccomp_mode == 1).

I've got a question about this: Do you expect the typical usage to disallow
ioctl()? Given that ioctl alone is responsible for a huge number of exploits
in various drivers, while certain ioctls are immensely useful (FIONREAD,
FIOASYNC, ...), do you expect to extend the mechanism to filter specific
ioctl commands in the future?

	Arnd

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-13 19:35                       ` Arnd Bergmann
  0 siblings, 0 replies; 406+ messages in thread
From: Arnd Bergmann @ 2011-05-13 19:35 UTC (permalink / raw)
  To: linux-arm-kernel
  Cc: linux-mips, linux-sh, Peter Zijlstra, Frederic Weisbecker,
	Heiko Carstens, Oleg Nesterov, David Howells, Paul Mackerras,
	Ralf Baechle, H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390,
	Russell King, x86, jmorris, Ingo Molnar, Ingo Molnar,
	Serge E. Hallyn, Peter Zijlstra, microblaze-uclinux,
	Steven Rostedt, Tejun Heo, Thomas Gleixner, kees.cook,
	Roland McGrath, Michal Marek, Michal Simek, Will Drewry, agl,
	linux-kernel, Eric Paris, Paul Mundt, Martin Schwidefsky,
	linux390, Andrew Morton, linuxppc-dev, David S. Miller

On Thursday 12 May 2011, Will Drewry wrote:
> This change adds a new seccomp mode based on the work by
> agl@chromium.org in [1]. This new mode, "filter mode", provides a hash
> table of seccomp_filter objects.  When in the new mode (2), all system
> calls are checked against the filters - first by system call number,
> then by a filter string.  If an entry exists for a given system call and
> all filter predicates evaluate to true, then the task may proceed.
> Otherwise, the task is killed (as per seccomp_mode == 1).

I've got a question about this: Do you expect the typical usage to disallow
ioctl()? Given that ioctl alone is responsible for a huge number of exploits
in various drivers, while certain ioctls are immensely useful (FIONREAD,
FIOASYNC, ...), do you expect to extend the mechanism to filter specific
ioctl commands in the future?

	Arnd

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-13 19:35                       ` Arnd Bergmann
  0 siblings, 0 replies; 406+ messages in thread
From: Arnd Bergmann @ 2011-05-13 19:35 UTC (permalink / raw)
  To: linux-arm-kernel

On Thursday 12 May 2011, Will Drewry wrote:
> This change adds a new seccomp mode based on the work by
> agl at chromium.org in [1]. This new mode, "filter mode", provides a hash
> table of seccomp_filter objects.  When in the new mode (2), all system
> calls are checked against the filters - first by system call number,
> then by a filter string.  If an entry exists for a given system call and
> all filter predicates evaluate to true, then the task may proceed.
> Otherwise, the task is killed (as per seccomp_mode == 1).

I've got a question about this: Do you expect the typical usage to disallow
ioctl()? Given that ioctl alone is responsible for a huge number of exploits
in various drivers, while certain ioctls are immensely useful (FIONREAD,
FIOASYNC, ...), do you expect to extend the mechanism to filter specific
ioctl commands in the future?

	Arnd

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-13 15:27                                                   ` Peter Zijlstra
  (?)
@ 2011-05-14  7:05                                                     ` Ingo Molnar
  -1 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-14  7:05 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: James Morris, Will Drewry, linux-kernel, Steven Rostedt,
	Frederic Weisbecker, Eric Paris, kees.cook, agl, Serge E. Hallyn,
	Ingo Molnar, Andrew Morton, Tejun Heo, Michal Marek,
	Oleg Nesterov, Jiri Slaby, David Howells, Russell King,
	Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, Thomas Gleixner, H. Peter Anvin,
	x86, linux-arm-kernel, linux-mips, linuxppc-dev, linux-s390,
	linux-sh, sparclinux, Linus Torvalds


* Peter Zijlstra <peterz@infradead.org> wrote:

> On Fri, 2011-05-13 at 16:57 +0200, Ingo Molnar wrote:
> > this is a security mechanism
> 
> Who says? [...]

Kernel developers/maintainers of the affected code.

We have security hooks all around the kernel, which can deny/accept execution 
at various key points, but we do not have 'execute arbitrary user-space defined 
(safe) scripts' callbacks in general.

But yes, if a particular callback point is defined widely enough to allow much 
bigger intervention into the flow of execution, then more is possible as well.

> [...] and why would you want to unify two separate concepts only to then 
> limit it to security that just doesn't make sense.

I don't limit them to security - the callbacks themselves are either for 
passive observation or, at most, for security accept/deny callbacks.

It's decided by the subsystem maintainers what kind of user-space control power 
(or observation power) they want to allow, not me.

I would just like to not stop the facility itself at the 'observe only' level, 
like you suggest.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-14  7:05                                                     ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-14  7:05 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-mips, linux-sh, Frederic Weisbecker, Heiko Carstens,
	Oleg Nesterov, David Howells, Paul Mackerras, Eric Paris,
	H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390, Russell King,
	x86, James Morris, Linus Torvalds, Ingo Molnar, kees.cook,
	Serge E. Hallyn, Steven Rostedt, Tejun Heo, Thomas Gleixner,
	linux-arm-kernel, Michal Marek, Michal Simek, Will Drewry,
	linuxppc-dev, linux-kernel, Ralf Baechle, Paul Mundt,
	Martin Schwidefsky, linux390, Andrew Morton, agl,
	David S. Miller


* Peter Zijlstra <peterz@infradead.org> wrote:

> On Fri, 2011-05-13 at 16:57 +0200, Ingo Molnar wrote:
> > this is a security mechanism
> 
> Who says? [...]

Kernel developers/maintainers of the affected code.

We have security hooks all around the kernel, which can deny/accept execution 
at various key points, but we do not have 'execute arbitrary user-space defined 
(safe) scripts' callbacks in general.

But yes, if a particular callback point is defined widely enough to allow much 
bigger intervention into the flow of execution, then more is possible as well.

> [...] and why would you want to unify two separate concepts only to then 
> limit it to security that just doesn't make sense.

I don't limit them to security - the callbacks themselves are either for 
passive observation or, at most, for security accept/deny callbacks.

It's decided by the subsystem maintainers what kind of user-space control power 
(or observation power) they want to allow, not me.

I would just like to not stop the facility itself at the 'observe only' level, 
like you suggest.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-14  7:05                                                     ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-14  7:05 UTC (permalink / raw)
  To: linux-arm-kernel


* Peter Zijlstra <peterz@infradead.org> wrote:

> On Fri, 2011-05-13 at 16:57 +0200, Ingo Molnar wrote:
> > this is a security mechanism
> 
> Who says? [...]

Kernel developers/maintainers of the affected code.

We have security hooks all around the kernel, which can deny/accept execution 
at various key points, but we do not have 'execute arbitrary user-space defined 
(safe) scripts' callbacks in general.

But yes, if a particular callback point is defined widely enough to allow much 
bigger intervention into the flow of execution, then more is possible as well.

> [...] and why would you want to unify two separate concepts only to then 
> limit it to security that just doesn't make sense.

I don't limit them to security - the callbacks themselves are either for 
passive observation or, at most, for security accept/deny callbacks.

It's decided by the subsystem maintainers what kind of user-space control power 
(or observation power) they want to allow, not me.

I would just like to not stop the facility itself at the 'observe only' level, 
like you suggest.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-13 15:10                                 ` Eric Paris
  (?)
@ 2011-05-14  7:30                                   ` Ingo Molnar
  -1 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-14  7:30 UTC (permalink / raw)
  To: Eric Paris
  Cc: James Morris, Will Drewry, linux-kernel, Steven Rostedt,
	Frederic Weisbecker, kees.cook, agl, Peter Zijlstra,
	Serge E. Hallyn, Ingo Molnar, Andrew Morton, Tejun Heo,
	Michal Marek, Oleg Nesterov, Jiri Slaby, David Howells,
	Russell King, Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, Thomas Gleixner, H. Peter Anvin,
	x86, Peter Zijlstra, linux-arm-kernel, linux-mips, linuxppc-dev,
	linux-s390, linux-sh, sparclinux, Linus Torvalds


* Eric Paris <eparis@redhat.com> wrote:

> [dropping microblaze and roland]
> 
> On Fri, 2011-05-13 at 14:10 +0200, Ingo Molnar wrote:
> > * James Morris <jmorris@namei.org> wrote:
> 
> > It is a simple and sensible security feature, agreed? It allows most code to 
> > run well and link to countless libraries - but no access to other files is 
> > allowed.
> 
> It's simple enough and sounds reasonable, but you can read all the discussion 
> about AppArmour why many people don't really think it's the best. [...]

I have to say most of the participants of the AppArmour flamefests were dead 
wrong, and it wasn't the AppArmour folks who were wrong ...

The straight ASCII VFS namespace *makes sense*, and yes, the raw physical 
objects space that SELinux uses makes sense as well.

And no, i do not subscribe to the dogma that it is not possible to secure the 
ASCII VFS namespace: it evidently is possible, if you know and handle the 
ambiguities. It is also obviously true that the ASCII VFS namespaces we use 
every day are a *lot* more intuitive than the labeled physical objects space 
...

What all the security flamewars missed is the simple fact that being intuitive 
matters a *lot* not just to not annoy users, but also to broaden the effective 
number of security-conscious developers ...

> > Unfortunately this audit callback cannot be used for my purposes, because 
> > the event is single-purpose for auditd and because it allows no feedback 
> > (no deny/accept discretion for the security policy).
> > 
> > But if we had this simple event there:
> > 
> > 	err = event_vfs_getname(result);
> 
> Wow it sounds so easy.  Now lets keep extending your train of thought
> until we can actually provide the security provided by SELinux.  What do
> we end up with?  We end up with an event hook right next to every LSM
> hook.  You know, the LSM hooks were placed where they are for a reason.
> Because those were the locations inside the kernel where you actually
> have information about the task doing an operation and the objects
> (files, sockets, directories, other tasks, etc) they are doing an
> operation on.
> 
> Honestly all you are talking about it remaking the LSM with 2 sets of
> hooks instead if 1.  Why? [...]

Not at all. I am talking about using *one* set of events, to keep the intrusion 
at the lowest possible level.

LSM could make use of them as well.

Obviously for pragmatic reasons that might not be feasible initially.

> [...]  It seems much easier that if you want the language of the filter 
> engine you would just make a new LSM that uses the filter engine for it's 
> policy language rather than the language created by SELinux or SMACK or name 
> your LSM implementation.

Correct, that is what i suggested.

Note that performance is a primary concern, so if certain filters are very 
popular then in practice this would come with support for a couple of 'built 
in' (pre-optimized) filters that the kernel can accelerate directly, so that we 
do not incur the cost of executing the filter preds for really common-sense 
security policies that almost everyone is using.

I.e. in the end we'd *roughly* end up with the same performance and security as 
we are today (i mean, SELinux and the other LSMs did a nice job of collecting 
the things that apps should be careful about), but the crucial difference isn't 
just the advantages i mentioned, but the fact that the *development model* of 
security modules would be a *lot* more extensible.

So security efforts could move to a whole different level: they could move into 
key apps and they could integrate with the general mind-set of developers.

At least Will as an application framework developer cares, so that hope is 
justified i think.

> >  - unprivileged:  application-definable, allowing the embedding of security 
> >                   policy in *apps* as well, not just the system
> > 
> >  - flexible:      can be added/removed runtime unprivileged, and cheaply so
> > 
> >  - transparent:   does not impact executing code that meets the policy
> > 
> >  - nestable:      it is inherited by child tasks and is fundamentally stackable,
> >                   multiple policies will have the combined effect and they
> >                   are transparent to each other. So if a child task within a
> >                   sandbox adds *more* checks then those add to the already
> >                   existing set of checks. We only narrow permissions, never
> >                   extend them.
> > 
> >  - generic:       allowing observation and (safe) control of security relevant
> >                   parameters not just at the system call boundary but at other
> >                   relevant places of kernel execution as well: which 
> >                   points/callbacks could also be used for other types of event 
> >                   extraction such as perf. It could even be shared with audit ...
> 
> I'm not arguing that any of these things are bad things.  What you describe 
> is a new LSM that uses a discretionary access control model but with the 
> granularity and flexibility that has traditionally only existed in the 
> mandatory access control security modules previously implemented in the 
> kernel.
> 
> I won't argue that's a bad idea, there's no reason in my mind that a process 
> shouldn't be allowed to control it's own access decisions in a more flexible 
> way than rwx bits.  Then again, I certainly don't see a reason that this 
> syscall hardening patch should be held up while a whole new concept in 
> computer security is contemplated...

Note, i'm not actually asking for the moon, a pony and more.

I fully submit that we are yet far away from being able to do a full LSM via 
this mechanism.

What i'm asking for is that, because the syscall point steps taken by Will look 
very promising, it would be nice to do *that* in a slightly more flexible 
scheme that does not condemn it to be limited to the syscall boundary and such 
...

Also, to answer you, do you say that by my argument someone should have stood 
up and said 'no' to the LSM mess that was introduced a couple of years ago and 
which caused so many problems:

 - kernel inefficiencies and user-space overhead

 - stalled security efforts

 - infighting

 - friction, fragmentation, overmodularization

 - non-stackability

 - security annoyances on the Linux desktop

 - probably *less* Linux security

and should have asked them to do something better designed instead?

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-14  7:30                                   ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-14  7:30 UTC (permalink / raw)
  To: Eric Paris
  Cc: linux-mips, linux-sh, Peter Zijlstra, Frederic Weisbecker,
	Heiko Carstens, Oleg Nesterov, David Howells, Paul Mackerras,
	H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390, Russell King,
	x86, James Morris, Linus Torvalds, Ingo Molnar, kees.cook,
	Serge E. Hallyn, Peter Zijlstra, Steven Rostedt, Tejun Heo,
	Thomas Gleixner, linux-arm-kernel, Michal Marek, Michal Simek,
	Will Drewry, linuxppc-dev, linux-kernel, Ralf Baechle,
	Paul Mundt, Martin Schwidefsky, linux390, Andrew Morton, agl,
	David S. Miller


* Eric Paris <eparis@redhat.com> wrote:

> [dropping microblaze and roland]
> 
> On Fri, 2011-05-13 at 14:10 +0200, Ingo Molnar wrote:
> > * James Morris <jmorris@namei.org> wrote:
> 
> > It is a simple and sensible security feature, agreed? It allows most code to 
> > run well and link to countless libraries - but no access to other files is 
> > allowed.
> 
> It's simple enough and sounds reasonable, but you can read all the discussion 
> about AppArmour why many people don't really think it's the best. [...]

I have to say most of the participants of the AppArmour flamefests were dead 
wrong, and it wasn't the AppArmour folks who were wrong ...

The straight ASCII VFS namespace *makes sense*, and yes, the raw physical 
objects space that SELinux uses makes sense as well.

And no, i do not subscribe to the dogma that it is not possible to secure the 
ASCII VFS namespace: it evidently is possible, if you know and handle the 
ambiguities. It is also obviously true that the ASCII VFS namespaces we use 
every day are a *lot* more intuitive than the labeled physical objects space 
...

What all the security flamewars missed is the simple fact that being intuitive 
matters a *lot* not just to not annoy users, but also to broaden the effective 
number of security-conscious developers ...

> > Unfortunately this audit callback cannot be used for my purposes, because 
> > the event is single-purpose for auditd and because it allows no feedback 
> > (no deny/accept discretion for the security policy).
> > 
> > But if we had this simple event there:
> > 
> > 	err = event_vfs_getname(result);
> 
> Wow it sounds so easy.  Now lets keep extending your train of thought
> until we can actually provide the security provided by SELinux.  What do
> we end up with?  We end up with an event hook right next to every LSM
> hook.  You know, the LSM hooks were placed where they are for a reason.
> Because those were the locations inside the kernel where you actually
> have information about the task doing an operation and the objects
> (files, sockets, directories, other tasks, etc) they are doing an
> operation on.
> 
> Honestly all you are talking about it remaking the LSM with 2 sets of
> hooks instead if 1.  Why? [...]

Not at all. I am talking about using *one* set of events, to keep the intrusion 
at the lowest possible level.

LSM could make use of them as well.

Obviously for pragmatic reasons that might not be feasible initially.

> [...]  It seems much easier that if you want the language of the filter 
> engine you would just make a new LSM that uses the filter engine for it's 
> policy language rather than the language created by SELinux or SMACK or name 
> your LSM implementation.

Correct, that is what i suggested.

Note that performance is a primary concern, so if certain filters are very 
popular then in practice this would come with support for a couple of 'built 
in' (pre-optimized) filters that the kernel can accelerate directly, so that we 
do not incur the cost of executing the filter preds for really common-sense 
security policies that almost everyone is using.

I.e. in the end we'd *roughly* end up with the same performance and security as 
we are today (i mean, SELinux and the other LSMs did a nice job of collecting 
the things that apps should be careful about), but the crucial difference isn't 
just the advantages i mentioned, but the fact that the *development model* of 
security modules would be a *lot* more extensible.

So security efforts could move to a whole different level: they could move into 
key apps and they could integrate with the general mind-set of developers.

At least Will as an application framework developer cares, so that hope is 
justified i think.

> >  - unprivileged:  application-definable, allowing the embedding of security 
> >                   policy in *apps* as well, not just the system
> > 
> >  - flexible:      can be added/removed runtime unprivileged, and cheaply so
> > 
> >  - transparent:   does not impact executing code that meets the policy
> > 
> >  - nestable:      it is inherited by child tasks and is fundamentally stackable,
> >                   multiple policies will have the combined effect and they
> >                   are transparent to each other. So if a child task within a
> >                   sandbox adds *more* checks then those add to the already
> >                   existing set of checks. We only narrow permissions, never
> >                   extend them.
> > 
> >  - generic:       allowing observation and (safe) control of security relevant
> >                   parameters not just at the system call boundary but at other
> >                   relevant places of kernel execution as well: which 
> >                   points/callbacks could also be used for other types of event 
> >                   extraction such as perf. It could even be shared with audit ...
> 
> I'm not arguing that any of these things are bad things.  What you describe 
> is a new LSM that uses a discretionary access control model but with the 
> granularity and flexibility that has traditionally only existed in the 
> mandatory access control security modules previously implemented in the 
> kernel.
> 
> I won't argue that's a bad idea, there's no reason in my mind that a process 
> shouldn't be allowed to control it's own access decisions in a more flexible 
> way than rwx bits.  Then again, I certainly don't see a reason that this 
> syscall hardening patch should be held up while a whole new concept in 
> computer security is contemplated...

Note, i'm not actually asking for the moon, a pony and more.

I fully submit that we are yet far away from being able to do a full LSM via 
this mechanism.

What i'm asking for is that, because the syscall point steps taken by Will look 
very promising, it would be nice to do *that* in a slightly more flexible 
scheme that does not condemn it to be limited to the syscall boundary and such 
...

Also, to answer you, do you say that by my argument someone should have stood 
up and said 'no' to the LSM mess that was introduced a couple of years ago and 
which caused so many problems:

 - kernel inefficiencies and user-space overhead

 - stalled security efforts

 - infighting

 - friction, fragmentation, overmodularization

 - non-stackability

 - security annoyances on the Linux desktop

 - probably *less* Linux security

and should have asked them to do something better designed instead?

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-14  7:30                                   ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-14  7:30 UTC (permalink / raw)
  To: linux-arm-kernel


* Eric Paris <eparis@redhat.com> wrote:

> [dropping microblaze and roland]
> 
> On Fri, 2011-05-13 at 14:10 +0200, Ingo Molnar wrote:
> > * James Morris <jmorris@namei.org> wrote:
> 
> > It is a simple and sensible security feature, agreed? It allows most code to 
> > run well and link to countless libraries - but no access to other files is 
> > allowed.
> 
> It's simple enough and sounds reasonable, but you can read all the discussion 
> about AppArmour why many people don't really think it's the best. [...]

I have to say most of the participants of the AppArmour flamefests were dead 
wrong, and it wasn't the AppArmour folks who were wrong ...

The straight ASCII VFS namespace *makes sense*, and yes, the raw physical 
objects space that SELinux uses makes sense as well.

And no, i do not subscribe to the dogma that it is not possible to secure the 
ASCII VFS namespace: it evidently is possible, if you know and handle the 
ambiguities. It is also obviously true that the ASCII VFS namespaces we use 
every day are a *lot* more intuitive than the labeled physical objects space 
...

What all the security flamewars missed is the simple fact that being intuitive 
matters a *lot* not just to not annoy users, but also to broaden the effective 
number of security-conscious developers ...

> > Unfortunately this audit callback cannot be used for my purposes, because 
> > the event is single-purpose for auditd and because it allows no feedback 
> > (no deny/accept discretion for the security policy).
> > 
> > But if we had this simple event there:
> > 
> > 	err = event_vfs_getname(result);
> 
> Wow it sounds so easy.  Now lets keep extending your train of thought
> until we can actually provide the security provided by SELinux.  What do
> we end up with?  We end up with an event hook right next to every LSM
> hook.  You know, the LSM hooks were placed where they are for a reason.
> Because those were the locations inside the kernel where you actually
> have information about the task doing an operation and the objects
> (files, sockets, directories, other tasks, etc) they are doing an
> operation on.
> 
> Honestly all you are talking about it remaking the LSM with 2 sets of
> hooks instead if 1.  Why? [...]

Not at all. I am talking about using *one* set of events, to keep the intrusion 
at the lowest possible level.

LSM could make use of them as well.

Obviously for pragmatic reasons that might not be feasible initially.

> [...]  It seems much easier that if you want the language of the filter 
> engine you would just make a new LSM that uses the filter engine for it's 
> policy language rather than the language created by SELinux or SMACK or name 
> your LSM implementation.

Correct, that is what i suggested.

Note that performance is a primary concern, so if certain filters are very 
popular then in practice this would come with support for a couple of 'built 
in' (pre-optimized) filters that the kernel can accelerate directly, so that we 
do not incur the cost of executing the filter preds for really common-sense 
security policies that almost everyone is using.

I.e. in the end we'd *roughly* end up with the same performance and security as 
we are today (i mean, SELinux and the other LSMs did a nice job of collecting 
the things that apps should be careful about), but the crucial difference isn't 
just the advantages i mentioned, but the fact that the *development model* of 
security modules would be a *lot* more extensible.

So security efforts could move to a whole different level: they could move into 
key apps and they could integrate with the general mind-set of developers.

At least Will as an application framework developer cares, so that hope is 
justified i think.

> >  - unprivileged:  application-definable, allowing the embedding of security 
> >                   policy in *apps* as well, not just the system
> > 
> >  - flexible:      can be added/removed runtime unprivileged, and cheaply so
> > 
> >  - transparent:   does not impact executing code that meets the policy
> > 
> >  - nestable:      it is inherited by child tasks and is fundamentally stackable,
> >                   multiple policies will have the combined effect and they
> >                   are transparent to each other. So if a child task within a
> >                   sandbox adds *more* checks then those add to the already
> >                   existing set of checks. We only narrow permissions, never
> >                   extend them.
> > 
> >  - generic:       allowing observation and (safe) control of security relevant
> >                   parameters not just at the system call boundary but at other
> >                   relevant places of kernel execution as well: which 
> >                   points/callbacks could also be used for other types of event 
> >                   extraction such as perf. It could even be shared with audit ...
> 
> I'm not arguing that any of these things are bad things.  What you describe 
> is a new LSM that uses a discretionary access control model but with the 
> granularity and flexibility that has traditionally only existed in the 
> mandatory access control security modules previously implemented in the 
> kernel.
> 
> I won't argue that's a bad idea, there's no reason in my mind that a process 
> shouldn't be allowed to control it's own access decisions in a more flexible 
> way than rwx bits.  Then again, I certainly don't see a reason that this 
> syscall hardening patch should be held up while a whole new concept in 
> computer security is contemplated...

Note, i'm not actually asking for the moon, a pony and more.

I fully submit that we are yet far away from being able to do a full LSM via 
this mechanism.

What i'm asking for is that, because the syscall point steps taken by Will look 
very promising, it would be nice to do *that* in a slightly more flexible 
scheme that does not condemn it to be limited to the syscall boundary and such 
...

Also, to answer you, do you say that by my argument someone should have stood 
up and said 'no' to the LSM mess that was introduced a couple of years ago and 
which caused so many problems:

 - kernel inefficiencies and user-space overhead

 - stalled security efforts

 - infighting

 - friction, fragmentation, overmodularization

 - non-stackability

 - security annoyances on the Linux desktop

 - probably *less* Linux security

and should have asked them to do something better designed instead?

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-14 20:57                                     ` Will Drewry
  0 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-05-14 20:57 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Eric Paris, James Morris, linux-kernel, Steven Rostedt,
	Frederic Weisbecker, kees.cook, agl, Peter Zijlstra,
	Serge E. Hallyn, Ingo Molnar, Andrew Morton, Tejun Heo,
	Michal Marek, Oleg Nesterov, Jiri Slaby, David Howells,
	Russell King, Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, Thomas Gleixner, H. Peter Anvin,
	x86, Peter Zijlstra, linux-arm-kernel, linux-mips, linuxppc-dev,
	linux-s390, linux-sh, sparclinux, Linus Torvalds

On Sat, May 14, 2011 at 2:30 AM, Ingo Molnar <mingo@elte.hu> wrote:
>
> * Eric Paris <eparis@redhat.com> wrote:
>
>> [dropping microblaze and roland]
>>
>> On Fri, 2011-05-13 at 14:10 +0200, Ingo Molnar wrote:
>> > * James Morris <jmorris@namei.org> wrote:
>>
>> > It is a simple and sensible security feature, agreed? It allows most code to
>> > run well and link to countless libraries - but no access to other files is
>> > allowed.
>>
>> It's simple enough and sounds reasonable, but you can read all the discussion
>> about AppArmour why many people don't really think it's the best. [...]
>
> I have to say most of the participants of the AppArmour flamefests were dead
> wrong, and it wasnt the AppArmour folks who were wrong ...
>
> The straight ASCII VFS namespace *makes sense*, and yes, the raw physical
> objects space that SELinux uses makes sense as well.
>
> And no, i do not subscribe to the dogma that it is not possible to secure the
> ASCII VFS namespace: it evidently is possible, if you know and handle the
> ambiguitites. It is also obviously true that the ASCII VFS namespaces we use
> every day are a *lot* more intuitive than the labeled physical objects space
> ...
>
> What all the security flamewars missed is the simple fact that being intuitive
> matters a *lot* not just to not annoy users, but also to broaden the effective
> number of security-conscious developers ...
>
>> > Unfortunately this audit callback cannot be used for my purposes, because
>> > the event is single-purpose for auditd and because it allows no feedback
>> > (no deny/accept discretion for the security policy).
>> >
>> > But if had this simple event there:
>> >
>> >     err = event_vfs_getname(result);
>>
>> Wow it sounds so easy.  Now lets keep extending your train of thought
>> until we can actually provide the security provided by SELinux.  What do
>> we end up with?  We end up with an event hook right next to every LSM
>> hook.  You know, the LSM hooks were placed where they are for a reason.
>> Because those were the locations inside the kernel where you actually
>> have information about the task doing an operation and the objects
>> (files, sockets, directories, other tasks, etc) they are doing an
>> operation on.
>>
>> Honestly all you are talking about it remaking the LSM with 2 sets of
>> hooks instead if 1.  Why? [...]
>
> Not at all. I am taking about using *one* set of events, to keep the intrusion
> at the lowest possible level.
>
> LSM could make use of them as well.
>
> Obviously for pragmatic reasons that might not be feasible initially.
>
>> [...]  It seems much easier that if you want the language of the filter
>> engine you would just make a new LSM that uses the filter engine for it's
>> policy language rather than the language created by SELinux or SMACK or name
>> your LSM implementation.
>
> Correct, that is what i suggested.
>
> Note that performance is a primary concern, so if certain filters are very
> popular then in practice this would come with support for a couple of 'built
> in' (pre-optimized) filters that the kernel can accelerate directly, so that we
> do not incure the cost of executing the filter preds for really common-sense
> security policies that almost everyone is using.
>
> I.e. in the end we'd *roughly* end up with the same performance and security as
> we are today (i mean, SELinux and the other LSMs did a nice job of collecting
> the things that apps should be careful about), but the crutial difference isnt
> just the advantages i menioned, but the fact that the *development model* of
> security modules would be a *lot* more extensible.
>
> So security efforts could move to a whole different level: they could move into
> key apps and they could integrate with the general mind-set of developers.
>
> At least Will as an application framework developer cares, so that hope is
> justified i think.
>
>> >  - unprivileged:  application-definable, allowing the embedding of security
>> >                   policy in *apps* as well, not just the system
>> >
>> >  - flexible:      can be added/removed runtime unprivileged, and cheaply so
>> >
>> >  - transparent:   does not impact executing code that meets the policy
>> >
>> >  - nestable:      it is inherited by child tasks and is fundamentally stackable,
>> >                   multiple policies will have the combined effect and they
>> >                   are transparent to each other. So if a child task within a
>> >                   sandbox adds *more* checks then those add to the already
>> >                   existing set of checks. We only narrow permissions, never
>> >                   extend them.
>> >
>> >  - generic:       allowing observation and (safe) control of security relevant
>> >                   parameters not just at the system call boundary but at other
>> >                   relevant places of kernel execution as well: which
>> >                   points/callbacks could also be used for other types of event
>> >                   extraction such as perf. It could even be shared with audit ...
>>
>> I'm not arguing that any of these things are bad things.  What you describe
>> is a new LSM that uses a discretionary access control model but with the
>> granularity and flexibility that has traditionally only existed in the
>> mandatory access control security modules previously implemented in the
>> kernel.
>>
>> I won't argue that's a bad idea, there's no reason in my mind that a process
>> shouldn't be allowed to control it's own access decisions in a more flexible
>> way than rwx bits.  Then again, I certainly don't see a reason that this
>> syscall hardening patch should be held up while a whole new concept in
>> computer security is contemplated...
>
> Note, i'm not actually asking for the moon, a pony and more.
>
> I fully submit that we are yet far away from being able to do a full LSM via
> this mechanism.
>
> What i'm asking for is that because the syscall point steps taken by Will look
> very promising so it would be nice to do *that* in a slightly more flexible
> scheme that does not condemn it to be limited to the syscall boundary and such
> ...

What do you suggest here?

From my brief exploration of the ftrace/perf (and seccomp) code, I
don't see a clean way of integrating over the existing interfaces to
the ftrace framework (e.g., the global perf event pump seems to be a
mismatch), but I may be missing something obvious.   In my view,
implementing this nestled between the seccomp/ftrace world provides a
stepping stone forward without being too restrictive.  No matter how
we change security events in the future, system calls will always be
the first line of attack surface reduction.  It could just mean that,
in the long term, accessing the "security event filtering" framework
is done through another new interface with seccomp providing only a
targeted syscall filtering featureset that may one day be deprecated
(if that day ever comes).

If there's a clear way to cleanly expand this interface that I'm
missing, I'd love to know - thanks!
will

> Also, to answer you, do you say that by my argument someone should have stood
> up and said 'no' to the LSM mess that was introduced a couple of years ago and
> which caused so many problems:
>
>  - kernel inefficiencies and user-space overhead
>
>  - stalled security efforts
>
>  - infighting
>
>  - friction, fragmentation, overmodularization
>
>  - non-stackability
>
>  - security annoyances on the Linux desktop
>
>  - probably *less* Linux security
>
> and should have asked them to do something better designed instead?
>
> Thanks,
>
>        Ingo
>

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-14 20:57                                     ` Will Drewry
  0 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-05-14 20:57 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Eric Paris, James Morris, linux-kernel, Steven Rostedt,
	Frederic Weisbecker, kees.cook, agl, Peter Zijlstra,
	Serge E. Hallyn, Ingo Molnar, Andrew Morton, Tejun Heo,
	Michal Marek, Oleg Nesterov, Jiri Slaby, David Howells,
	Russell King, Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, Thomas Gleixner, H. Peter Anvin,
	x86, Peter Zijlstra, linux-arm-kernel, linux-mips, linuxppc-dev,
	linux-s390, linux-sh, sparclinux, Linus Torvalds

On Sat, May 14, 2011 at 2:30 AM, Ingo Molnar <mingo@elte.hu> wrote:
>
> * Eric Paris <eparis@redhat.com> wrote:
>
>> [dropping microblaze and roland]
>>
>> On Fri, 2011-05-13 at 14:10 +0200, Ingo Molnar wrote:
>> > * James Morris <jmorris@namei.org> wrote:
>>
>> > It is a simple and sensible security feature, agreed? It allows most code to
>> > run well and link to countless libraries - but no access to other files is
>> > allowed.
>>
>> It's simple enough and sounds reasonable, but you can read all the discussion
>> about AppArmour why many people don't really think it's the best. [...]
>
> I have to say most of the participants of the AppArmour flamefests were dead
> wrong, and it wasnt the AppArmour folks who were wrong ...
>
> The straight ASCII VFS namespace *makes sense*, and yes, the raw physical
> objects space that SELinux uses makes sense as well.
>
> And no, i do not subscribe to the dogma that it is not possible to secure the
> ASCII VFS namespace: it evidently is possible, if you know and handle the
> ambiguitites. It is also obviously true that the ASCII VFS namespaces we use
> every day are a *lot* more intuitive than the labeled physical objects space
> ...
>
> What all the security flamewars missed is the simple fact that being intuitive
> matters a *lot* not just to not annoy users, but also to broaden the effective
> number of security-conscious developers ...
>
>> > Unfortunately this audit callback cannot be used for my purposes, because
>> > the event is single-purpose for auditd and because it allows no feedback
>> > (no deny/accept discretion for the security policy).
>> >
>> > But if had this simple event there:
>> >
>> >     err = event_vfs_getname(result);
>>
>> Wow it sounds so easy.  Now lets keep extending your train of thought
>> until we can actually provide the security provided by SELinux.  What do
>> we end up with?  We end up with an event hook right next to every LSM
>> hook.  You know, the LSM hooks were placed where they are for a reason.
>> Because those were the locations inside the kernel where you actually
>> have information about the task doing an operation and the objects
>> (files, sockets, directories, other tasks, etc) they are doing an
>> operation on.
>>
>> Honestly all you are talking about it remaking the LSM with 2 sets of
>> hooks instead if 1.  Why? [...]
>
> Not at all. I am taking about using *one* set of events, to keep the intrusion
> at the lowest possible level.
>
> LSM could make use of them as well.
>
> Obviously for pragmatic reasons that might not be feasible initially.
>
>> [...]  It seems much easier that if you want the language of the filter
>> engine you would just make a new LSM that uses the filter engine for it's
>> policy language rather than the language created by SELinux or SMACK or name
>> your LSM implementation.
>
> Correct, that is what i suggested.
>
> Note that performance is a primary concern, so if certain filters are very
> popular then in practice this would come with support for a couple of 'built
> in' (pre-optimized) filters that the kernel can accelerate directly, so that we
> do not incure the cost of executing the filter preds for really common-sense
> security policies that almost everyone is using.
>
> I.e. in the end we'd *roughly* end up with the same performance and security as
> we are today (i mean, SELinux and the other LSMs did a nice job of collecting
> the things that apps should be careful about), but the crutial difference isnt
> just the advantages i menioned, but the fact that the *development model* of
> security modules would be a *lot* more extensible.
>
> So security efforts could move to a whole different level: they could move into
> key apps and they could integrate with the general mind-set of developers.
>
> At least Will as an application framework developer cares, so that hope is
> justified i think.
>
>> >  - unprivileged:  application-definable, allowing the embedding of security
>> >                   policy in *apps* as well, not just the system
>> >
>> >  - flexible:      can be added/removed runtime unprivileged, and cheaply so
>> >
>> >  - transparent:   does not impact executing code that meets the policy
>> >
>> >  - nestable:      it is inherited by child tasks and is fundamentally stackable,
>> >                   multiple policies will have the combined effect and they
>> >                   are transparent to each other. So if a child task within a
>> >                   sandbox adds *more* checks then those add to the already
>> >                   existing set of checks. We only narrow permissions, never
>> >                   extend them.
>> >
>> >  - generic:       allowing observation and (safe) control of security relevant
>> >                   parameters not just at the system call boundary but at other
>> >                   relevant places of kernel execution as well: which
>> >                   points/callbacks could also be used for other types of event
>> >                   extraction such as perf. It could even be shared with audit ...
>>
>> I'm not arguing that any of these things are bad things.  What you describe
>> is a new LSM that uses a discretionary access control model but with the
>> granularity and flexibility that has traditionally only existed in the
>> mandatory access control security modules previously implemented in the
>> kernel.
>>
>> I won't argue that's a bad idea, there's no reason in my mind that a process
>> shouldn't be allowed to control it's own access decisions in a more flexible
>> way than rwx bits.  Then again, I certainly don't see a reason that this
>> syscall hardening patch should be held up while a whole new concept in
>> computer security is contemplated...
>
> Note, i'm not actually asking for the moon, a pony and more.
>
> I fully submit that we are yet far away from being able to do a full LSM via
> this mechanism.
>
> What i'm asking for is that because the syscall point steps taken by Will look
> very promising so it would be nice to do *that* in a slightly more flexible
> scheme that does not condemn it to be limited to the syscall boundary and such
> ...

What do you suggest here?

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-14 20:57                                     ` Will Drewry
  0 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-05-14 20:57 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: linux-mips, linux-sh, Peter Zijlstra, Frederic Weisbecker,
	Heiko Carstens, Oleg Nesterov, David Howells, Paul Mackerras,
	Eric Paris, H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390,
	Russell King, x86, James Morris, Linus Torvalds, Ingo Molnar,
	kees.cook, Serge E. Hallyn, Peter Zijlstra, Steven Rostedt,
	Tejun Heo, Thomas Gleixner, linux-arm-kernel, Michal Marek,
	Michal Simek, linuxppc-dev, linux-kernel, Ralf Baechle,
	Paul Mundt, Martin Schwidefsky, linux390, Andrew Morton, agl,
	David S. Miller

On Sat, May 14, 2011 at 2:30 AM, Ingo Molnar <mingo@elte.hu> wrote:
>
> * Eric Paris <eparis@redhat.com> wrote:
>
>> [dropping microblaze and roland]
>>
>> On Fri, 2011-05-13 at 14:10 +0200, Ingo Molnar wrote:
>> > * James Morris <jmorris@namei.org> wrote:
>>
>> > It is a simple and sensible security feature, agreed? It allows most code to
>> > run well and link to countless libraries - but no access to other files is
>> > allowed.
>>
>> It's simple enough and sounds reasonable, but you can read all the discussion
>> about AppArmour why many people don't really think it's the best. [...]
>
> I have to say most of the participants of the AppArmour flamefests were dead
> wrong, and it wasnt the AppArmour folks who were wrong ...
>
> The straight ASCII VFS namespace *makes sense*, and yes, the raw physical
> objects space that SELinux uses makes sense as well.
>
> And no, i do not subscribe to the dogma that it is not possible to secure the
> ASCII VFS namespace: it evidently is possible, if you know and handle the
> ambiguitites. It is also obviously true that the ASCII VFS namespaces we use
> every day are a *lot* more intuitive than the labeled physical objects space
> ...
>
> What all the security flamewars missed is the simple fact that being intuitive
> matters a *lot* not just to not annoy users, but also to broaden the effective
> number of security-conscious developers ...
>
>> > Unfortunately this audit callback cannot be used for my purposes, because
>> > the event is single-purpose for auditd and because it allows no feedback
>> > (no deny/accept discretion for the security policy).
>> >
>> > But if had this simple event there:
>> >
>> >     err = event_vfs_getname(result);
>>
>> Wow it sounds so easy.  Now lets keep extending your train of thought
>> until we can actually provide the security provided by SELinux.  What do
>> we end up with?  We end up with an event hook right next to every LSM
>> hook.  You know, the LSM hooks were placed where they are for a reason.
>> Because those were the locations inside the kernel where you actually
>> have information about the task doing an operation and the objects
>> (files, sockets, directories, other tasks, etc) they are doing an
>> operation on.
>>
>> Honestly all you are talking about it remaking the LSM with 2 sets of
>> hooks instead if 1.  Why? [...]
>
> Not at all. I am taking about using *one* set of events, to keep the intrusion
> at the lowest possible level.
>
> LSM could make use of them as well.
>
> Obviously for pragmatic reasons that might not be feasible initially.
>
>> [...]  It seems much easier that if you want the language of the filter
>> engine you would just make a new LSM that uses the filter engine for it's
>> policy language rather than the language created by SELinux or SMACK or name
>> your LSM implementation.
>
> Correct, that is what i suggested.
>
> Note that performance is a primary concern, so if certain filters are very
> popular then in practice this would come with support for a couple of 'built
> in' (pre-optimized) filters that the kernel can accelerate directly, so that we
> do not incure the cost of executing the filter preds for really common-sense
> security policies that almost everyone is using.
>
> I.e. in the end we'd *roughly* end up with the same performance and security as
> we are today (i mean, SELinux and the other LSMs did a nice job of collecting
> the things that apps should be careful about), but the crutial difference isnt
> just the advantages i menioned, but the fact that the *development model* of
> security modules would be a *lot* more extensible.
>
> So security efforts could move to a whole different level: they could move into
> key apps and they could integrate with the general mind-set of developers.
>
> At least Will as an application framework developer cares, so that hope is
> justified i think.
>
>> >  - unprivileged:  application-definable, allowing the embedding of security
>> >                   policy in *apps* as well, not just the system
>> >
>> >  - flexible:      can be added/removed runtime unprivileged, and cheaply so
>> >
>> >  - transparent:   does not impact executing code that meets the policy
>> >
>> >  - nestable:      it is inherited by child tasks and is fundamentally stackable,
>> >                   multiple policies will have the combined effect and they
>> >                   are transparent to each other. So if a child task within a
>> >                   sandbox adds *more* checks then those add to the already
>> >                   existing set of checks. We only narrow permissions, never
>> >                   extend them.
>> >
>> >  - generic:       allowing observation and (safe) control of security relevant
>> >                   parameters not just at the system call boundary but at other
>> >                   relevant places of kernel execution as well: which
>> >                   points/callbacks could also be used for other types of event
>> >                   extraction such as perf. It could even be shared with audit ...
>>
>> I'm not arguing that any of these things are bad things.  What you describe
>> is a new LSM that uses a discretionary access control model but with the
>> granularity and flexibility that has traditionally only existed in the
>> mandatory access control security modules previously implemented in the
>> kernel.
>>
>> I won't argue that's a bad idea, there's no reason in my mind that a process
>> shouldn't be allowed to control it's own access decisions in a more flexible
>> way than rwx bits.  Then again, I certainly don't see a reason that this
>> syscall hardening patch should be held up while a whole new concept in
>> computer security is contemplated...
>
> Note, i'm not actually asking for the moon, a pony and more.
>
> I fully submit that we are yet far away from being able to do a full LSM via
> this mechanism.
>
> What i'm asking for is that because the syscall point steps taken by Will look
> very promising so it would be nice to do *that* in a slightly more flexible
> scheme that does not condemn it to be limited to the syscall boundary and such
> ...

What do you suggest here?

From my brief exploration of the ftrace/perf (and seccomp) code, I
don't see a clean way of integrating over the existing interfaces to
the ftrace framework (e.g., the global perf event pump seems to be a
mismatch), but I may be missing something obvious.   In my view,
implementing this nestled between the seccomp/ftrace world provides a
stepping stone forward without being too restrictive.  No matter how
we change security events in the future, system calls will always be
the first line of attack surface reduction.  It could just mean that,
in the long term, accessing the "security event filtering" framework
is done through another new interface with seccomp providing only a
targeted syscall filtering featureset that may one day be deprecated
(if that day ever comes).

If there's a clear way to cleanly expand this interface that I'm
missing, I'd love to know - thanks!
will

> Also, to answer you, do you say that by my argument someone should have stood
> up and said 'no' to the LSM mess that was introduced a couple of years ago and
> which caused so many problems:
>
>  - kernel inefficiencies and user-space overhead
>
>  - stalled security efforts
>
>  - infighting
>
>  - friction, fragmentation, overmodularization
>
>  - non-stackability
>
>  - security annoyances on the Linux desktop
>
>  - probably *less* Linux security
>
> and should have asked them to do something better designed instead?
>
> Thanks,
>
>        Ingo
>

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-14 20:57                                     ` Will Drewry
  0 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-05-14 20:57 UTC (permalink / raw)
  To: linux-arm-kernel

On Sat, May 14, 2011 at 2:30 AM, Ingo Molnar <mingo@elte.hu> wrote:
>
> * Eric Paris <eparis@redhat.com> wrote:
>
>> [dropping microblaze and roland]
>>
>> On Fri, 2011-05-13 at 14:10 +0200, Ingo Molnar wrote:
>> > * James Morris <jmorris@namei.org> wrote:
>>
>> > It is a simple and sensible security feature, agreed? It allows most code to
>> > run well and link to countless libraries - but no access to other files is
>> > allowed.
>>
>> It's simple enough and sounds reasonable, but you can read all the discussion
>> about AppArmour why many people don't really think it's the best. [...]
>
> I have to say most of the participants of the AppArmour flamefests were dead
> wrong, and it wasnt the AppArmour folks who were wrong ...
>
> The straight ASCII VFS namespace *makes sense*, and yes, the raw physical
> objects space that SELinux uses makes sense as well.
>
> And no, i do not subscribe to the dogma that it is not possible to secure the
> ASCII VFS namespace: it evidently is possible, if you know and handle the
> ambiguitites. It is also obviously true that the ASCII VFS namespaces we use
> every day are a *lot* more intuitive than the labeled physical objects space
> ...
>
> What all the security flamewars missed is the simple fact that being intuitive
> matters a *lot* not just to not annoy users, but also to broaden the effective
> number of security-conscious developers ...
>
>> > Unfortunately this audit callback cannot be used for my purposes, because
>> > the event is single-purpose for auditd and because it allows no feedback
>> > (no deny/accept discretion for the security policy).
>> >
>> > But if had this simple event there:
>> >
>> >     err = event_vfs_getname(result);
>>
>> Wow it sounds so easy.  Now lets keep extending your train of thought
>> until we can actually provide the security provided by SELinux.  What do
>> we end up with?  We end up with an event hook right next to every LSM
>> hook.  You know, the LSM hooks were placed where they are for a reason.
>> Because those were the locations inside the kernel where you actually
>> have information about the task doing an operation and the objects
>> (files, sockets, directories, other tasks, etc) they are doing an
>> operation on.
>>
>> Honestly all you are talking about it remaking the LSM with 2 sets of
>> hooks instead if 1.  Why? [...]
>
> Not at all. I am taking about using *one* set of events, to keep the intrusion
> at the lowest possible level.
>
> LSM could make use of them as well.
>
> Obviously for pragmatic reasons that might not be feasible initially.
>
>> [...]  It seems much easier that if you want the language of the filter
>> engine you would just make a new LSM that uses the filter engine for it's
>> policy language rather than the language created by SELinux or SMACK or name
>> your LSM implementation.
>
> Correct, that is what i suggested.
>
> Note that performance is a primary concern, so if certain filters are very
> popular then in practice this would come with support for a couple of 'built
> in' (pre-optimized) filters that the kernel can accelerate directly, so that we
> do not incure the cost of executing the filter preds for really common-sense
> security policies that almost everyone is using.
>
> I.e. in the end we'd *roughly* end up with the same performance and security as
> we are today (i mean, SELinux and the other LSMs did a nice job of collecting
> the things that apps should be careful about), but the crutial difference isnt
> just the advantages i menioned, but the fact that the *development model* of
> security modules would be a *lot* more extensible.
>
> So security efforts could move to a whole different level: they could move into
> key apps and they could integrate with the general mind-set of developers.
>
> At least Will as an application framework developer cares, so that hope is
> justified i think.
>
>> >  - unprivileged:  application-definable, allowing the embedding of security
>> >                   policy in *apps* as well, not just the system
>> >
>> >  - flexible:      can be added/removed runtime unprivileged, and cheaply so
>> >
>> >  - transparent:   does not impact executing code that meets the policy
>> >
>> >  - nestable:      it is inherited by child tasks and is fundamentally stackable,
>> >                   multiple policies will have the combined effect and they
>> >                   are transparent to each other. So if a child task within a
>> >                   sandbox adds *more* checks then those add to the already
>> >                   existing set of checks. We only narrow permissions, never
>> >                   extend them.
>> >
>> >  - generic:       allowing observation and (safe) control of security relevant
>> >                   parameters not just at the system call boundary but at other
>> >                   relevant places of kernel execution as well: which
>> >                   points/callbacks could also be used for other types of event
>> >                   extraction such as perf. It could even be shared with audit ...
>>
>> I'm not arguing that any of these things are bad things.  What you describe
>> is a new LSM that uses a discretionary access control model but with the
>> granularity and flexibility that has traditionally only existed in the
>> mandatory access control security modules previously implemented in the
>> kernel.
>>
>> I won't argue that's a bad idea, there's no reason in my mind that a process
>> shouldn't be allowed to control it's own access decisions in a more flexible
>> way than rwx bits.  Then again, I certainly don't see a reason that this
>> syscall hardening patch should be held up while a whole new concept in
>> computer security is contemplated...
>
> Note, i'm not actually asking for the moon, a pony and more.
>
> I fully submit that we are yet far away from being able to do a full LSM via
> this mechanism.
>
> What i'm asking for is that because the syscall point steps taken by Will look
> very promising so it would be nice to do *that* in a slightly more flexible
> scheme that does not condemn it to be limited to the syscall boundary and such
> ...

What do you suggest here?

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-13 19:35                       ` Arnd Bergmann
  (?)
@ 2011-05-14 20:58                         ` Will Drewry
  -1 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-05-14 20:58 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: linux-arm-kernel, linux-kernel, linux-mips, linux-sh,
	Peter Zijlstra, Frederic Weisbecker, Heiko Carstens,
	David Howells, Paul Mackerras, Eric Paris, H. Peter Anvin,
	sparclinux, Jiri Slaby, linux-s390, Russell King, x86, jmorris,
	Ingo Molnar, Benjamin Herrenschmidt, Ingo Molnar,
	Serge E. Hallyn, Peter Zijlstra, microblaze-uclinux,
	Steven Rostedt, Martin Schwidefsky, Thomas Gleixner, kees.cook,
	Roland McGrath, Michal Marek, Michal Simek, linuxppc-dev,
	Oleg Nesterov, Ralf Baechle, Paul Mundt, Tejun Heo, linux390,
	Andrew Morton, agl, David S. Miller

On Fri, May 13, 2011 at 2:35 PM, Arnd Bergmann <arnd@arndb.de> wrote:
> On Thursday 12 May 2011, Will Drewry wrote:
>> This change adds a new seccomp mode based on the work by
>> agl@chromium.org in [1]. This new mode, "filter mode", provides a hash
>> table of seccomp_filter objects.  When in the new mode (2), all system
>> calls are checked against the filters - first by system call number,
>> then by a filter string.  If an entry exists for a given system call and
>> all filter predicates evaluate to true, then the task may proceed.
>> Otherwise, the task is killed (as per seccomp_mode == 1).
>
> I've got a question about this: Do you expect the typical usage to disallow
> ioctl()? Given that ioctl alone is responsible for a huge number of exploits
> in various drivers, while certain ioctls are immensely useful (FIONREAD,
> FIOASYNC, ...), do you expect to extend the mechanism to filter specific
> ioctl commands in the future?

In many cases, I do expect ioctl's to be dropped, but it's totally up
to whoever is setting the filters.

As is, it can already help out: [even though an LSM, if available,
would be appropriate to define a fine-grained policy]

ioctl() is hooked by the ftrace syscalls infrastructure (via SYSCALL_DEFINE3):

  SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned
long, arg)

This means you can do:
  sprintf(filter, "cmd == %u || cmd == %u", FIOASYNC, FIONREAD);
  prctl(PR_SET_SECCOMP_FILTER, __NR_ioctl, filter);
  ...
  prctl(PR_SET_SECCOMP, 2, 0);
and then you'll be able to call ioctl on any fd with any argument but
limited to only the FIOASYNC and FIONREAD commands.

Depending on integration, it could even be limited to ioctl commands
that are appropriate to a known fd if the fd is opened prior to
entering seccomp mode 2. Alternatively, __NR_ioctl could be allowed
with a filter of "1" then narrowed through a later addition of
something like "(fd == %u && (cmd == %u || cmd == %u))" or something
along those lines.

Does that make sense?

In general, this interface won't need specific extensions for most
system call oriented filtering events.  ftrace events may be expanded
(to include more system calls), but that's behind the scenes.  Only
arguments subject to time-of-check-time-of-use attacks (data living in
userspace passed in by pointer) are not safe to use via this
interface.  In theory, that limitation could also be lifted in the
implementation without changing the ABI.

Thanks!
will

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-14 20:58                         ` Will Drewry
  0 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-05-14 20:58 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: linux-mips, linux-sh, Peter Zijlstra, Frederic Weisbecker,
	Heiko Carstens, Oleg Nesterov, David Howells, Paul Mackerras,
	Ralf Baechle, H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390,
	Russell King, x86, jmorris, Ingo Molnar, Roland McGrath,
	Ingo Molnar, Serge E. Hallyn, Peter Zijlstra, microblaze-uclinux,
	Steven Rostedt, Tejun Heo, Thomas Gleixner, kees.cook,
	linux-arm-kernel, Michal Marek, Michal Simek, agl, linux-kernel,
	Eric Paris, Paul Mundt, Martin Schwidefsky, linux390,
	Andrew Morton, linuxppc-dev, David S. Miller

On Fri, May 13, 2011 at 2:35 PM, Arnd Bergmann <arnd@arndb.de> wrote:
> On Thursday 12 May 2011, Will Drewry wrote:
>> This change adds a new seccomp mode based on the work by
>> agl@chromium.org in [1]. This new mode, "filter mode", provides a hash
>> table of seccomp_filter objects.  When in the new mode (2), all system
>> calls are checked against the filters - first by system call number,
>> then by a filter string.  If an entry exists for a given system call and
>> all filter predicates evaluate to true, then the task may proceed.
>> Otherwise, the task is killed (as per seccomp_mode == 1).
>
> I've got a question about this: Do you expect the typical usage to disallow
> ioctl()? Given that ioctl alone is responsible for a huge number of exploits
> in various drivers, while certain ioctls are immensely useful (FIONREAD,
> FIOASYNC, ...), do you expect to extend the mechanism to filter specific
> ioctl commands in the future?

In many cases, I do expect ioctl's to be dropped, but it's totally up
to whoever is setting the filters.

As is, it can already help out: [even though an LSM, if available,
would be appropriate to define a fine-grained policy]

ioctl() is hooked by the ftrace syscalls infrastructure (via SYSCALL_DEFINE3):

  SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned
long, arg)

This means you can do:
  sprintf(filter, "cmd == %u || cmd == %u", FIOASYNC, FIONREAD);
  prctl(PR_SET_SECCOMP_FILTER, __NR_ioctl, filter);
  ...
  prctl(PR_SET_SECCOMP, 2, 0);
and then you'll be able to call ioctl on any fd with any argument but
limited to only the FIOASYNC and FIONREAD commands.

Depending on integration, it could even be limited to ioctl commands
that are appropriate to a known fd if the fd is opened prior to
entering seccomp mode 2. Alternatively, __NR_ioctl could be allowed
with a filter of "1" then narrowed through a later addition of
something like "(fd == %u && (cmd == %u || cmd == %u))" or something
along those lines.

Does that make sense?

In general, this interface won't need specific extensions for most
system call oriented filtering events.  ftrace events may be expanded
(to include more system calls), but that's behind the scenes.  Only
arguments subject to time-of-check-time-of-use attacks (data living in
userspace passed in by pointer) are not safe to use via this
interface.  In theory, that limitation could also be lifted in the
implementation without changing the ABI.

Thanks!
will

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-14 20:58                         ` Will Drewry
  0 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-05-14 20:58 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, May 13, 2011 at 2:35 PM, Arnd Bergmann <arnd@arndb.de> wrote:
> On Thursday 12 May 2011, Will Drewry wrote:
>> This change adds a new seccomp mode based on the work by
>> agl at chromium.org in [1]. This new mode, "filter mode", provides a hash
>> table of seccomp_filter objects.  When in the new mode (2), all system
>> calls are checked against the filters - first by system call number,
>> then by a filter string.  If an entry exists for a given system call and
>> all filter predicates evaluate to true, then the task may proceed.
>> Otherwise, the task is killed (as per seccomp_mode == 1).
>
> I've got a question about this: Do you expect the typical usage to disallow
> ioctl()? Given that ioctl alone is responsible for a huge number of exploits
> in various drivers, while certain ioctls are immensely useful (FIONREAD,
> FIOASYNC, ...), do you expect to extend the mechanism to filter specific
> ioctl commands in the future?

In many cases, I do expect ioctl's to be dropped, but it's totally up
to whoever is setting the filters.

As is, it can already help out: [even though an LSM, if available,
would be appropriate to define a fine-grained policy]

ioctl() is hooked by the ftrace syscalls infrastructure (via SYSCALL_DEFINE3):

  SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned
long, arg)

This means you can do:
  sprintf(filter, "cmd == %u || cmd == %u", FIOASYNC, FIONREAD);
  prctl(PR_SET_SECCOMP_FILTER, __NR_ioctl, filter);
  ...
  prctl(PR_SET_SECCOMP, 2, 0);
and then you'll be able to call ioctl on any fd with any argument but
limited to only the FIOASYNC and FIONREAD commands.

Depending on integration, it could even be limited to ioctl commands
that are appropriate to a known fd if the fd is opened prior to
entering seccomp mode 2. Alternatively, __NR_ioctl could be allowed
with a filter of "1" then narrowed through a later addition of
something like "(fd == %u && (cmd == %u || cmd == %u))" or something
along those lines.

Does that make sense?

In general, this interface won't need specific extensions for most
system call oriented filtering events.  ftrace events may be expanded
(to include more system calls), but that's behind the scenes.  Only
arguments subject to time-of-check-time-of-use attacks (data living in
userspace passed in by pointer) are not safe to use via this
interface.  In theory, that limitation could also be lifted in the
implementation without changing the ABI.

Thanks!
will

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-14 20:58                         ` Will Drewry
  (?)
@ 2011-05-15  6:42                           ` Arnd Bergmann
  -1 siblings, 0 replies; 406+ messages in thread
From: Arnd Bergmann @ 2011-05-15  6:42 UTC (permalink / raw)
  To: Will Drewry
  Cc: linux-arm-kernel, linux-kernel, linux-mips, linux-sh,
	Peter Zijlstra, Frederic Weisbecker, Heiko Carstens,
	David Howells, Paul Mackerras, Eric Paris, H. Peter Anvin,
	sparclinux, Jiri Slaby, linux-s390, Russell King, x86, jmorris,
	Ingo Molnar, Benjamin Herrenschmidt, Ingo Molnar,
	Serge E. Hallyn, Peter Zijlstra, microblaze-uclinux,
	Steven Rostedt, Martin Schwidefsky, Thomas Gleixner, kees.cook,
	Roland McGrath, Michal Marek, Michal Simek, linuxppc-dev,
	Oleg Nesterov, Ralf Baechle, Paul Mundt, Tejun Heo, linux390,
	Andrew Morton, agl, David S. Miller

On Saturday 14 May 2011, Will Drewry wrote:
> Depending on integration, it could even be limited to ioctl commands
> that are appropriate to a known fd if the fd is opened prior to
> entering seccomp mode 2. Alternatively, __NR__ioctl could be allowed
> with a filter of "1" then narrowed through a later addition of
> something like "(fd == %u && (cmd == %u || cmd == %u))" or something
> along those lines.
> 
> Does that make sense?

Thanks for the explanation. This sounds like it's already doing all
we need.

	Arnd

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-15  6:42                           ` Arnd Bergmann
  0 siblings, 0 replies; 406+ messages in thread
From: Arnd Bergmann @ 2011-05-15  6:42 UTC (permalink / raw)
  To: Will Drewry
  Cc: linux-mips, linux-sh, Peter Zijlstra, Frederic Weisbecker,
	Heiko Carstens, Oleg Nesterov, David Howells, Paul Mackerras,
	Ralf Baechle, H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390,
	Russell King, x86, jmorris, Ingo Molnar, Roland McGrath,
	Ingo Molnar, Serge E. Hallyn, Peter Zijlstra, microblaze-uclinux,
	Steven Rostedt, Tejun Heo, Thomas Gleixner, kees.cook,
	linux-arm-kernel, Michal Marek, Michal Simek, agl, linux-kernel,
	Eric Paris, Paul Mundt, Martin Schwidefsky, linux390,
	Andrew Morton, linuxppc-dev, David S. Miller

On Saturday 14 May 2011, Will Drewry wrote:
> Depending on integration, it could even be limited to ioctl commands
> that are appropriate to a known fd if the fd is opened prior to
> entering seccomp mode 2. Alternatively, __NR__ioctl could be allowed
> with a filter of "1" then narrowed through a later addition of
> something like "(fd == %u && (cmd == %u || cmd == %u))" or something
> along those lines.
> 
> Does that make sense?

Thanks for the explanation. This sounds like it's already doing all
we need.

	Arnd

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-15  6:42                           ` Arnd Bergmann
  0 siblings, 0 replies; 406+ messages in thread
From: Arnd Bergmann @ 2011-05-15  6:42 UTC (permalink / raw)
  To: linux-arm-kernel

On Saturday 14 May 2011, Will Drewry wrote:
> Depending on integration, it could even be limited to ioctl commands
> that are appropriate to a known fd if the fd is opened prior to
> entering seccomp mode 2. Alternatively, __NR__ioctl could be allowed
> with a filter of "1" then narrowed through a later addition of
> something like "(fd == %u && (cmd == %u || cmd == %u))" or something
> along those lines.
> 
> Does that make sense?

Thanks for the explanation. This sounds like it's already doing all
we need.

	Arnd

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-13 12:10                               ` Ingo Molnar
  (?)
@ 2011-05-16  0:36                                 ` James Morris
  -1 siblings, 0 replies; 406+ messages in thread
From: James Morris @ 2011-05-16  0:36 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Will Drewry, linux-kernel, Steven Rostedt, Frederic Weisbecker,
	Eric Paris, kees.cook, agl, Peter Zijlstra, Serge E. Hallyn,
	Ingo Molnar, Andrew Morton, Tejun Heo, Michal Marek,
	Oleg Nesterov, Jiri Slaby, David Howells, Russell King,
	Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, Thomas Gleixner, H. Peter Anvin,
	x86, Peter Zijlstra, linux-arm-kernel, linux-mips, linuxppc-dev,
	linux-s390, linux-sh, sparclinux, Linus Torvalds

On Fri, 13 May 2011, Ingo Molnar wrote:

> Say i'm a user-space sandbox developer who wants to enforce that sandboxed code 
> should only be allowed to open files in /home/sandbox/, /lib/ and /usr/lib/.
> 
> It is a simple and sensible security feature, agreed? It allows most code to 
> run well and link to countless libraries - but no access to other files is 
> allowed.

Not really.

Firstly, what is the security goal of these restrictions?  Then, are the 
restrictions complete and unbypassable?

How do you reason about the behavior of the system as a whole?


> I argue that this is the LSM and audit subsystems designed right: in the long 
> run it could allow everything that LSM does at the moment - and so much more 
> ...

Now you're proposing a redesign of the security subsystem.  That's a 
significant undertaking.

In the meantime, we have a simple, well-defined enhancement to seccomp 
which will be very useful to current users in reducing their kernel attack 
surface.

We should merge that, and the security subsystem discussion can carry on 
separately.


- James
-- 
James Morris
<jmorris@namei.org>

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-16  0:36                                 ` James Morris
  0 siblings, 0 replies; 406+ messages in thread
From: James Morris @ 2011-05-16  0:36 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: linux-mips, linux-sh, Peter Zijlstra, Frederic Weisbecker,
	Heiko Carstens, Oleg Nesterov, David Howells, Paul Mackerras,
	Eric Paris, H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390,
	Russell King, x86, Linus Torvalds, Ingo Molnar, kees.cook,
	Serge E. Hallyn, Peter Zijlstra, Steven Rostedt, Tejun Heo,
	Thomas Gleixner, linux-arm-kernel, Michal Marek, Michal Simek,
	Will Drewry, linuxppc-dev, linux-kernel, Ralf Baechle,
	Paul Mundt, Martin Schwidefsky, linux390, Andrew Morton, agl,
	David S. Miller

On Fri, 13 May 2011, Ingo Molnar wrote:

> Say i'm a user-space sandbox developer who wants to enforce that sandboxed code 
> should only be allowed to open files in /home/sandbox/, /lib/ and /usr/lib/.
> 
> It is a simple and sensible security feature, agreed? It allows most code to 
> run well and link to countless libraries - but no access to other files is 
> allowed.

Not really.

Firstly, what is the security goal of these restrictions?  Then, are the 
restrictions complete and unbypassable?

How do you reason about the behavior of the system as a whole?


> I argue that this is the LSM and audit subsystems designed right: in the long 
> run it could allow everything that LSM does at the moment - and so much more 
> ...

Now you're proposing a redesign of the security subsystem.  That's a 
significant undertaking.

In the meantime, we have a simple, well-defined enhancement to seccomp 
which will be very useful to current users in reducing their kernel attack 
surface.

We should merge that, and the security subsystem discussion can carry on 
separately.


- James
-- 
James Morris
<jmorris@namei.org>

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-16  0:36                                 ` James Morris
  0 siblings, 0 replies; 406+ messages in thread
From: James Morris @ 2011-05-16  0:36 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, 13 May 2011, Ingo Molnar wrote:

> Say i'm a user-space sandbox developer who wants to enforce that sandboxed code 
> should only be allowed to open files in /home/sandbox/, /lib/ and /usr/lib/.
> 
> It is a simple and sensible security feature, agreed? It allows most code to 
> run well and link to countless libraries - but no access to other files is 
> allowed.

Not really.

Firstly, what is the security goal of these restrictions?  Then, are the 
restrictions complete and unbypassable?

How do you reason about the behavior of the system as a whole?


> I argue that this is the LSM and audit subsystems designed right: in the long 
> run it could allow everything that LSM does at the moment - and so much more 
> ...

Now you're proposing a redesign of the security subsystem.  That's a 
significant undertaking.

In the meantime, we have a simple, well-defined enhancement to seccomp 
which will be very useful to current users in reducing their kernel attack 
surface.

We should merge that, and the security subsystem discussion can carry on 
separately.


- James
-- 
James Morris
<jmorris@namei.org>

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-15  6:42                           ` Arnd Bergmann
  (?)
@ 2011-05-16 12:00                             ` Ingo Molnar
  -1 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-16 12:00 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Will Drewry, linux-arm-kernel, linux-kernel, linux-mips,
	linux-sh, Peter Zijlstra, Frederic Weisbecker, Heiko Carstens,
	David Howells, Paul Mackerras, Eric Paris, H. Peter Anvin,
	sparclinux, Jiri Slaby, linux-s390, Russell King, x86, jmorris,
	Ingo Molnar, Benjamin Herrenschmidt, Serge E. Hallyn,
	Peter Zijlstra, microblaze-uclinux, Steven Rostedt,
	Martin Schwidefsky, Thomas Gleixner, kees.cook, Roland McGrath,
	Michal Marek, Michal Simek, linuxppc-dev, Oleg Nesterov,
	Ralf Baechle, Paul Mundt, Tejun Heo, linux390, Andrew Morton,
	agl, David S. Miller


* Arnd Bergmann <arnd@arndb.de> wrote:

> On Saturday 14 May 2011, Will Drewry wrote:
> > Depending on integration, it could even be limited to ioctl commands
> > that are appropriate to a known fd if the fd is opened prior to
> > entering seccomp mode 2. Alternatively, __NR__ioctl could be allowed
> > with a filter of "1" then narrowed through a later addition of
> > something like "(fd == %u && (cmd == %u || cmd == %u))" or something
> > along those lines.
> > 
> > Does that make sense?
> 
> Thanks for the explanation. This sounds like it's already doing all
> we need.

One thing we could do more clearly here is to help keep the filter expressions 
symbolic - i.e. help resolve the various ioctl variants as well, not just the 
raw syscall parameter numbers.

But yes, access to the raw syscall parameters and the ability to filter them 
already gives us the ability to exclude/include specific ioctls in a rather 
flexible way.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-16 12:00                             ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-16 12:00 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: linux-mips, linux-sh, Peter Zijlstra, Frederic Weisbecker,
	Heiko Carstens, Oleg Nesterov, David Howells, Paul Mackerras,
	Ralf Baechle, H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390,
	Russell King, x86, jmorris, Ingo Molnar, Roland McGrath,
	kees.cook, Serge E. Hallyn, Peter Zijlstra, microblaze-uclinux,
	Steven Rostedt, Tejun Heo, Thomas Gleixner, linux-arm-kernel,
	Michal Marek, Michal Simek, Will Drewry, agl, linux-kernel,
	Eric Paris, Paul Mundt, Martin Schwidefsky, linux390,
	Andrew Morton, linuxppc-dev, David S. Miller


* Arnd Bergmann <arnd@arndb.de> wrote:

> On Saturday 14 May 2011, Will Drewry wrote:
> > Depending on integration, it could even be limited to ioctl commands
> > that are appropriate to a known fd if the fd is opened prior to
> > entering seccomp mode 2. Alternatively, __NR__ioctl could be allowed
> > with a filter of "1" then narrowed through a later addition of
> > something like "(fd == %u && (cmd == %u || cmd == %u))" or something
> > along those lines.
> > 
> > Does that make sense?
> 
> Thanks for the explanation. This sounds like it's already doing all
> we need.

One thing we could do more clearly here is to help keep the filter expressions 
symbolic - i.e. help resolve the various ioctl variants as well, not just the 
raw syscall parameter numbers.

But yes, access to the raw syscall parameters and the ability to filter them 
already gives us the ability to exclude/include specific ioctls in a rather 
flexible way.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-16 12:00                             ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-16 12:00 UTC (permalink / raw)
  To: linux-arm-kernel


* Arnd Bergmann <arnd@arndb.de> wrote:

> On Saturday 14 May 2011, Will Drewry wrote:
> > Depending on integration, it could even be limited to ioctl commands
> > that are appropriate to a known fd if the fd is opened prior to
> > entering seccomp mode 2. Alternatively, __NR__ioctl could be allowed
> > with a filter of "1" then narrowed through a later addition of
> > something like "(fd == %u && (cmd == %u || cmd == %u))" or something
> > along those lines.
> > 
> > Does that make sense?
> 
> Thanks for the explanation. This sounds like it's already doing all
> we need.

One thing we could do more clearly here is to help keep the filter expressions 
symbolic - i.e. help resolve the various ioctl variants as well, not just the 
raw syscall parameter numbers.

But yes, access to the raw syscall parameters and the ability to filter them 
already gives us the ability to exclude/include specific ioctls in a rather 
flexible way.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system callfiltering
  2011-05-13 15:29                                                 ` David Laight
  (?)
@ 2011-05-16 12:03                                                   ` Ingo Molnar
  -1 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-16 12:03 UTC (permalink / raw)
  To: David Laight
  Cc: Eric Paris, linux-mips, linux-sh, Peter Zijlstra,
	Frederic Weisbecker, Heiko Carstens, Oleg Nesterov,
	David Howells, Paul Mackerras, H. PeterAnvin, sparclinux,
	Jiri Slaby, linux-s390, Russell King, x86, James Morris,
	Linus Torvalds, Ingo Molnar, kees.cook, Serge E. Hallyn,
	Steven Rostedt, Tejun Heo, Thomas Gleixner, linux-arm-kernel,
	Michal Marek, Michal Simek, Will Drewry, linuxppc-dev,
	linux-kernel, Ralf Baechle, Paul Mundt, Martin Schwidefsky,
	linux390, Andrew Morton, agl, David S. Miller


* David Laight <David.Laight@ACULAB.COM> wrote:

> [...] unfortunately it worked by looking at the user-space buffers on system 
> call entry - and a multithreaded program can easily arrange to update them 
> after the initial check! [...]

Such problems of reliability/persistency of security checks are exactly one of 
my arguments why this should not be limited to the syscall boundary, if you 
read the example i have provided in this discussion.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system callfiltering
@ 2011-05-16 12:03                                                   ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-16 12:03 UTC (permalink / raw)
  To: David Laight
  Cc: linux-mips, linux-sh, Peter Zijlstra, Frederic Weisbecker,
	Heiko Carstens, linux-kernel, David Howells, Paul Mackerras,
	Ralf Baechle, H. PeterAnvin, sparclinux, Jiri Slaby, linux-s390,
	Russell King, x86, James Morris, Ingo Molnar, kees.cook,
	Serge E. Hallyn, Steven Rostedt, Martin Schwidefsky,
	Thomas Gleixner, agl, linux-arm-kernel, Michal Marek,
	Michal Simek, Will Drewry, linuxppc-dev, Oleg Nesterov,
	Eric Paris, Paul Mundt, Tejun Heo, linux390, Andrew Morton,
	Linus Torvalds, David S. Miller


* David Laight <David.Laight@ACULAB.COM> wrote:

> [...] unfortunately it worked by looking at the user-space buffers on system 
> call entry - and a multithreaded program can easily arrange to update them 
> after the initial check! [...]

Such problems of reliability/persistency of security checks are exactly one of 
my arguments why this should not be limited to the syscall boundary, if you 
read the example i have provided in this discussion.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system callfiltering
@ 2011-05-16 12:03                                                   ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-16 12:03 UTC (permalink / raw)
  To: linux-arm-kernel


* David Laight <David.Laight@ACULAB.COM> wrote:

> [...] unfortunately it worked by looking at the user-space buffers on system 
> call entry - and a multithreaded program can easily arrange to update them 
> after the initial check! [...]

Such problems of reliability/persistency of security checks are exactly one of 
my arguments why this should not be limited to the syscall boundary, if you 
read the example i have provided in this discussion.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-14 20:57                                     ` Will Drewry
  (?)
@ 2011-05-16 12:43                                       ` Ingo Molnar
  -1 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-16 12:43 UTC (permalink / raw)
  To: Will Drewry
  Cc: Eric Paris, James Morris, linux-kernel, Steven Rostedt,
	Frederic Weisbecker, kees.cook, agl, Peter Zijlstra,
	Serge E. Hallyn, Ingo Molnar, Andrew Morton, Tejun Heo,
	Michal Marek, Oleg Nesterov, Jiri Slaby, David Howells,
	Russell King, Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, Thomas Gleixner, H. Peter Anvin,
	x86, Peter Zijlstra, linux-arm-kernel, linux-mips, linuxppc-dev,
	linux-s390, linux-sh, sparclinux, Linus Torvalds


* Will Drewry <wad@chromium.org> wrote:

> > Note, i'm not actually asking for the moon, a pony and more.
> >
> > I fully submit that we are yet far away from being able to do a full LSM 
> > via this mechanism.
> >
> > What i'm asking for is that because the syscall point steps taken by Will 
> > look very promising so it would be nice to do *that* in a slightly more 
> > flexible scheme that does not condemn it to be limited to the syscall 
> > boundary and such ...
> 
> What do you suggest here?
> 
> From my brief exploration of the ftrace/perf (and seccomp) code, I don't see 
> a clean way of integrating over the existing interfaces to the ftrace 
> framework (e.g., the global perf event pump seems to be a mismatch), but I 
> may be missing something obvious. [...]

Well, there's no global perf event pump. Here we'd obviously want to use 
buffer-less events that do no output (pumping) whatsoever - i.e. just counting 
events with filters attached.

What i suggest is to:

 - share syscall events        # you are fine with that as your patch makes use of them
 - share the scripting engine  # you are fine with that as your patch makes use of it

 - share *any* other event to do_exit() a process at syscall exit time

 - share any other active event that kernel developers specifically enable 
   for active use to impact security-relevant execution even sooner than 
   syscall exit time - not just system calls

 - share the actual facility that manages (sets/gets) filters

So right now you have this structure for your new feature:

 Documentation/trace/seccomp_filter.txt |   75 +++++
 arch/x86/kernel/ldt.c                  |    5 
 arch/x86/kernel/tls.c                  |    7 
 fs/proc/array.c                        |   21 +
 fs/proc/base.c                         |   25 +
 include/linux/ftrace_event.h           |    9 
 include/linux/seccomp.h                |   98 +++++++
 include/linux/syscalls.h               |   54 ++--
 include/trace/syscall.h                |    6 
 kernel/Makefile                        |    1 
 kernel/fork.c                          |    8 
 kernel/perf_event.c                    |    7 
 kernel/seccomp.c                       |  144 ++++++++++-
 kernel/seccomp_filter.c                |  428 +++++++++++++++++++++++++++++++++
 kernel/sys.c                           |   11 
 kernel/trace/Kconfig                   |   10 
 kernel/trace/trace_events_filter.c     |   60 ++--
 kernel/trace/trace_syscalls.c          |   96 ++++++-
 18 files changed, 986 insertions(+), 79 deletions(-)

Firstly, one specific problem i can see is that kernel/seccomp_filter.c 
hardcodes to the system call boundary. Which is fine for a prototype 
implementation (and obviously fine for something that builds upon seccomp) but 
not fine in terms of a flexible Linux security feature :-)

You have hardcoded these syscall assumptions via:

 struct seccomp_filter {
        struct list_head list;
        struct rcu_head rcu;
        int syscall_nr;
        struct syscall_metadata *data;
        struct event_filter *event_filter;
 };

Which comes just because you chose to enumerate only syscall events - instead 
of enumerating all events.

Instead of that please bind to all events instead - syscalls are just one of 
the many events we have. Type 'perf list' and see how many event types we have, 
and quite a few could be enabled for 'active feedback' sandboxing as well.

Secondly, and this is really a variant of the first problem you have, the way 
you process event filter 'failures' is pretty limited. You utilize the regular 
seccomp method which works by calling into __secure_computing() and silently 
accepting syscalls or generating a hard do_exit() on even the slightest of 
filter failures.

Instead of that what we'd want to have is to have regular syscall events 
registered, *and* enabled for such active filtering purposes. The moment the 
filter hits such an 'active' event would set the TIF_NOTIFY_RESUME flag and 
some other attribute in the task and the kernel would do a do_exit() at the 
earliest of opportunities before calling the syscall or at the 
return-from-syscall point latest.

Note that no seccomp specific code would have to execute here, we can already 
generate events both at syscall entry and at syscall exit points, the only new 
bit we'd need is for the 'kill the task' [or abort the syscall] policy action.

This is *far* more generic yet still yields the same short-term end result as far 
as your sandboxing is concerned.

What we'd need for this is a way to mark existing TRACE_EVENT()s as 'active'. 
We'd mark all syscall events as 'active' straight away.

[ Detail: these trace events return a return code, which the calling code can 
  use, that way event return values could be used sooner than syscall exit 
  points. IRQ code could make use of it as well, so for example this way we 
  could filter based on early packet inspection, still in the IRQ code. ]

Then what your feature would do is to simply open up the events you are 
interested in (just as counting events, without any buffers), and set 'active' 
filters in them to 'deny/allow' specific combinations of parameters only.

Thirdly, you have added a separate facility to set/query filters via /proc, 
this lives mostly in /proc/<pid>/seccomp_filter. This seccomp specificity would 
go away altogether: it should be possible to query existing events and filters 
even outside of the seccomp facility, so if you want something explicit here 
beyond the current event ioctl() to set filters please improve the generic 
facility - which right now is poorer than what you have implemented so it's in 
good need to be improved! :-)

All in all, such a solution would enable us to reuse code in a much deeper 
fashion while still providing all the functionality you are interested in. 
These are all short-term concerns, not pie-in-the-sky future ideal LSM 
concerns.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-16 12:43                                       ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-16 12:43 UTC (permalink / raw)
  To: Will Drewry
  Cc: linux-mips, linux-sh, Peter Zijlstra, Frederic Weisbecker,
	Heiko Carstens, Oleg Nesterov, David Howells, Paul Mackerras,
	Eric Paris, H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390,
	Russell King, x86, James Morris, Linus Torvalds, Ingo Molnar,
	kees.cook, Serge E. Hallyn, Peter Zijlstra, Steven Rostedt,
	Tejun Heo, Thomas Gleixner, linux-arm-kernel, Michal Marek,
	Michal Simek, linuxppc-dev, linux-kernel, Ralf Baechle,
	Paul Mundt, Martin Schwidefsky, linux390, Andrew Morton, agl,
	David S. Miller


* Will Drewry <wad@chromium.org> wrote:

> > Note, i'm not actually asking for the moon, a pony and more.
> >
> > I fully submit that we are yet far away from being able to do a full LSM 
> > via this mechanism.
> >
> > What i'm asking for is that because the syscall point steps taken by Will 
> > look very promising so it would be nice to do *that* in a slightly more 
> > flexible scheme that does not condemn it to be limited to the syscall 
> > boundary and such ...
> 
> What do you suggest here?
> 
> From my brief exploration of the ftrace/perf (and seccomp) code, I don't see 
> a clean way of integrating over the existing interfaces to the ftrace 
> framework (e.g., the global perf event pump seems to be a mismatch), but I 
> may be missing something obvious. [...]

Well, there's no global perf event pump. Here we'd obviously want to use 
buffer-less events that do no output (pumping) whatsoever - i.e. just counting 
events with filters attached.

What i suggest is to:

 - share syscall events        # you are fine with that as your patch makes use of them
 - share the scripting engine  # you are fine with that as your patch makes use of it

 - share *any* other event to do_exit() a process at syscall exit time

 - share any other active event that kernel developers specifically enable 
   for active use to impact security-relevant execution even sooner than 
   syscall exit time - not just system calls

 - share the actual facility that manages (sets/gets) filters

So right now you have this structure for your new feature:

 Documentation/trace/seccomp_filter.txt |   75 +++++
 arch/x86/kernel/ldt.c                  |    5 
 arch/x86/kernel/tls.c                  |    7 
 fs/proc/array.c                        |   21 +
 fs/proc/base.c                         |   25 +
 include/linux/ftrace_event.h           |    9 
 include/linux/seccomp.h                |   98 +++++++
 include/linux/syscalls.h               |   54 ++--
 include/trace/syscall.h                |    6 
 kernel/Makefile                        |    1 
 kernel/fork.c                          |    8 
 kernel/perf_event.c                    |    7 
 kernel/seccomp.c                       |  144 ++++++++++-
 kernel/seccomp_filter.c                |  428 +++++++++++++++++++++++++++++++++
 kernel/sys.c                           |   11 
 kernel/trace/Kconfig                   |   10 
 kernel/trace/trace_events_filter.c     |   60 ++--
 kernel/trace/trace_syscalls.c          |   96 ++++++-
 18 files changed, 986 insertions(+), 79 deletions(-)

Firstly, one specific problem i can see is that kernel/seccomp_filter.c 
hardcodes to the system call boundary. Which is fine for a prototype 
implementation (and obviously fine for something that builds upon seccomp) but 
not fine in terms of a flexible Linux security feature :-)

You have hardcoded these syscall assumptions via:

 struct seccomp_filter {
        struct list_head list;
        struct rcu_head rcu;
        int syscall_nr;
        struct syscall_metadata *data;
        struct event_filter *event_filter;
 };

Which comes just because you chose to enumerate only syscall events - instead 
of enumerating all events.

Instead of that, please bind to all events - syscalls are just one of 
the many events we have. Type 'perf list' and see how many event types we have, 
and quite a few could be enabled for 'active feedback' sandboxing as well.

Secondly, and this is really a variant of the first problem you have, the way 
you process event filter 'failures' is pretty limited. You utilize the regular 
seccomp method which works by calling into __secure_computing() and silently 
accepting syscalls or generating a hard do_exit() on even the slightest of 
filter failures.

Instead of that what we'd want to have is to have regular syscall events 
registered, *and* enabled for such active filtering purposes. The moment the 
filter hits such an 'active' event would set the TIF_NOTIFY_RESUME flag and 
some other attribute in the task and the kernel would do a do_exit() at the 
earliest of opportunities before calling the syscall or at the 
return-from-syscall point latest.

Note that no seccomp specific code would have to execute here, we can already 
generate events both at syscall entry and at syscall exit points, the only new 
bit we'd need is for the 'kill the task' [or abort the syscall] policy action.

This is *far* more generic and still yields the same short-term end result as far 
as your sandboxing is concerned.

What we'd need for this is a way to mark existing TRACE_EVENT()s as 'active'. 
We'd mark all syscall events as 'active' straight away.

[ Detail: these trace events return a return code, which the calling code can 
  use, that way event return values could be used sooner than syscall exit 
  points. IRQ code could make use of it as well, so for example this way we 
  could filter based on early packet inspection, still in the IRQ code. ]

Then what your feature would do is to simply open up the events you are 
interested in (just as counting events, without any buffers), and set 'active' 
filters in them to 'deny/allow' specific combinations of parameters only.

Thirdly, you have added a separate facility to set/query filters via /proc, 
this lives mostly in /proc/<pid>/seccomp_filter. This seccomp specificity would 
go away altogether: it should be possible to query existing events and filters 
even outside of the seccomp facility, so if you want something explicit here 
beyond the current event ioctl() to set filters please improve the generic 
facility - which right now is poorer than what you have implemented so it's in 
good need to be improved! :-)

All in all, such a solution would enable us to reuse code in a much deeper 
fashion while still providing all the functionality you are interested in. 
These are all short-term concerns, not pie-in-the-sky future ideal LSM 
concerns.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-16 12:43                                       ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-16 12:43 UTC (permalink / raw)
  To: linux-arm-kernel


* Will Drewry <wad@chromium.org> wrote:

> > Note, i'm not actually asking for the moon, a pony and more.
> >
> > I fully submit that we are yet far away from being able to do a full LSM 
> > via this mechanism.
> >
> > What i'm asking for is that because the syscall point steps taken by Will 
> > look very promising so it would be nice to do *that* in a slightly more 
> > flexible scheme that does not condemn it to be limited to the syscall 
> > boundary and such ...
> 
> What do you suggest here?
> 
> From my brief exploration of the ftrace/perf (and seccomp) code, I don't see 
> a clean way of integrating over the existing interfaces to the ftrace 
> framework (e.g., the global perf event pump seems to be a mismatch), but I 
> may be missing something obvious. [...]

Well, there's no global perf event pump. Here we'd obviously want to use 
buffer-less events that do no output (pumping) whatsoever - i.e. just counting 
events with filters attached.

What i suggest is to:

 - share syscall events        # you are fine with that as your patch makes use of them
 - share the scripting engine  # you are fine with that as your patch makes use of it

 - share *any* other event to do_exit() a process at syscall exit time

 - share any other active event that kernel developers specifically enable 
   for active use to impact security-relevant execution even sooner than 
   syscall exit time - not just system calls

 - share the actual facility that manages (sets/gets) filters

So right now you have this structure for your new feature:

 Documentation/trace/seccomp_filter.txt |   75 +++++
 arch/x86/kernel/ldt.c                  |    5 
 arch/x86/kernel/tls.c                  |    7 
 fs/proc/array.c                        |   21 +
 fs/proc/base.c                         |   25 +
 include/linux/ftrace_event.h           |    9 
 include/linux/seccomp.h                |   98 +++++++
 include/linux/syscalls.h               |   54 ++--
 include/trace/syscall.h                |    6 
 kernel/Makefile                        |    1 
 kernel/fork.c                          |    8 
 kernel/perf_event.c                    |    7 
 kernel/seccomp.c                       |  144 ++++++++++-
 kernel/seccomp_filter.c                |  428 +++++++++++++++++++++++++++++++++
 kernel/sys.c                           |   11 
 kernel/trace/Kconfig                   |   10 
 kernel/trace/trace_events_filter.c     |   60 ++--
 kernel/trace/trace_syscalls.c          |   96 ++++++-
 18 files changed, 986 insertions(+), 79 deletions(-)

Firstly, one specific problem i can see is that kernel/seccomp_filter.c 
hardcodes to the system call boundary. Which is fine for a prototype 
implementation (and obviously fine for something that builds upon seccomp) but 
not fine in terms of a flexible Linux security feature :-)

You have hardcoded these syscall assumptions via:

 struct seccomp_filter {
        struct list_head list;
        struct rcu_head rcu;
        int syscall_nr;
        struct syscall_metadata *data;
        struct event_filter *event_filter;
 };

Which comes just because you chose to enumerate only syscall events - instead 
of enumerating all events.

Instead of that, please bind to all events - syscalls are just one of 
the many events we have. Type 'perf list' and see how many event types we have, 
and quite a few could be enabled for 'active feedback' sandboxing as well.

Secondly, and this is really a variant of the first problem you have, the way 
you process event filter 'failures' is pretty limited. You utilize the regular 
seccomp method which works by calling into __secure_computing() and silently 
accepting syscalls or generating a hard do_exit() on even the slightest of 
filter failures.

Instead of that what we'd want to have is to have regular syscall events 
registered, *and* enabled for such active filtering purposes. The moment the 
filter hits such an 'active' event would set the TIF_NOTIFY_RESUME flag and 
some other attribute in the task and the kernel would do a do_exit() at the 
earliest of opportunities before calling the syscall or at the 
return-from-syscall point latest.

Note that no seccomp specific code would have to execute here, we can already 
generate events both at syscall entry and at syscall exit points, the only new 
bit we'd need is for the 'kill the task' [or abort the syscall] policy action.

This is *far* more generic and still yields the same short-term end result as far 
as your sandboxing is concerned.

What we'd need for this is a way to mark existing TRACE_EVENT()s as 'active'. 
We'd mark all syscall events as 'active' straight away.

[ Detail: these trace events return a return code, which the calling code can 
  use, that way event return values could be used sooner than syscall exit 
  points. IRQ code could make use of it as well, so for example this way we 
  could filter based on early packet inspection, still in the IRQ code. ]

Then what your feature would do is to simply open up the events you are 
interested in (just as counting events, without any buffers), and set 'active' 
filters in them to 'deny/allow' specific combinations of parameters only.

Thirdly, you have added a separate facility to set/query filters via /proc, 
this lives mostly in /proc/<pid>/seccomp_filter. This seccomp specificity would 
go away altogether: it should be possible to query existing events and filters 
even outside of the seccomp facility, so if you want something explicit here 
beyond the current event ioctl() to set filters please improve the generic 
facility - which right now is poorer than what you have implemented so it's in 
good need to be improved! :-)

All in all, such a solution would enable us to reuse code in a much deeper 
fashion while still providing all the functionality you are interested in. 
These are all short-term concerns, not pie-in-the-sky future ideal LSM 
concerns.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-12 16:26                             ` Will Drewry
  (?)
@ 2011-05-16 12:55                               ` Ingo Molnar
  -1 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-16 12:55 UTC (permalink / raw)
  To: Will Drewry
  Cc: James Morris, linux-kernel, Steven Rostedt, Frederic Weisbecker,
	Eric Paris, kees.cook, agl, Peter Zijlstra, Serge E. Hallyn,
	Ingo Molnar, Andrew Morton, Tejun Heo, Michal Marek,
	Oleg Nesterov, Roland McGrath, Jiri Slaby, David Howells,
	Russell King, Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, Thomas Gleixner, H. Peter Anvin,
	x86, Peter Zijlstra, linux-arm-kernel, microblaze-uclinux,
	linux-mips, linuxppc-dev, linux-s390, linux-sh, sparclinux,
	Linus Torvalds


* Will Drewry <wad@chromium.org> wrote:

> I agree with you on many of these points!  However, I don't think that the 
> views around LSMs, perf/ftrace infrastructure, or the current seccomp 
> filtering implementation are necessarily in conflict.  Here is my 
> understanding of how the different worlds fit together and where I see this 
> patchset living, along with where I could see future work going.  Perhaps I'm 
> being a trifle naive, but here goes anyway:
> 
> 1. LSMs provide a global mechanism for hooking "security relevant"
> events at a point where all the incoming user-sourced data has been
> preprocessed and moved into userspace.  The hooks are called every
> time one of those boundaries are crossed.

> 2. Perf and the ftrace infrastructure provide global function tracing
> and system call hooks with direct access to the caller's registers
> (and memory).

No, perf events are not just global but per task as well. Nor are events 
limited to 'tracing' (generating a flow of events into a trace buffer) - they 
can just be themselves as well and count and generate callbacks.

The generic NMI watchdog uses that kind of event model for example, see 
kernel/watchdog.c and how it makes use of struct perf_event abstractions to do 
per CPU events (with no buffers), or how kernel/hw_breakpoint.c uses it for per 
task events and integrates it with the ptrace hw-breakpoints code.

Ideally Peter's one particular suggestion is right IMO and we'd want to be able 
for a perf_event to just be a list of callbacks, attached to a task and barely 
more than a discoverable, named notifier chain in its slimmest form.

In practice it's fatter than that right now, but we should definitely factor 
out that aspect of it more clearly, both code-wise and API-wise. 
kernel/watchdog.c and kernel/hw_breakpoint.c show that such factoring out is 
possible and desirable.

> 3. seccomp (as it exists today) provides a global system call entry
> hook point with a binary per-process decision about whether to provide
> "secure computing" behavior.
> 
> When I boil that down to abstractions, I see:
> A. Globally scoped: LSMs, ftrace/perf
> B. Locally/process scoped: seccomp

Ok, i see where you got the idea that you needed to cut your surface of 
abstraction at the filter engine / syscall enumeration level - i think you were 
thinking of it in the ftrace model of tracepoints, not in the perf model of 
events.

No, events are generic and as such per task as well, not just global.

I've replied to your other mail with more specific suggestions of how we could 
provide your feature using abstractions that share code more widely. Talking 
specifics will i hope help move the discussion forward! :-)

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-16 12:55                               ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-16 12:55 UTC (permalink / raw)
  To: Will Drewry
  Cc: linux-mips, linux-sh, Peter Zijlstra, Frederic Weisbecker,
	Heiko Carstens, Oleg Nesterov, David Howells, Paul Mackerras,
	Eric Paris, H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390,
	Russell King, x86, James Morris, Linus Torvalds, Ingo Molnar,
	linux-arm-kernel, kees.cook, Serge E. Hallyn, Peter Zijlstra,
	microblaze-uclinux, Steven Rostedt, Martin Schwidefsky,
	Thomas Gleixner, Roland McGrath, Michal Marek, Michal Simek,
	linuxppc-dev, linux-kernel, Ralf Baechle, Paul Mundt, Tejun Heo,
	linux390, Andrew Morton, agl, David S. Miller


* Will Drewry <wad@chromium.org> wrote:

> I agree with you on many of these points!  However, I don't think that the 
> views around LSMs, perf/ftrace infrastructure, or the current seccomp 
> filtering implementation are necessarily in conflict.  Here is my 
> understanding of how the different worlds fit together and where I see this 
> patchset living, along with where I could see future work going.  Perhaps I'm 
> being a trifle naive, but here goes anyway:
> 
> 1. LSMs provide a global mechanism for hooking "security relevant"
> events at a point where all the incoming user-sourced data has been
> preprocessed and moved into userspace.  The hooks are called every
> time one of those boundaries are crossed.

> 2. Perf and the ftrace infrastructure provide global function tracing
> and system call hooks with direct access to the caller's registers
> (and memory).

No, perf events are not just global but per task as well. Nor are events 
limited to 'tracing' (generating a flow of events into a trace buffer) - they 
can just be themselves as well and count and generate callbacks.

The generic NMI watchdog uses that kind of event model for example, see 
kernel/watchdog.c and how it makes use of struct perf_event abstractions to do 
per CPU events (with no buffers), or how kernel/hw_breakpoint.c uses it for per 
task events and integrates it with the ptrace hw-breakpoints code.

Ideally Peter's one particular suggestion is right IMO and we'd want to be able 
for a perf_event to just be a list of callbacks, attached to a task and barely 
more than a discoverable, named notifier chain in its slimmest form.

In practice it's fatter than that right now, but we should definitely factor 
out that aspect of it more clearly, both code-wise and API-wise. 
kernel/watchdog.c and kernel/hw_breakpoint.c show that such factoring out is 
possible and desirable.

> 3. seccomp (as it exists today) provides a global system call entry
> hook point with a binary per-process decision about whether to provide
> "secure computing" behavior.
> 
> When I boil that down to abstractions, I see:
> A. Globally scoped: LSMs, ftrace/perf
> B. Locally/process scoped: seccomp

Ok, i see where you got the idea that you needed to cut your surface of 
abstraction at the filter engine / syscall enumeration level - i think you were 
thinking of it in the ftrace model of tracepoints, not in the perf model of 
events.

No, events are generic and as such per task as well, not just global.

I've replied to your other mail with more specific suggestions of how we could 
provide your feature using abstractions that share code more widely. Talking 
specifics will i hope help move the discussion forward! :-)

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-16 12:55                               ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-16 12:55 UTC (permalink / raw)
  To: linux-arm-kernel


* Will Drewry <wad@chromium.org> wrote:

> I agree with you on many of these points!  However, I don't think that the 
> views around LSMs, perf/ftrace infrastructure, or the current seccomp 
> filtering implementation are necessarily in conflict.  Here is my 
> understanding of how the different worlds fit together and where I see this 
> patchset living, along with where I could see future work going.  Perhaps I'm 
> being a trifle naive, but here goes anyway:
> 
> 1. LSMs provide a global mechanism for hooking "security relevant"
> events at a point where all the incoming user-sourced data has been
> preprocessed and moved into userspace.  The hooks are called every
> time one of those boundaries are crossed.

> 2. Perf and the ftrace infrastructure provide global function tracing
> and system call hooks with direct access to the caller's registers
> (and memory).

No, perf events are not just global but per task as well. Nor are events 
limited to 'tracing' (generating a flow of events into a trace buffer) - they 
can just be themselves as well and count and generate callbacks.

The generic NMI watchdog uses that kind of event model for example, see 
kernel/watchdog.c and how it makes use of struct perf_event abstractions to do 
per CPU events (with no buffers), or how kernel/hw_breakpoint.c uses it for per 
task events and integrates it with the ptrace hw-breakpoints code.

Ideally Peter's one particular suggestion is right IMO and we'd want to be able 
for a perf_event to just be a list of callbacks, attached to a task and barely 
more than a discoverable, named notifier chain in its slimmest form.

In practice it's fatter than that right now, but we should definitely factor 
out that aspect of it more clearly, both code-wise and API-wise. 
kernel/watchdog.c and kernel/hw_breakpoint.c show that such factoring out is 
possible and desirable.

> 3. seccomp (as it exists today) provides a global system call entry
> hook point with a binary per-process decision about whether to provide
> "secure computing" behavior.
> 
> When I boil that down to abstractions, I see:
> A. Globally scoped: LSMs, ftrace/perf
> B. Locally/process scoped: seccomp

Ok, i see where you got the idea that you needed to cut your surface of 
abstraction at the filter engine / syscall enumeration level - i think you were 
thinking of it in the ftrace model of tracepoints, not in the perf model of 
events.

No, events are generic and as such per task as well, not just global.

I've replied to your other mail with more specific suggestions of how we could 
provide your feature using abstractions that share code more widely. Talking 
specifics will i hope help move the discussion forward! :-)

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-16 12:55                               ` Ingo Molnar
  (?)
@ 2011-05-16 14:42                                 ` Will Drewry
  -1 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-05-16 14:42 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: James Morris, linux-kernel, Steven Rostedt, Frederic Weisbecker,
	Eric Paris, kees.cook, agl, Peter Zijlstra, Serge E. Hallyn,
	Ingo Molnar, Andrew Morton, Tejun Heo, Michal Marek,
	Oleg Nesterov, Roland McGrath, Jiri Slaby, David Howells,
	Russell King, Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, Thomas Gleixner, H. Peter Anvin,
	x86, Peter Zijlstra, linux-arm-kernel, microblaze-uclinux,
	linux-mips, linuxppc-dev, linux-s390, linux-sh, sparclinux,
	Linus Torvalds

On Mon, May 16, 2011 at 7:55 AM, Ingo Molnar <mingo@elte.hu> wrote:
>
> * Will Drewry <wad@chromium.org> wrote:
>
>> I agree with you on many of these points!  However, I don't think that the
>> views around LSMs, perf/ftrace infrastructure, or the current seccomp
>> filtering implementation are necessarily in conflict.  Here is my
>> understanding of how the different worlds fit together and where I see this
>> patchset living, along with where I could see future work going.  Perhaps I'm
>> being a trifle naive, but here goes anyway:
>>
>> 1. LSMs provide a global mechanism for hooking "security relevant"
>> events at a point where all the incoming user-sourced data has been
>> preprocessed and moved into userspace.  The hooks are called every
>> time one of those boundaries are crossed.
>
>> 2. Perf and the ftrace infrastructure provide global function tracing
>> and system call hooks with direct access to the caller's registers
>> (and memory).
>
> No, perf events are not just global but per task as well. Nor are events
> limited to 'tracing' (generating a flow of events into a trace buffer) - they
> can just be themselves as well and count and generate callbacks.

I was looking at the perf_sysenter_enable() call, but clearly there is
more going on :)

> The generic NMI watchdog uses that kind of event model for example, see
> kernel/watchdog.c and how it makes use of struct perf_event abstractions to do
> per CPU events (with no buffers), or how kernel/hw_breakpoint.c uses it for per
> task events and integrates it with the ptrace hw-breakpoints code.
>
> Ideally Peter's one particular suggestion is right IMO and we'd want to be able
> for a perf_event to just be a list of callbacks, attached to a task and barely
> more than a discoverable, named notifier chain in its slimmest form.
>
> In practice it's fatter than that right now, but we should definitely factor
> out that aspect of it more clearly, both code-wise and API-wise.
> kernel/watchdog.c and kernel/hw_breakpoint.c shows that such factoring out is
> possible and desirable.
>
>> 3. seccomp (as it exists today) provides a global system call entry
>> hook point with a binary per-process decision about whether to provide
>> "secure computing" behavior.
>>
>> When I boil that down to abstractions, I see:
>> A. Globally scoped: LSMs, ftrace/perf
>> B. Locally/process scoped: seccomp
>
> Ok, i see where you got the idea that you needed to cut your surface of
> abstraction at the filter engine / syscall enumeration level - i think you were
> thinking of it in the ftrace model of tracepoints, not in the perf model of
> events.
>
> No, events are generic and as such per task as well, not just global.
>
> I've replied to your other mail with more specific suggestions of how we could
> provide your feature using abstractions that share code more widely. Talking
> specifics will i hope help move the discussion forward! :-)

Agreed.  I'll digest both the watchdog code as well as your other
comments and follow up when I have a fuller picture in my head.

(I have a few initial comments I'll post in response to your other mail.)

Thanks!
will

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-16 14:42                                 ` Will Drewry
  0 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-05-16 14:42 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: linux-mips, linux-sh, Peter Zijlstra, Frederic Weisbecker,
	Heiko Carstens, Oleg Nesterov, David Howells, Paul Mackerras,
	Eric Paris, H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390,
	Russell King, x86, James Morris, Linus Torvalds, Ingo Molnar,
	linux-arm-kernel, kees.cook, Serge E. Hallyn, Peter Zijlstra,
	microblaze-uclinux, Steven Rostedt, Martin Schwidefsky,
	Thomas Gleixner, Roland McGrath, Michal Marek, Michal Simek,
	linuxppc-dev, linux-kernel, Ralf Baechle, Paul Mundt, Tejun Heo,
	linux390, Andrew Morton, agl, David S. Miller

On Mon, May 16, 2011 at 7:55 AM, Ingo Molnar <mingo@elte.hu> wrote:
>
> * Will Drewry <wad@chromium.org> wrote:
>
>> I agree with you on many of these points!  However, I don't think that the
>> views around LSMs, perf/ftrace infrastructure, or the current seccomp
>> filtering implementation are necessarily in conflict.  Here is my
>> understanding of how the different worlds fit together and where I see this
>> patchset living, along with where I could see future work going.  Perhaps I'm
>> being a trifle naive, but here goes anyway:
>>
>> 1. LSMs provide a global mechanism for hooking "security relevant"
>> events at a point where all the incoming user-sourced data has been
>> preprocessed and moved into userspace.  The hooks are called every
>> time one of those boundaries are crossed.
>
>> 2. Perf and the ftrace infrastructure provide global function tracing
>> and system call hooks with direct access to the caller's registers
>> (and memory).
>
> No, perf events are not just global but per task as well. Nor are events
> limited to 'tracing' (generating a flow of events into a trace buffer) - they
> can just be themselves as well and count and generate callbacks.

I was looking at the perf_sysenter_enable() call, but clearly there is
more going on :)

> The generic NMI watchdog uses that kind of event model for example, see
> kernel/watchdog.c and how it makes use of struct perf_event abstractions to do
> per CPU events (with no buffers), or how kernel/hw_breakpoint.c uses it for per
> task events and integrates it with the ptrace hw-breakpoints code.
>
> Ideally Peter's one particular suggestion is right IMO and we'd want to be able
> for a perf_event to just be a list of callbacks, attached to a task and barely
> more than a discoverable, named notifier chain in its slimmest form.
>
> In practice it's fatter than that right now, but we should definitely factor
> out that aspect of it more clearly, both code-wise and API-wise.
> kernel/watchdog.c and kernel/hw_breakpoint.c shows that such factoring out is
> possible and desirable.
>
>> 3. seccomp (as it exists today) provides a global system call entry
>> hook point with a binary per-process decision about whether to provide
>> "secure computing" behavior.
>>
>> When I boil that down to abstractions, I see:
>> A. Globally scoped: LSMs, ftrace/perf
>> B. Locally/process scoped: seccomp
>
> Ok, i see where you got the idea that you needed to cut your surface of
> abstraction at the filter engine / syscall enumeration level - i think you were
> thinking of it in the ftrace model of tracepoints, not in the perf model of
> events.
>
> No, events are generic and as such per task as well, not just global.
>
> I've replied to your other mail with more specific suggestions of how we could
> provide your feature using abstractions that share code more widely. Talking
> specifics will i hope help move the discussion forward! :-)

Agreed.  I'll digest both the watchdog code as well as your other
comments and follow up when I have a fuller picture in my head.

(I have a few initial comments I'll post in response to your other mail.)

Thanks!
will

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-16 14:42                                 ` Will Drewry
  0 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-05-16 14:42 UTC (permalink / raw)
  To: linux-arm-kernel

On Mon, May 16, 2011 at 7:55 AM, Ingo Molnar <mingo@elte.hu> wrote:
>
> * Will Drewry <wad@chromium.org> wrote:
>
>> I agree with you on many of these points!  However, I don't think that the
>> views around LSMs, perf/ftrace infrastructure, or the current seccomp
>> filtering implementation are necessarily in conflict.  Here is my
>> understanding of how the different worlds fit together and where I see this
>> patchset living, along with where I could see future work going.  Perhaps I'm
>> being a trifle naive, but here goes anyway:
>>
>> 1. LSMs provide a global mechanism for hooking "security relevant"
>> events at a point where all the incoming user-sourced data has been
>> preprocessed and moved into userspace.  The hooks are called every
>> time one of those boundaries are crossed.
>
>> 2. Perf and the ftrace infrastructure provide global function tracing
>> and system call hooks with direct access to the caller's registers
>> (and memory).
>
> No, perf events are not just global but per task as well. Nor are events
> limited to 'tracing' (generating a flow of events into a trace buffer) - they
> can just be themselves as well and count and generate callbacks.

I was looking at the perf_sysenter_enable() call, but clearly there is
more going on :)

> The generic NMI watchdog uses that kind of event model for example, see
> kernel/watchdog.c and how it makes use of struct perf_event abstractions to do
> per CPU events (with no buffers), or how kernel/hw_breakpoint.c uses it for per
> task events and integrates it with the ptrace hw-breakpoints code.
>
> Ideally Peter's one particular suggestion is right IMO and we'd want to be able
> for a perf_event to just be a list of callbacks, attached to a task and barely
> more than a discoverable, named notifier chain in its slimmest form.
>
> In practice it's fatter than that right now, but we should definitely factor
> out that aspect of it more clearly, both code-wise and API-wise.
> kernel/watchdog.c and kernel/hw_breakpoint.c shows that such factoring out is
> possible and desirable.
>
>> 3. seccomp (as it exists today) provides a global system call entry
>> hook point with a binary per-process decision about whether to provide
>> "secure computing" behavior.
>>
>> When I boil that down to abstractions, I see:
>> A. Globally scoped: LSMs, ftrace/perf
>> B. Locally/process scoped: seccomp
>
> Ok, i see where you got the idea that you needed to cut your surface of
> abstraction at the filter engine / syscall enumeration level - i think you were
> thinking of it in the ftrace model of tracepoints, not in the perf model of
> events.
>
> No, events are generic and as such per task as well, not just global.
>
> I've replied to your other mail with more specific suggestions of how we could
> provide your feature using abstractions that share code more widely. Talking
> specifics will i hope help move the discussion forward! :-)

Agreed.  I'll digest both the watchdog code as well as your other
comments and follow up when I have a fuller picture in my head.

(I have a few initial comments I'll post in response to your other mail.)

Thanks!
will

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-16  0:36                                 ` James Morris
  (?)
@ 2011-05-16 15:08                                   ` Ingo Molnar
  -1 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-16 15:08 UTC (permalink / raw)
  To: James Morris
  Cc: Will Drewry, linux-kernel, Steven Rostedt, Frederic Weisbecker,
	Eric Paris, kees.cook, agl, Peter Zijlstra, Serge E. Hallyn,
	Ingo Molnar, Andrew Morton, Tejun Heo, Michal Marek,
	Oleg Nesterov, Jiri Slaby, David Howells, Russell King,
	Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, Thomas Gleixner, H. Peter Anvin,
	x86, Peter Zijlstra, linux-arm-kernel, linux-mips, linuxppc-dev,
	linux-s390, linux-sh, sparclinux, Linus Torvalds


* James Morris <jmorris@namei.org> wrote:

> On Fri, 13 May 2011, Ingo Molnar wrote:
> 
> > Say i'm a user-space sandbox developer who wants to enforce that sandboxed 
> > code should only be allowed to open files in /home/sandbox/, /lib/ and 
> > /usr/lib/.
> > 
> > It is a simple and sensible security feature, agreed? It allows most code 
> > to run well and link to countless libraries - but no access to other files 
> > is allowed.
> 
> Not really.
> 
> Firstly, what is the security goal of these restrictions? [...]

To do what i described above? Namely:

 " Sandboxed code should only be allowed to open files in /home/sandbox/, /lib/
   and /usr/lib/ "

> [...]  Then, are the restrictions complete and unbypassable?

If only the system calls i mentioned are allowed, and if the sandboxed VFS 
namespace itself is isolated from the rest of the system (no bind mounts, no 
hard links outside the sandbox, etc.) then its goal is to not be bypassable - 
what use is a sandbox if the sandbox can be bypassed by the sandboxed code?

There's a few ways how to alter (and thus bypass) VFS namespace lookups: 
symlinks, chdir, chroot, rename, etc., which (as i mentioned) have to be 
excluded by default or filtered as well.

> How do you reason about the behavior of the system as a whole?

For some usecases i mainly want to reason about what the sandboxed code can do 
and can not do, within a fairly static and limited VFS namespace environment.

I might not want to have a full-blown 'physical barrier' for all objects 
labeled as inaccessible to sandboxed code (or labeled as accessible to 
sandboxed code).

Especially as manipulating file labels is not only slow (affects all files) but 
is also often an exclusively privileged operation even for owned files, for no 
good reason. For things like /lib/ and /usr/lib/ it also *has* to be a 
privileged operation.

> > I argue that this is the LSM and audit subsystems designed right: in the 
> > long run it could allow everything that LSM does at the moment - and so 
> > much more ...
> 
> Now you're proposing a redesign of the security subsystem.  That's a 
> significant undertaking.

It certainly is.

> In the meantime, we have a simple, well-defined enhancement to seccomp which 
> will be very useful to current users in reducing their kernel attack surface.
> 
> We should merge that, and the security subsystem discussion can carry on 
> separately.

Is that the development and merge process along which the LSM subsystem got 
into its current state?

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-16 15:08                                   ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-16 15:08 UTC (permalink / raw)
  To: James Morris
  Cc: linux-mips, linux-sh, Peter Zijlstra, Frederic Weisbecker,
	Heiko Carstens, Oleg Nesterov, David Howells, Paul Mackerras,
	Eric Paris, H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390,
	Russell King, x86, Linus Torvalds, Ingo Molnar, kees.cook,
	Serge E. Hallyn, Peter Zijlstra, Steven Rostedt, Tejun Heo,
	Thomas Gleixner, linux-arm-kernel, Michal Marek, Michal Simek,
	Will Drewry, linuxppc-dev, linux-kernel, Ralf Baechle,
	Paul Mundt, Martin Schwidefsky, linux390, Andrew Morton, agl,
	David S. Miller


* James Morris <jmorris@namei.org> wrote:

> On Fri, 13 May 2011, Ingo Molnar wrote:
> 
> > Say i'm a user-space sandbox developer who wants to enforce that sandboxed 
> > code should only be allowed to open files in /home/sandbox/, /lib/ and 
> > /usr/lib/.
> > 
> > It is a simple and sensible security feature, agreed? It allows most code 
> > to run well and link to countless libraries - but no access to other files 
> > is allowed.
> 
> Not really.
> 
> Firstly, what is the security goal of these restrictions? [...]

To do what i described above? Namely:

 " Sandboxed code should only be allowed to open files in /home/sandbox/, /lib/
   and /usr/lib/ "

> [...]  Then, are the restrictions complete and unbypassable?

If only the system calls i mentioned are allowed, and if the sandboxed VFS 
namespace itself is isolated from the rest of the system (no bind mounts, no 
hard links outside the sandbox, etc.) then its goal is to not be bypassable - 
what use is a sandbox if the sandbox can be bypassed by the sandboxed code?

There's a few ways how to alter (and thus bypass) VFS namespace lookups: 
symlinks, chdir, chroot, rename, etc., which (as i mentioned) have to be 
excluded by default or filtered as well.

> How do you reason about the behavior of the system as a whole?

For some usecases i mainly want to reason about what the sandboxed code can do 
and can not do, within a fairly static and limited VFS namespace environment.

I might not want to have a full-blown 'physical barrier' for all objects 
labeled as inaccessible to sandboxed code (or labeled as accessible to 
sandboxed code).

Especially as manipulating file labels is not only slow (affects all files) but 
is also often an exclusively privileged operation even for owned files, for no 
good reason. For things like /lib/ and /usr/lib/ it also *has* to be a 
privileged operation.

> > I argue that this is the LSM and audit subsystems designed right: in the 
> > long run it could allow everything that LSM does at the moment - and so 
> > much more ...
> 
> Now you're proposing a redesign of the security subsystem.  That's a 
> significant undertaking.

It certainly is.

> In the meantime, we have a simple, well-defined enhancement to seccomp which 
> will be very useful to current users in reducing their kernel attack surface.
> 
> We should merge that, and the security subsystem discussion can carry on 
> separately.

Is that the development and merge process along which the LSM subsystem got 
into its current state?

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-16 15:08                                   ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-16 15:08 UTC (permalink / raw)
  To: linux-arm-kernel


* James Morris <jmorris@namei.org> wrote:

> On Fri, 13 May 2011, Ingo Molnar wrote:
> 
> > Say i'm a user-space sandbox developer who wants to enforce that sandboxed 
> > code should only be allowed to open files in /home/sandbox/, /lib/ and 
> > /usr/lib/.
> > 
> > It is a simple and sensible security feature, agreed? It allows most code 
> > to run well and link to countless libraries - but no access to other files 
> > is allowed.
> 
> Not really.
> 
> Firstly, what is the security goal of these restrictions? [...]

To do what i described above? Namely:

 " Sandboxed code should only be allowed to open files in /home/sandbox/, /lib/
   and /usr/lib/ "

> [...]  Then, are the restrictions complete and unbypassable?

If only the system calls i mentioned are allowed, and if the sandboxed VFS 
namespace itself is isolated from the rest of the system (no bind mounts, no 
hard links outside the sandbox, etc.) then its goal is to not be bypassable - 
what use is a sandbox if the sandbox can be bypassed by the sandboxed code?

There's a few ways how to alter (and thus bypass) VFS namespace lookups: 
symlinks, chdir, chroot, rename, etc., which (as i mentioned) have to be 
excluded by default or filtered as well.

> How do you reason about the behavior of the system as a whole?

For some usecases i mainly want to reason about what the sandboxed code can do 
and can not do, within a fairly static and limited VFS namespace environment.

I might not want to have a full-blown 'physical barrier' for all objects 
labeled as inaccessible to sandboxed code (or labeled as accessible to 
sandboxed code).

Especially as manipulating file labels is not only slow (affects all files) but 
is also often an exclusively privileged operation even for owned files, for no 
good reason. For things like /lib/ and /usr/lib/ it also *has* to be a 
privileged operation.

> > I argue that this is the LSM and audit subsystems designed right: in the 
> > long run it could allow everything that LSM does at the moment - and so 
> > much more ...
> 
> Now you're proposing a redesign of the security subsystem.  That's a 
> significant undertaking.

It certainly is.

> In the meantime, we have a simple, well-defined enhancement to seccomp which 
> will be very useful to current users in reducing their kernel attack surface.
> 
> We should merge that, and the security subsystem discussion can carry on 
> separately.

Is that the development and merge process along which the LSM subsystem got 
into its current state?

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-12  3:02                     ` Will Drewry
  (?)
@ 2011-05-16 15:26                       ` Steven Rostedt
  -1 siblings, 0 replies; 406+ messages in thread
From: Steven Rostedt @ 2011-05-16 15:26 UTC (permalink / raw)
  To: Will Drewry
  Cc: linux-kernel, Frederic Weisbecker, Eric Paris, Ingo Molnar,
	kees.cook, agl, jmorris, Peter Zijlstra, Serge E. Hallyn,
	Ingo Molnar, Andrew Morton, Tejun Heo, Michal Marek,
	Oleg Nesterov, Roland McGrath, Jiri Slaby, David Howells,
	Russell King, Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, Thomas Gleixner, H. Peter Anvin,
	x86, Peter Zijlstra, linux-arm-kernel, microblaze-uclinux,
	linux-mips, linuxppc-dev, linux-s390, linux-sh, sparclinux

Sorry to be absent from this thread so far, I just got back from my
travels and I'm now catching up on email.


On Wed, 2011-05-11 at 22:02 -0500, Will Drewry wrote:

> diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
> index 377a7a5..22e1668 100644
> --- a/arch/arm/Kconfig
> +++ b/arch/arm/Kconfig
> @@ -1664,6 +1664,16 @@ config SECCOMP
>  	  and the task is only allowed to execute a few safe syscalls
>  	  defined by each seccomp mode.
>  
> +config SECCOMP_FILTER
> +	bool "Enable seccomp-based system call filtering"
> +	depends on SECCOMP && EXPERIMENTAL
> +	help
> +	  Per-process, inherited system call filtering using shared code
> +	  across seccomp and ftrace_syscalls.  If CONFIG_FTRACE_SYSCALLS
> +	  is not available, enhanced filters will not be available.
> +
> +	  See Documentation/prctl/seccomp_filter.txt for more detail.
> +
>  config CC_STACKPROTECTOR
>  	bool "Enable -fstack-protector buffer overflow detection (EXPERIMENTAL)"
>  	depends on EXPERIMENTAL
> diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig
> index eccdefe..7641ee9 100644
> --- a/arch/microblaze/Kconfig
> +++ b/arch/microblaze/Kconfig
> @@ -129,6 +129,16 @@ config SECCOMP
>  
>  	  If unsure, say Y. Only embedded should say N here.
>  
> +config SECCOMP_FILTER
> +	bool "Enable seccomp-based system call filtering"
> +	depends on SECCOMP && EXPERIMENTAL
> +	help
> +	  Per-process, inherited system call filtering using shared code
> +	  across seccomp and ftrace_syscalls.  If CONFIG_FTRACE_SYSCALLS
> +	  is not available, enhanced filters will not be available.
> +
> +	  See Documentation/prctl/seccomp_filter.txt for more detail.
> +
>  endmenu
>  
>  menu "Advanced setup"
> diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
> index 8e256cc..fe4cbda 100644
> --- a/arch/mips/Kconfig
> +++ b/arch/mips/Kconfig
> @@ -2245,6 +2245,16 @@ config SECCOMP
>  
>  	  If unsure, say Y. Only embedded should say N here.
>  
> +config SECCOMP_FILTER
> +	bool "Enable seccomp-based system call filtering"
> +	depends on SECCOMP && EXPERIMENTAL
> +	help
> +	  Per-process, inherited system call filtering using shared code
> +	  across seccomp and ftrace_syscalls.  If CONFIG_FTRACE_SYSCALLS
> +	  is not available, enhanced filters will not be available.
> +
> +	  See Documentation/prctl/seccomp_filter.txt for more detail.
> +
>  config USE_OF
>  	bool "Flattened Device Tree support"
>  	select OF
> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
> index 8f4d50b..83499e4 100644
> --- a/arch/powerpc/Kconfig
> +++ b/arch/powerpc/Kconfig
> @@ -605,6 +605,16 @@ config SECCOMP
>  
>  	  If unsure, say Y. Only embedded should say N here.
>  
> +config SECCOMP_FILTER
> +	bool "Enable seccomp-based system call filtering"
> +	depends on SECCOMP && EXPERIMENTAL
> +	help
> +	  Per-process, inherited system call filtering using shared code
> +	  across seccomp and ftrace_syscalls.  If CONFIG_FTRACE_SYSCALLS
> +	  is not available, enhanced filters will not be available.
> +
> +	  See Documentation/prctl/seccomp_filter.txt for more detail.
> +
>  endmenu
>  
>  config ISA_DMA_API
> diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
> index 2508a6f..2777515 100644
> --- a/arch/s390/Kconfig
> +++ b/arch/s390/Kconfig
> @@ -614,6 +614,16 @@ config SECCOMP
>  
>  	  If unsure, say Y.
>  
> +config SECCOMP_FILTER
> +	bool "Enable seccomp-based system call filtering"
> +	depends on SECCOMP && EXPERIMENTAL
> +	help
> +	  Per-process, inherited system call filtering using shared code
> +	  across seccomp and ftrace_syscalls.  If CONFIG_FTRACE_SYSCALLS
> +	  is not available, enhanced filters will not be available.
> +
> +	  See Documentation/prctl/seccomp_filter.txt for more detail.
> +
>  endmenu
>  
>  menu "Power Management"
> diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
> index 4b89da2..00c1521 100644
> --- a/arch/sh/Kconfig
> +++ b/arch/sh/Kconfig
> @@ -676,6 +676,16 @@ config SECCOMP
>  
>  	  If unsure, say N.
>  
> +config SECCOMP_FILTER
> +	bool "Enable seccomp-based system call filtering"
> +	depends on SECCOMP && EXPERIMENTAL
> +	help
> +	  Per-process, inherited system call filtering using shared code
> +	  across seccomp and ftrace_syscalls.  If CONFIG_FTRACE_SYSCALLS
> +	  is not available, enhanced filters will not be available.
> +
> +	  See Documentation/prctl/seccomp_filter.txt for more detail.
> +
>  config SMP
>  	bool "Symmetric multi-processing support"
>  	depends on SYS_SUPPORTS_SMP
> diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
> index e560d10..5b42255 100644
> --- a/arch/sparc/Kconfig
> +++ b/arch/sparc/Kconfig
> @@ -270,6 +270,16 @@ config SECCOMP
>  
>  	  If unsure, say Y. Only embedded should say N here.
>  
> +config SECCOMP_FILTER
> +	bool "Enable seccomp-based system call filtering"
> +	depends on SECCOMP && EXPERIMENTAL
> +	help
> +	  Per-process, inherited system call filtering using shared code
> +	  across seccomp and ftrace_syscalls.  If CONFIG_FTRACE_SYSCALLS
> +	  is not available, enhanced filters will not be available.
> +
> +	  See Documentation/prctl/seccomp_filter.txt for more detail.
> +
>  config HOTPLUG_CPU
>  	bool "Support for hot-pluggable CPUs"
>  	depends on SPARC64 && SMP
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index cc6c53a..d6d44d9 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -1485,6 +1485,16 @@ config SECCOMP
>  
>  	  If unsure, say Y. Only embedded should say N here.
>  
> +config SECCOMP_FILTER
> +	bool "Enable seccomp-based system call filtering"
> +	depends on SECCOMP && EXPERIMENTAL
> +	help
> +	  Per-process, inherited system call filtering using shared code
> +	  across seccomp and ftrace_syscalls.  If CONFIG_FTRACE_SYSCALLS
> +	  is not available, enhanced filters will not be available.
> +
> +	  See Documentation/prctl/seccomp_filter.txt for more detail.
> +
>  config CC_STACKPROTECTOR
>  	bool "Enable -fstack-protector buffer overflow detection (EXPERIMENTAL)"
>  	---help---

You just cut-and-pasted 8 copies of a config selection. The proper way
to do that is to add the Kconfig selection in a core kernel Kconfig,
have it depend on "HAVE_SECCOMP_FILTER" and then in each of these
configs, simply add in the arch Kconfig:

config <ARCH>
	[...]
	select HAVE_SECCOMP_FILTER


That way you don't need to duplicate the config option all over the
place.

-- Steve

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-16 15:26                       ` Steven Rostedt
  0 siblings, 0 replies; 406+ messages in thread
From: Steven Rostedt @ 2011-05-16 15:26 UTC (permalink / raw)
  To: Will Drewry
  Cc: linux-mips, linux-sh, Peter Zijlstra, Frederic Weisbecker,
	Heiko Carstens, Oleg Nesterov, David Howells, Paul Mackerras,
	Ralf Baechle, H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390,
	Russell King, x86, jmorris, Ingo Molnar, linux-arm-kernel,
	Ingo Molnar, Serge E. Hallyn, Peter Zijlstra, microblaze-uclinux,
	Martin Schwidefsky, Thomas Gleixner, kees.cook, Roland McGrath,
	Michal Marek, Michal Simek, linuxppc-dev, linux-kernel,
	Eric Paris, Paul Mundt, Tejun Heo, linux390, Andrew Morton, agl,
	David S. Miller

Sorry to be absent from this thread so far, I just got back from my
travels and I'm now catching up on email.


On Wed, 2011-05-11 at 22:02 -0500, Will Drewry wrote:

> diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
> index 377a7a5..22e1668 100644
> --- a/arch/arm/Kconfig
> +++ b/arch/arm/Kconfig
> @@ -1664,6 +1664,16 @@ config SECCOMP
>  	  and the task is only allowed to execute a few safe syscalls
>  	  defined by each seccomp mode.
>  
> +config SECCOMP_FILTER
> +	bool "Enable seccomp-based system call filtering"
> +	depends on SECCOMP && EXPERIMENTAL
> +	help
> +	  Per-process, inherited system call filtering using shared code
> +	  across seccomp and ftrace_syscalls.  If CONFIG_FTRACE_SYSCALLS
> +	  is not available, enhanced filters will not be available.
> +
> +	  See Documentation/prctl/seccomp_filter.txt for more detail.
> +
>  config CC_STACKPROTECTOR
>  	bool "Enable -fstack-protector buffer overflow detection (EXPERIMENTAL)"
>  	depends on EXPERIMENTAL
> diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig
> index eccdefe..7641ee9 100644
> --- a/arch/microblaze/Kconfig
> +++ b/arch/microblaze/Kconfig
> @@ -129,6 +129,16 @@ config SECCOMP
>  
>  	  If unsure, say Y. Only embedded should say N here.
>  
> +config SECCOMP_FILTER
> +	bool "Enable seccomp-based system call filtering"
> +	depends on SECCOMP && EXPERIMENTAL
> +	help
> +	  Per-process, inherited system call filtering using shared code
> +	  across seccomp and ftrace_syscalls.  If CONFIG_FTRACE_SYSCALLS
> +	  is not available, enhanced filters will not be available.
> +
> +	  See Documentation/prctl/seccomp_filter.txt for more detail.
> +
>  endmenu
>  
>  menu "Advanced setup"
> diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
> index 8e256cc..fe4cbda 100644
> --- a/arch/mips/Kconfig
> +++ b/arch/mips/Kconfig
> @@ -2245,6 +2245,16 @@ config SECCOMP
>  
>  	  If unsure, say Y. Only embedded should say N here.
>  
> +config SECCOMP_FILTER
> +	bool "Enable seccomp-based system call filtering"
> +	depends on SECCOMP && EXPERIMENTAL
> +	help
> +	  Per-process, inherited system call filtering using shared code
> +	  across seccomp and ftrace_syscalls.  If CONFIG_FTRACE_SYSCALLS
> +	  is not available, enhanced filters will not be available.
> +
> +	  See Documentation/prctl/seccomp_filter.txt for more detail.
> +
>  config USE_OF
>  	bool "Flattened Device Tree support"
>  	select OF
> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
> index 8f4d50b..83499e4 100644
> --- a/arch/powerpc/Kconfig
> +++ b/arch/powerpc/Kconfig
> @@ -605,6 +605,16 @@ config SECCOMP
>  
>  	  If unsure, say Y. Only embedded should say N here.
>  
> +config SECCOMP_FILTER
> +	bool "Enable seccomp-based system call filtering"
> +	depends on SECCOMP && EXPERIMENTAL
> +	help
> +	  Per-process, inherited system call filtering using shared code
> +	  across seccomp and ftrace_syscalls.  If CONFIG_FTRACE_SYSCALLS
> +	  is not available, enhanced filters will not be available.
> +
> +	  See Documentation/prctl/seccomp_filter.txt for more detail.
> +
>  endmenu
>  
>  config ISA_DMA_API
> diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
> index 2508a6f..2777515 100644
> --- a/arch/s390/Kconfig
> +++ b/arch/s390/Kconfig
> @@ -614,6 +614,16 @@ config SECCOMP
>  
>  	  If unsure, say Y.
>  
> +config SECCOMP_FILTER
> +	bool "Enable seccomp-based system call filtering"
> +	depends on SECCOMP && EXPERIMENTAL
> +	help
> +	  Per-process, inherited system call filtering using shared code
> +	  across seccomp and ftrace_syscalls.  If CONFIG_FTRACE_SYSCALLS
> +	  is not available, enhanced filters will not be available.
> +
> +	  See Documentation/prctl/seccomp_filter.txt for more detail.
> +
>  endmenu
>  
>  menu "Power Management"
> diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
> index 4b89da2..00c1521 100644
> --- a/arch/sh/Kconfig
> +++ b/arch/sh/Kconfig
> @@ -676,6 +676,16 @@ config SECCOMP
>  
>  	  If unsure, say N.
>  
> +config SECCOMP_FILTER
> +	bool "Enable seccomp-based system call filtering"
> +	depends on SECCOMP && EXPERIMENTAL
> +	help
> +	  Per-process, inherited system call filtering using shared code
> +	  across seccomp and ftrace_syscalls.  If CONFIG_FTRACE_SYSCALLS
> +	  is not available, enhanced filters will not be available.
> +
> +	  See Documentation/prctl/seccomp_filter.txt for more detail.
> +
>  config SMP
>  	bool "Symmetric multi-processing support"
>  	depends on SYS_SUPPORTS_SMP
> diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
> index e560d10..5b42255 100644
> --- a/arch/sparc/Kconfig
> +++ b/arch/sparc/Kconfig
> @@ -270,6 +270,16 @@ config SECCOMP
>  
>  	  If unsure, say Y. Only embedded should say N here.
>  
> +config SECCOMP_FILTER
> +	bool "Enable seccomp-based system call filtering"
> +	depends on SECCOMP && EXPERIMENTAL
> +	help
> +	  Per-process, inherited system call filtering using shared code
> +	  across seccomp and ftrace_syscalls.  If CONFIG_FTRACE_SYSCALLS
> +	  is not available, enhanced filters will not be available.
> +
> +	  See Documentation/prctl/seccomp_filter.txt for more detail.
> +
>  config HOTPLUG_CPU
>  	bool "Support for hot-pluggable CPUs"
>  	depends on SPARC64 && SMP
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index cc6c53a..d6d44d9 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -1485,6 +1485,16 @@ config SECCOMP
>  
>  	  If unsure, say Y. Only embedded should say N here.
>  
> +config SECCOMP_FILTER
> +	bool "Enable seccomp-based system call filtering"
> +	depends on SECCOMP && EXPERIMENTAL
> +	help
> +	  Per-process, inherited system call filtering using shared code
> +	  across seccomp and ftrace_syscalls.  If CONFIG_FTRACE_SYSCALLS
> +	  is not available, enhanced filters will not be available.
> +
> +	  See Documentation/prctl/seccomp_filter.txt for more detail.
> +
>  config CC_STACKPROTECTOR
>  	bool "Enable -fstack-protector buffer overflow detection (EXPERIMENTAL)"
>  	---help---

You just cut-and-pasted 8 copies of a config selection. The proper way
to do that is to add the Kconfig selection in a core kernel Kconfig,
have it depend on "HAVE_SECCOMP_FILTER" and then in each of these
configs, simply add in the arch Kconfig:

config <ARCH>
	[...]
	select HAVE_SECCOMP_FILTER


That way you don't need to duplicate the config option all over the
place.

-- Steve

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-16 15:26                       ` Steven Rostedt
  0 siblings, 0 replies; 406+ messages in thread
From: Steven Rostedt @ 2011-05-16 15:26 UTC (permalink / raw)
  To: linux-arm-kernel

Sorry to be absent from this thread so far, I just got back from my
travels and I'm now catching up on email.


On Wed, 2011-05-11 at 22:02 -0500, Will Drewry wrote:

> diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
> index 377a7a5..22e1668 100644
> --- a/arch/arm/Kconfig
> +++ b/arch/arm/Kconfig
> @@ -1664,6 +1664,16 @@ config SECCOMP
>  	  and the task is only allowed to execute a few safe syscalls
>  	  defined by each seccomp mode.
>  
> +config SECCOMP_FILTER
> +	bool "Enable seccomp-based system call filtering"
> +	depends on SECCOMP && EXPERIMENTAL
> +	help
> +	  Per-process, inherited system call filtering using shared code
> +	  across seccomp and ftrace_syscalls.  If CONFIG_FTRACE_SYSCALLS
> +	  is not available, enhanced filters will not be available.
> +
> +	  See Documentation/prctl/seccomp_filter.txt for more detail.
> +
>  config CC_STACKPROTECTOR
>  	bool "Enable -fstack-protector buffer overflow detection (EXPERIMENTAL)"
>  	depends on EXPERIMENTAL
> diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig
> index eccdefe..7641ee9 100644
> --- a/arch/microblaze/Kconfig
> +++ b/arch/microblaze/Kconfig
> @@ -129,6 +129,16 @@ config SECCOMP
>  
>  	  If unsure, say Y. Only embedded should say N here.
>  
> +config SECCOMP_FILTER
> +	bool "Enable seccomp-based system call filtering"
> +	depends on SECCOMP && EXPERIMENTAL
> +	help
> +	  Per-process, inherited system call filtering using shared code
> +	  across seccomp and ftrace_syscalls.  If CONFIG_FTRACE_SYSCALLS
> +	  is not available, enhanced filters will not be available.
> +
> +	  See Documentation/prctl/seccomp_filter.txt for more detail.
> +
>  endmenu
>  
>  menu "Advanced setup"
> diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
> index 8e256cc..fe4cbda 100644
> --- a/arch/mips/Kconfig
> +++ b/arch/mips/Kconfig
> @@ -2245,6 +2245,16 @@ config SECCOMP
>  
>  	  If unsure, say Y. Only embedded should say N here.
>  
> +config SECCOMP_FILTER
> +	bool "Enable seccomp-based system call filtering"
> +	depends on SECCOMP && EXPERIMENTAL
> +	help
> +	  Per-process, inherited system call filtering using shared code
> +	  across seccomp and ftrace_syscalls.  If CONFIG_FTRACE_SYSCALLS
> +	  is not available, enhanced filters will not be available.
> +
> +	  See Documentation/prctl/seccomp_filter.txt for more detail.
> +
>  config USE_OF
>  	bool "Flattened Device Tree support"
>  	select OF
> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
> index 8f4d50b..83499e4 100644
> --- a/arch/powerpc/Kconfig
> +++ b/arch/powerpc/Kconfig
> @@ -605,6 +605,16 @@ config SECCOMP
>  
>  	  If unsure, say Y. Only embedded should say N here.
>  
> +config SECCOMP_FILTER
> +	bool "Enable seccomp-based system call filtering"
> +	depends on SECCOMP && EXPERIMENTAL
> +	help
> +	  Per-process, inherited system call filtering using shared code
> +	  across seccomp and ftrace_syscalls.  If CONFIG_FTRACE_SYSCALLS
> +	  is not available, enhanced filters will not be available.
> +
> +	  See Documentation/prctl/seccomp_filter.txt for more detail.
> +
>  endmenu
>  
>  config ISA_DMA_API
> diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
> index 2508a6f..2777515 100644
> --- a/arch/s390/Kconfig
> +++ b/arch/s390/Kconfig
> @@ -614,6 +614,16 @@ config SECCOMP
>  
>  	  If unsure, say Y.
>  
> +config SECCOMP_FILTER
> +	bool "Enable seccomp-based system call filtering"
> +	depends on SECCOMP && EXPERIMENTAL
> +	help
> +	  Per-process, inherited system call filtering using shared code
> +	  across seccomp and ftrace_syscalls.  If CONFIG_FTRACE_SYSCALLS
> +	  is not available, enhanced filters will not be available.
> +
> +	  See Documentation/prctl/seccomp_filter.txt for more detail.
> +
>  endmenu
>  
>  menu "Power Management"
> diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
> index 4b89da2..00c1521 100644
> --- a/arch/sh/Kconfig
> +++ b/arch/sh/Kconfig
> @@ -676,6 +676,16 @@ config SECCOMP
>  
>  	  If unsure, say N.
>  
> +config SECCOMP_FILTER
> +	bool "Enable seccomp-based system call filtering"
> +	depends on SECCOMP && EXPERIMENTAL
> +	help
> +	  Per-process, inherited system call filtering using shared code
> +	  across seccomp and ftrace_syscalls.  If CONFIG_FTRACE_SYSCALLS
> +	  is not available, enhanced filters will not be available.
> +
> +	  See Documentation/prctl/seccomp_filter.txt for more detail.
> +
>  config SMP
>  	bool "Symmetric multi-processing support"
>  	depends on SYS_SUPPORTS_SMP
> diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
> index e560d10..5b42255 100644
> --- a/arch/sparc/Kconfig
> +++ b/arch/sparc/Kconfig
> @@ -270,6 +270,16 @@ config SECCOMP
>  
>  	  If unsure, say Y. Only embedded should say N here.
>  
> +config SECCOMP_FILTER
> +	bool "Enable seccomp-based system call filtering"
> +	depends on SECCOMP && EXPERIMENTAL
> +	help
> +	  Per-process, inherited system call filtering using shared code
> +	  across seccomp and ftrace_syscalls.  If CONFIG_FTRACE_SYSCALLS
> +	  is not available, enhanced filters will not be available.
> +
> +	  See Documentation/prctl/seccomp_filter.txt for more detail.
> +
>  config HOTPLUG_CPU
>  	bool "Support for hot-pluggable CPUs"
>  	depends on SPARC64 && SMP
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index cc6c53a..d6d44d9 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -1485,6 +1485,16 @@ config SECCOMP
>  
>  	  If unsure, say Y. Only embedded should say N here.
>  
> +config SECCOMP_FILTER
> +	bool "Enable seccomp-based system call filtering"
> +	depends on SECCOMP && EXPERIMENTAL
> +	help
> +	  Per-process, inherited system call filtering using shared code
> +	  across seccomp and ftrace_syscalls.  If CONFIG_FTRACE_SYSCALLS
> +	  is not available, enhanced filters will not be available.
> +
> +	  See Documentation/prctl/seccomp_filter.txt for more detail.
> +
>  config CC_STACKPROTECTOR
>  	bool "Enable -fstack-protector buffer overflow detection (EXPERIMENTAL)"
>  	---help---

You just cut-and-pasted 8 copies of a config selection. The proper way
to do that is to add the Kconfig selection in a core kernel Kconfig,
have it depend on "HAVE_SECCOMP_FILTER" and then in each of these
configs, simply add in the arch Kconfig:

config <ARCH>
	[...]
	select HAVE_SECCOMP_FILTER


That way you don't need to duplicate the config option all over the
place.

-- Steve

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-16 15:26                       ` Steven Rostedt
  (?)
@ 2011-05-16 15:28                         ` Will Drewry
  -1 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-05-16 15:28 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: linux-kernel, Frederic Weisbecker, Eric Paris, Ingo Molnar,
	kees.cook, agl, jmorris, Peter Zijlstra, Serge E. Hallyn,
	Ingo Molnar, Andrew Morton, Tejun Heo, Michal Marek,
	Oleg Nesterov, Roland McGrath, Jiri Slaby, David Howells,
	Russell King, Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, Thomas Gleixner, H. Peter Anvin,
	x86, Peter Zijlstra, linux-arm-kernel, microblaze-uclinux,
	linux-mips, linuxppc-dev, linux-s390, linux-sh, sparclinux

On Mon, May 16, 2011 at 10:26 AM, Steven Rostedt <rostedt@goodmis.org> wrote:
> Sorry to be absent from this thread so far, I just got back from my
> travels and I'm now catching up on email.
>
>
> On Wed, 2011-05-11 at 22:02 -0500, Will Drewry wrote:
>
>> diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
>> index 377a7a5..22e1668 100644
>> --- a/arch/arm/Kconfig
>> +++ b/arch/arm/Kconfig
>> @@ -1664,6 +1664,16 @@ config SECCOMP
>>         and the task is only allowed to execute a few safe syscalls
>>         defined by each seccomp mode.
>>
>> +config SECCOMP_FILTER
>> +     bool "Enable seccomp-based system call filtering"
>> +     depends on SECCOMP && EXPERIMENTAL
>> +     help
>> +       Per-process, inherited system call filtering using shared code
>> +       across seccomp and ftrace_syscalls.  If CONFIG_FTRACE_SYSCALLS
>> +       is not available, enhanced filters will not be available.
>> +
>> +       See Documentation/prctl/seccomp_filter.txt for more detail.
>> +
>>  config CC_STACKPROTECTOR
>>       bool "Enable -fstack-protector buffer overflow detection (EXPERIMENTAL)"
>>       depends on EXPERIMENTAL
>> diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig
>> index eccdefe..7641ee9 100644
>> --- a/arch/microblaze/Kconfig
>> +++ b/arch/microblaze/Kconfig
>> @@ -129,6 +129,16 @@ config SECCOMP
>>
>>         If unsure, say Y. Only embedded should say N here.
>>
>> +config SECCOMP_FILTER
>> +     bool "Enable seccomp-based system call filtering"
>> +     depends on SECCOMP && EXPERIMENTAL
>> +     help
>> +       Per-process, inherited system call filtering using shared code
>> +       across seccomp and ftrace_syscalls.  If CONFIG_FTRACE_SYSCALLS
>> +       is not available, enhanced filters will not be available.
>> +
>> +       See Documentation/prctl/seccomp_filter.txt for more detail.
>> +
>>  endmenu
>>
>>  menu "Advanced setup"
>> diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
>> index 8e256cc..fe4cbda 100644
>> --- a/arch/mips/Kconfig
>> +++ b/arch/mips/Kconfig
>> @@ -2245,6 +2245,16 @@ config SECCOMP
>>
>>         If unsure, say Y. Only embedded should say N here.
>>
>> +config SECCOMP_FILTER
>> +     bool "Enable seccomp-based system call filtering"
>> +     depends on SECCOMP && EXPERIMENTAL
>> +     help
>> +       Per-process, inherited system call filtering using shared code
>> +       across seccomp and ftrace_syscalls.  If CONFIG_FTRACE_SYSCALLS
>> +       is not available, enhanced filters will not be available.
>> +
>> +       See Documentation/prctl/seccomp_filter.txt for more detail.
>> +
>>  config USE_OF
>>       bool "Flattened Device Tree support"
>>       select OF
>> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
>> index 8f4d50b..83499e4 100644
>> --- a/arch/powerpc/Kconfig
>> +++ b/arch/powerpc/Kconfig
>> @@ -605,6 +605,16 @@ config SECCOMP
>>
>>         If unsure, say Y. Only embedded should say N here.
>>
>> +config SECCOMP_FILTER
>> +     bool "Enable seccomp-based system call filtering"
>> +     depends on SECCOMP && EXPERIMENTAL
>> +     help
>> +       Per-process, inherited system call filtering using shared code
>> +       across seccomp and ftrace_syscalls.  If CONFIG_FTRACE_SYSCALLS
>> +       is not available, enhanced filters will not be available.
>> +
>> +       See Documentation/prctl/seccomp_filter.txt for more detail.
>> +
>>  endmenu
>>
>>  config ISA_DMA_API
>> diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
>> index 2508a6f..2777515 100644
>> --- a/arch/s390/Kconfig
>> +++ b/arch/s390/Kconfig
>> @@ -614,6 +614,16 @@ config SECCOMP
>>
>>         If unsure, say Y.
>>
>> +config SECCOMP_FILTER
>> +     bool "Enable seccomp-based system call filtering"
>> +     depends on SECCOMP && EXPERIMENTAL
>> +     help
>> +       Per-process, inherited system call filtering using shared code
>> +       across seccomp and ftrace_syscalls.  If CONFIG_FTRACE_SYSCALLS
>> +       is not available, enhanced filters will not be available.
>> +
>> +       See Documentation/prctl/seccomp_filter.txt for more detail.
>> +
>>  endmenu
>>
>>  menu "Power Management"
>> diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
>> index 4b89da2..00c1521 100644
>> --- a/arch/sh/Kconfig
>> +++ b/arch/sh/Kconfig
>> @@ -676,6 +676,16 @@ config SECCOMP
>>
>>         If unsure, say N.
>>
>> +config SECCOMP_FILTER
>> +     bool "Enable seccomp-based system call filtering"
>> +     depends on SECCOMP && EXPERIMENTAL
>> +     help
>> +       Per-process, inherited system call filtering using shared code
>> +       across seccomp and ftrace_syscalls.  If CONFIG_FTRACE_SYSCALLS
>> +       is not available, enhanced filters will not be available.
>> +
>> +       See Documentation/prctl/seccomp_filter.txt for more detail.
>> +
>>  config SMP
>>       bool "Symmetric multi-processing support"
>>       depends on SYS_SUPPORTS_SMP
>> diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
>> index e560d10..5b42255 100644
>> --- a/arch/sparc/Kconfig
>> +++ b/arch/sparc/Kconfig
>> @@ -270,6 +270,16 @@ config SECCOMP
>>
>>         If unsure, say Y. Only embedded should say N here.
>>
>> +config SECCOMP_FILTER
>> +     bool "Enable seccomp-based system call filtering"
>> +     depends on SECCOMP && EXPERIMENTAL
>> +     help
>> +       Per-process, inherited system call filtering using shared code
>> +       across seccomp and ftrace_syscalls.  If CONFIG_FTRACE_SYSCALLS
>> +       is not available, enhanced filters will not be available.
>> +
>> +       See Documentation/prctl/seccomp_filter.txt for more detail.
>> +
>>  config HOTPLUG_CPU
>>       bool "Support for hot-pluggable CPUs"
>>       depends on SPARC64 && SMP
>> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
>> index cc6c53a..d6d44d9 100644
>> --- a/arch/x86/Kconfig
>> +++ b/arch/x86/Kconfig
>> @@ -1485,6 +1485,16 @@ config SECCOMP
>>
>>         If unsure, say Y. Only embedded should say N here.
>>
>> +config SECCOMP_FILTER
>> +     bool "Enable seccomp-based system call filtering"
>> +     depends on SECCOMP && EXPERIMENTAL
>> +     help
>> +       Per-process, inherited system call filtering using shared code
>> +       across seccomp and ftrace_syscalls.  If CONFIG_FTRACE_SYSCALLS
>> +       is not available, enhanced filters will not be available.
>> +
>> +       See Documentation/prctl/seccomp_filter.txt for more detail.
>> +
>>  config CC_STACKPROTECTOR
>>       bool "Enable -fstack-protector buffer overflow detection (EXPERIMENTAL)"
>>       ---help---
>
> You just cut-and-pasted 8 copies of a config selection. The proper way
> to do that is to add the Kconfig selection in a core kernel Kconfig,
> have it depend on "HAVE_SECCOMP_FILTER" and then in each of these
> configs, simply add in the arch Kconfig:
>
> config <ARCH>
>        [...]
>        select HAVE_SECCOMP_FILTER
>
>
> That way you don't need to duplicate the config option all over the
> place.

Thanks!  I had seen the HAVE_* format but failed to acknowledge why!

I'll fix it in the next rev (assuming it still fits there)!
will

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-16 15:28                         ` Will Drewry
  0 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-05-16 15:28 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: linux-mips, linux-sh, Peter Zijlstra, Frederic Weisbecker,
	Heiko Carstens, Oleg Nesterov, David Howells, Paul Mackerras,
	Ralf Baechle, H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390,
	Russell King, x86, jmorris, Ingo Molnar, linux-arm-kernel,
	Ingo Molnar, Serge E. Hallyn, Peter Zijlstra, microblaze-uclinux,
	Martin Schwidefsky, Thomas Gleixner, kees.cook, Roland McGrath,
	Michal Marek, Michal Simek, linuxppc-dev, linux-kernel,
	Eric Paris, Paul Mundt, Tejun Heo, linux390, Andrew Morton, agl,
	David S. Miller

On Mon, May 16, 2011 at 10:26 AM, Steven Rostedt <rostedt@goodmis.org> wrote:
> Sorry to be absent from this thread so far, I just got back from my
> travels and I'm now catching up on email.
>
>
> On Wed, 2011-05-11 at 22:02 -0500, Will Drewry wrote:
>
>> diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
>> index 377a7a5..22e1668 100644
>> --- a/arch/arm/Kconfig
>> +++ b/arch/arm/Kconfig
>> @@ -1664,6 +1664,16 @@ config SECCOMP
>>         and the task is only allowed to execute a few safe syscalls
>>         defined by each seccomp mode.
>>
>> +config SECCOMP_FILTER
>> +     bool "Enable seccomp-based system call filtering"
>> +     depends on SECCOMP && EXPERIMENTAL
>> +     help
>> +       Per-process, inherited system call filtering using shared code
>> +       across seccomp and ftrace_syscalls.  If CONFIG_FTRACE_SYSCALLS
>> +       is not available, enhanced filters will not be available.
>> +
>> +       See Documentation/prctl/seccomp_filter.txt for more detail.
>> +
>>  config CC_STACKPROTECTOR
>>       bool "Enable -fstack-protector buffer overflow detection (EXPERIMENTAL)"
>>       depends on EXPERIMENTAL
>> diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig
>> index eccdefe..7641ee9 100644
>> --- a/arch/microblaze/Kconfig
>> +++ b/arch/microblaze/Kconfig
>> @@ -129,6 +129,16 @@ config SECCOMP
>>
>> =A0 =A0 =A0 =A0 If unsure, say Y. Only embedded should say N here.
>>
>> +config SECCOMP_FILTER
>> + =A0 =A0 bool "Enable seccomp-based system call filtering"
>> + =A0 =A0 depends on SECCOMP && EXPERIMENTAL
>> + =A0 =A0 help
>> + =A0 =A0 =A0 Per-process, inherited system call filtering using shared =
code
>> + =A0 =A0 =A0 across seccomp and ftrace_syscalls. =A0If CONFIG_FTRACE_SY=
SCALLS
>> + =A0 =A0 =A0 is not available, enhanced filters will not be available.
>> +
>> + =A0 =A0 =A0 See Documentation/prctl/seccomp_filter.txt for more detail=
.
>> +
>> =A0endmenu
>>
>> =A0menu "Advanced setup"
>> diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
>> index 8e256cc..fe4cbda 100644
>> --- a/arch/mips/Kconfig
>> +++ b/arch/mips/Kconfig
>> @@ -2245,6 +2245,16 @@ config SECCOMP
>>
>> =A0 =A0 =A0 =A0 If unsure, say Y. Only embedded should say N here.
>>
>> +config SECCOMP_FILTER
>> + =A0 =A0 bool "Enable seccomp-based system call filtering"
>> + =A0 =A0 depends on SECCOMP && EXPERIMENTAL
>> + =A0 =A0 help
>> + =A0 =A0 =A0 Per-process, inherited system call filtering using shared =
code
>> + =A0 =A0 =A0 across seccomp and ftrace_syscalls. =A0If CONFIG_FTRACE_SY=
SCALLS
>> + =A0 =A0 =A0 is not available, enhanced filters will not be available.
>> +
>> + =A0 =A0 =A0 See Documentation/prctl/seccomp_filter.txt for more detail=
.
>> +
>> =A0config USE_OF
>> =A0 =A0 =A0 bool "Flattened Device Tree support"
>> =A0 =A0 =A0 select OF
>> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
>> index 8f4d50b..83499e4 100644
>> --- a/arch/powerpc/Kconfig
>> +++ b/arch/powerpc/Kconfig
>> @@ -605,6 +605,16 @@ config SECCOMP
>>
>> =A0 =A0 =A0 =A0 If unsure, say Y. Only embedded should say N here.
>>
>> +config SECCOMP_FILTER
>> + =A0 =A0 bool "Enable seccomp-based system call filtering"
>> + =A0 =A0 depends on SECCOMP && EXPERIMENTAL
>> + =A0 =A0 help
>> + =A0 =A0 =A0 Per-process, inherited system call filtering using shared =
code
>> + =A0 =A0 =A0 across seccomp and ftrace_syscalls. =A0If CONFIG_FTRACE_SY=
SCALLS
>> + =A0 =A0 =A0 is not available, enhanced filters will not be available.
>> +
>> + =A0 =A0 =A0 See Documentation/prctl/seccomp_filter.txt for more detail=
.
>> +
>> =A0endmenu
>>
>> =A0config ISA_DMA_API
>> diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
>> index 2508a6f..2777515 100644
>> --- a/arch/s390/Kconfig
>> +++ b/arch/s390/Kconfig
>> @@ -614,6 +614,16 @@ config SECCOMP
>>
>> =A0 =A0 =A0 =A0 If unsure, say Y.
>>
>> +config SECCOMP_FILTER
>> + =A0 =A0 bool "Enable seccomp-based system call filtering"
>> + =A0 =A0 depends on SECCOMP && EXPERIMENTAL
>> + =A0 =A0 help
>> + =A0 =A0 =A0 Per-process, inherited system call filtering using shared =
code
>> + =A0 =A0 =A0 across seccomp and ftrace_syscalls. =A0If CONFIG_FTRACE_SY=
SCALLS
>> + =A0 =A0 =A0 is not available, enhanced filters will not be available.
>> +
>> + =A0 =A0 =A0 See Documentation/prctl/seccomp_filter.txt for more detail=
.
>> +
>> =A0endmenu
>>
>> =A0menu "Power Management"
>> diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
>> index 4b89da2..00c1521 100644
>> --- a/arch/sh/Kconfig
>> +++ b/arch/sh/Kconfig
>> @@ -676,6 +676,16 @@ config SECCOMP
>>
>> =A0 =A0 =A0 =A0 If unsure, say N.
>>
>> +config SECCOMP_FILTER
>> + =A0 =A0 bool "Enable seccomp-based system call filtering"
>> + =A0 =A0 depends on SECCOMP && EXPERIMENTAL
>> + =A0 =A0 help
>> + =A0 =A0 =A0 Per-process, inherited system call filtering using shared =
code
>> + =A0 =A0 =A0 across seccomp and ftrace_syscalls. =A0If CONFIG_FTRACE_SY=
SCALLS
>> + =A0 =A0 =A0 is not available, enhanced filters will not be available.
>> +
>> + =A0 =A0 =A0 See Documentation/prctl/seccomp_filter.txt for more detail=
.
>> +
>> =A0config SMP
>> =A0 =A0 =A0 bool "Symmetric multi-processing support"
>> =A0 =A0 =A0 depends on SYS_SUPPORTS_SMP
>> diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
>> index e560d10..5b42255 100644
>> --- a/arch/sparc/Kconfig
>> +++ b/arch/sparc/Kconfig
>> @@ -270,6 +270,16 @@ config SECCOMP
>>
>> =A0 =A0 =A0 =A0 If unsure, say Y. Only embedded should say N here.
>>
>> +config SECCOMP_FILTER
>> + =A0 =A0 bool "Enable seccomp-based system call filtering"
>> + =A0 =A0 depends on SECCOMP && EXPERIMENTAL
>> + =A0 =A0 help
>> + =A0 =A0 =A0 Per-process, inherited system call filtering using shared =
code
>> + =A0 =A0 =A0 across seccomp and ftrace_syscalls. =A0If CONFIG_FTRACE_SY=
SCALLS
>> + =A0 =A0 =A0 is not available, enhanced filters will not be available.
>> +
>> + =A0 =A0 =A0 See Documentation/prctl/seccomp_filter.txt for more detail=
.
>> +
>> =A0config HOTPLUG_CPU
>> =A0 =A0 =A0 bool "Support for hot-pluggable CPUs"
>> =A0 =A0 =A0 depends on SPARC64 && SMP
>> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
>> index cc6c53a..d6d44d9 100644
>> --- a/arch/x86/Kconfig
>> +++ b/arch/x86/Kconfig
>> @@ -1485,6 +1485,16 @@ config SECCOMP
>>
>> =A0 =A0 =A0 =A0 If unsure, say Y. Only embedded should say N here.
>>
>> +config SECCOMP_FILTER
>> + =A0 =A0 bool "Enable seccomp-based system call filtering"
>> + =A0 =A0 depends on SECCOMP && EXPERIMENTAL
>> + =A0 =A0 help
>> + =A0 =A0 =A0 Per-process, inherited system call filtering using shared =
code
>> + =A0 =A0 =A0 across seccomp and ftrace_syscalls. =A0If CONFIG_FTRACE_SY=
SCALLS
>> + =A0 =A0 =A0 is not available, enhanced filters will not be available.
>> +
>> + =A0 =A0 =A0 See Documentation/prctl/seccomp_filter.txt for more detail=
.
>> +
>> =A0config CC_STACKPROTECTOR
>> =A0 =A0 =A0 bool "Enable -fstack-protector buffer overflow detection (EX=
PERIMENTAL)"
>> =A0 =A0 =A0 ---help---
>
> You just cut-and-pasted 8 copies of a config selection. The proper way
> to do that is to add the Kconfig selection in a core kernel Kconfig,
> have it depend on "HAVE_SECCOMP_FILTER" and then in each of these
> configs, simply add in the arch Kconfig:
>
> config <ARCH>
>        [...]
>        select HAVE_SECCOMP_FILTER
>
>
> That way you don't need to duplicate the config option all over the
> place.

Thanks!  I had seen the HAVE_* format but failed to acknowledge why!

I'll fix it in the next rev (assuming it still fits there)!
will

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-16 15:28                         ` Will Drewry
  0 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-05-16 15:28 UTC (permalink / raw)
  To: linux-arm-kernel

On Mon, May 16, 2011 at 10:26 AM, Steven Rostedt <rostedt@goodmis.org> wrote:
> Sorry to be absent from this thread so far, I just got back from my
> travels and I'm now catching up on email.
>
>
> On Wed, 2011-05-11 at 22:02 -0500, Will Drewry wrote:
>
>> diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
>> index 377a7a5..22e1668 100644
>> --- a/arch/arm/Kconfig
>> +++ b/arch/arm/Kconfig
>> @@ -1664,6 +1664,16 @@ config SECCOMP
>>         and the task is only allowed to execute a few safe syscalls
>>         defined by each seccomp mode.
>>
>> +config SECCOMP_FILTER
>> +     bool "Enable seccomp-based system call filtering"
>> +     depends on SECCOMP && EXPERIMENTAL
>> +     help
>> +       Per-process, inherited system call filtering using shared code
>> +       across seccomp and ftrace_syscalls.  If CONFIG_FTRACE_SYSCALLS
>> +       is not available, enhanced filters will not be available.
>> +
>> +       See Documentation/prctl/seccomp_filter.txt for more detail.
>> +
>>  config CC_STACKPROTECTOR
>>       bool "Enable -fstack-protector buffer overflow detection (EXPERIMENTAL)"
>>       depends on EXPERIMENTAL
>> diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig
>> index eccdefe..7641ee9 100644
>> --- a/arch/microblaze/Kconfig
>> +++ b/arch/microblaze/Kconfig
>> @@ -129,6 +129,16 @@ config SECCOMP
>>
>> ? ? ? ? If unsure, say Y. Only embedded should say N here.
>>
>> +config SECCOMP_FILTER
>> + ? ? bool "Enable seccomp-based system call filtering"
>> + ? ? depends on SECCOMP && EXPERIMENTAL
>> + ? ? help
>> + ? ? ? Per-process, inherited system call filtering using shared code
>> + ? ? ? across seccomp and ftrace_syscalls. ?If CONFIG_FTRACE_SYSCALLS
>> + ? ? ? is not available, enhanced filters will not be available.
>> +
>> + ? ? ? See Documentation/prctl/seccomp_filter.txt for more detail.
>> +
>> ?endmenu
>>
>> ?menu "Advanced setup"
>> diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
>> index 8e256cc..fe4cbda 100644
>> --- a/arch/mips/Kconfig
>> +++ b/arch/mips/Kconfig
>> @@ -2245,6 +2245,16 @@ config SECCOMP
>>
>> ? ? ? ? If unsure, say Y. Only embedded should say N here.
>>
>> +config SECCOMP_FILTER
>> + ? ? bool "Enable seccomp-based system call filtering"
>> + ? ? depends on SECCOMP && EXPERIMENTAL
>> + ? ? help
>> + ? ? ? Per-process, inherited system call filtering using shared code
>> + ? ? ? across seccomp and ftrace_syscalls. ?If CONFIG_FTRACE_SYSCALLS
>> + ? ? ? is not available, enhanced filters will not be available.
>> +
>> + ? ? ? See Documentation/prctl/seccomp_filter.txt for more detail.
>> +
>> ?config USE_OF
>> ? ? ? bool "Flattened Device Tree support"
>> ? ? ? select OF
>> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
>> index 8f4d50b..83499e4 100644
>> --- a/arch/powerpc/Kconfig
>> +++ b/arch/powerpc/Kconfig
>> @@ -605,6 +605,16 @@ config SECCOMP
>>
>> ? ? ? ? If unsure, say Y. Only embedded should say N here.
>>
>> +config SECCOMP_FILTER
>> + ? ? bool "Enable seccomp-based system call filtering"
>> + ? ? depends on SECCOMP && EXPERIMENTAL
>> + ? ? help
>> + ? ? ? Per-process, inherited system call filtering using shared code
>> + ? ? ? across seccomp and ftrace_syscalls. ?If CONFIG_FTRACE_SYSCALLS
>> + ? ? ? is not available, enhanced filters will not be available.
>> +
>> + ? ? ? See Documentation/prctl/seccomp_filter.txt for more detail.
>> +
>> ?endmenu
>>
>> ?config ISA_DMA_API
>> diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
>> index 2508a6f..2777515 100644
>> --- a/arch/s390/Kconfig
>> +++ b/arch/s390/Kconfig
>> @@ -614,6 +614,16 @@ config SECCOMP
>>
>> ? ? ? ? If unsure, say Y.
>>
>> +config SECCOMP_FILTER
>> + ? ? bool "Enable seccomp-based system call filtering"
>> + ? ? depends on SECCOMP && EXPERIMENTAL
>> + ? ? help
>> + ? ? ? Per-process, inherited system call filtering using shared code
>> + ? ? ? across seccomp and ftrace_syscalls. ?If CONFIG_FTRACE_SYSCALLS
>> + ? ? ? is not available, enhanced filters will not be available.
>> +
>> + ? ? ? See Documentation/prctl/seccomp_filter.txt for more detail.
>> +
>> ?endmenu
>>
>> ?menu "Power Management"
>> diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
>> index 4b89da2..00c1521 100644
>> --- a/arch/sh/Kconfig
>> +++ b/arch/sh/Kconfig
>> @@ -676,6 +676,16 @@ config SECCOMP
>>
>> ? ? ? ? If unsure, say N.
>>
>> +config SECCOMP_FILTER
>> + ? ? bool "Enable seccomp-based system call filtering"
>> + ? ? depends on SECCOMP && EXPERIMENTAL
>> + ? ? help
>> + ? ? ? Per-process, inherited system call filtering using shared code
>> + ? ? ? across seccomp and ftrace_syscalls. ?If CONFIG_FTRACE_SYSCALLS
>> + ? ? ? is not available, enhanced filters will not be available.
>> +
>> + ? ? ? See Documentation/prctl/seccomp_filter.txt for more detail.
>> +
>> ?config SMP
>> ? ? ? bool "Symmetric multi-processing support"
>> ? ? ? depends on SYS_SUPPORTS_SMP
>> diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
>> index e560d10..5b42255 100644
>> --- a/arch/sparc/Kconfig
>> +++ b/arch/sparc/Kconfig
>> @@ -270,6 +270,16 @@ config SECCOMP
>>
>> ? ? ? ? If unsure, say Y. Only embedded should say N here.
>>
>> +config SECCOMP_FILTER
>> + ? ? bool "Enable seccomp-based system call filtering"
>> + ? ? depends on SECCOMP && EXPERIMENTAL
>> + ? ? help
>> + ? ? ? Per-process, inherited system call filtering using shared code
>> + ? ? ? across seccomp and ftrace_syscalls. ?If CONFIG_FTRACE_SYSCALLS
>> + ? ? ? is not available, enhanced filters will not be available.
>> +
>> + ? ? ? See Documentation/prctl/seccomp_filter.txt for more detail.
>> +
>> ?config HOTPLUG_CPU
>> ? ? ? bool "Support for hot-pluggable CPUs"
>> ? ? ? depends on SPARC64 && SMP
>> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
>> index cc6c53a..d6d44d9 100644
>> --- a/arch/x86/Kconfig
>> +++ b/arch/x86/Kconfig
>> @@ -1485,6 +1485,16 @@ config SECCOMP
>>
>> ? ? ? ? If unsure, say Y. Only embedded should say N here.
>>
>> +config SECCOMP_FILTER
>> + ? ? bool "Enable seccomp-based system call filtering"
>> + ? ? depends on SECCOMP && EXPERIMENTAL
>> + ? ? help
>> + ? ? ? Per-process, inherited system call filtering using shared code
>> + ? ? ? across seccomp and ftrace_syscalls. ?If CONFIG_FTRACE_SYSCALLS
>> + ? ? ? is not available, enhanced filters will not be available.
>> +
>> + ? ? ? See Documentation/prctl/seccomp_filter.txt for more detail.
>> +
>> ?config CC_STACKPROTECTOR
>> ? ? ? bool "Enable -fstack-protector buffer overflow detection (EXPERIMENTAL)"
>> ? ? ? ---help---
>
> You just cut-and-pasted 8 copies of a config selection. The proper way
> to do that is to add the Kconfig selection in a core kernel Kconfig,
> have it depend on "HAVE_SECCOMP_FILTER" and then in each of these
> configs, simply add in the arch Kconfig:
>
> config <ARCH>
>        [...]
>        select HAVE_SECCOMP_FILTER
>
>
> That way you don't need to duplicate the config option all over the
> place.

Thanks!  I had seen the HAVE_* format but failed to acknowledge why!

I'll fix it in the next rev (assuming it still fits there)!
will

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-16 12:43                                       ` Ingo Molnar
  (?)
@ 2011-05-16 15:29                                         ` Will Drewry
  -1 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-05-16 15:29 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Eric Paris, James Morris, linux-kernel, Steven Rostedt,
	Frederic Weisbecker, kees.cook, agl, Peter Zijlstra,
	Serge E. Hallyn, Ingo Molnar, Andrew Morton, Tejun Heo,
	Michal Marek, Oleg Nesterov, Jiri Slaby, David Howells,
	Russell King, Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, Thomas Gleixner, H. Peter Anvin,
	x86, Peter Zijlstra, linux-arm-kernel, linux-mips, linuxppc-dev,
	linux-s390, linux-sh, sparclinux, Linus Torvalds

On Mon, May 16, 2011 at 7:43 AM, Ingo Molnar <mingo@elte.hu> wrote:
>
> * Will Drewry <wad@chromium.org> wrote:
>
>> > Note, i'm not actually asking for the moon, a pony and more.
>> >
>> > I fully submit that we are yet far away from being able to do a full LSM
>> > via this mechanism.
>> >
>> > What i'm asking for is that because the syscall point steps taken by Will
>> > look very promising so it would be nice to do *that* in a slightly more
>> > flexible scheme that does not condemn it to be limited to the syscall
>> > boundary and such ...
>>
>> What do you suggest here?
>>
>> From my brief exploration of the ftrace/perf (and seccomp) code, I don't see
>> a clean way of integrating over the existing interfaces to the ftrace
>> framework (e.g., the global perf event pump seems to be a mismatch), but I
>> may be missing something obvious. [...]
>
> Well, there's no global perf event pump. Here we'd obviously want to use
> buffer-less events that do no output (pumping) whatsoever - i.e. just counting
> events with filters attached.

Cool - I missed these entirely.  I'll get reading :)

> What i suggest is to:
>
>  - share syscall events        # you are fine with that as your patch makes use of them
>  - share the scripting engine  # you are fine with that as your patch makes use of it
>
>  - share *any* other event to do_exit() a process at syscall exit time
>
>  - share any other active event that kernel developers specifically enable
>   for active use to impact security-relevant execution even sooner than
>   syscall exit time - not just system calls
>
>  - share the actual facility that manages (sets/gets) filters

These make sense to me at a high level.  I'll throw in a few initial
comments, but I'll be back for a round-two once I catch up on the rest
of the perf code.

> So right now you have this structure for your new feature:
>
>  Documentation/trace/seccomp_filter.txt |   75 +++++
>  arch/x86/kernel/ldt.c                  |    5
>  arch/x86/kernel/tls.c                  |    7
>  fs/proc/array.c                        |   21 +
>  fs/proc/base.c                         |   25 +
>  include/linux/ftrace_event.h           |    9
>  include/linux/seccomp.h                |   98 +++++++
>  include/linux/syscalls.h               |   54 ++--
>  include/trace/syscall.h                |    6
>  kernel/Makefile                        |    1
>  kernel/fork.c                          |    8
>  kernel/perf_event.c                    |    7
>  kernel/seccomp.c                       |  144 ++++++++++-
>  kernel/seccomp_filter.c                |  428 +++++++++++++++++++++++++++++++++
>  kernel/sys.c                           |   11
>  kernel/trace/Kconfig                   |   10
>  kernel/trace/trace_events_filter.c     |   60 ++--
>  kernel/trace/trace_syscalls.c          |   96 ++++++-
>  18 files changed, 986 insertions(+), 79 deletions(-)
>
> Firstly, one specific problem i can see is that kernel/seccomp_filter.c
> hardcodes to the system call boundary. Which is fine to a prototype
> implementation (and obviously fine for something that builds upon seccomp) but
> not fine in terms of a flexible Linux security feature :-)
>
> You have hardcoded these syscall assumptions via:
>
>  struct seccomp_filter {
>        struct list_head list;
>        struct rcu_head rcu;
>        int syscall_nr;
>        struct syscall_metadata *data;
>        struct event_filter *event_filter;
>  };

(This structure is a bit different in the new rev of the patch, but
nothing relevant to this specific part of the discussion :)

> Which comes just because you chose to enumerate only syscall events - instead
> of enumerating all events.

While I'm willing to live with the tradeoff, using ftrace event
numbers from FTRACE_SYSCALLS means that the filter will be unable to
hook a fair number of syscalls: execve, clone, etc  (all the ptregs
fixup syscalls on x86) and anything that returns int instead of long
(on x86).  Though the last two patches in the initial patch series
provided a proposed clean up for the latter :/

The current revision of the seccomp filter patch can function in a
bitmask-like state when CONFIG_FTRACE is unset or
CONFIG_FTRACE_SYSCALLS is unset.  This also means any platform with
CONFIG_SECCOMP support can start using this right away, but I realize
that is more of a short-term gain rather than a long-term one.

> Instead of that please bind to all events instead - syscalls are just one of
> the many events we have. Type 'perf list' and see how many event types we have,
> and quite a few could be enabled for 'active feedback' sandboxing as well.
>
> Secondly, and this is really a variant of the first problem you have, the way
> you process event filter 'failures' is pretty limited. You utilize the regular
> seccomp method which works by calling into __secure_computing() and silently
> accepting syscalls or generating a hard do_exit() on even the slightest of
> filter failures.
>
> Instead of that what we'd want to have is to have regular syscall events
> registered, *and* enabled for such active filtering purposes. The moment the
> filter hits such an 'active' event would set the TIF_NOTIFY_RESUME flag and
> some other attribute in the task and the kernel would do a do_exit() at the
> earliest of opportunities before calling the syscall or at the
> return-from-syscall point latest.
>
> Note that no seccomp specific code would have to execute here, we can already
> generate events both at syscall entry and at syscall exit points, the only new
> bit we'd need is for the 'kill the task' [or abort the syscall] policy action.
>
> This is *far* more generic and still yields the same short-term end result as far
> as your sandboxing is concerned.

Almost :/  I still need to review the code you've pointed out, but, at
present, the ftrace hooks occur after the seccomp and syscall auditing
hooks.  This means that that code is exposed no matter what in this
model.  To trim the exposed surface to userspace, we really need those
early hooks.  While I can see both hacky and less hacky approaches
around this, it still strikes me that the seccomp thread flag and
early interception are good to reuse.  One option might be to allow
seccomp to be a secure-syscall event source, but I suspect that lands
more on the hack-y side of the fence :)

Anyway, I'll read up on the other filters and try to understand
exactly how per-task, counting perf events are being handled.

> What we'd need for this is a way to mark existing TRACE_EVENT()s as 'active'.
> We'd mark all syscall events as 'active' straight away.

It seems like we'll still end up special casing system calls no matter
what as they are the userland vector into the kernel.  Marking
syscalls active right away sounds roughly equivalent to calling
prctl(PR_SET_SECCOMP, 2).  This could easily be done following the
model of syscall_nr_to_meta() since that enumerates every system call
meta data entry which yields both entry and exit event numbers. Of
course, I'd need to understand how that ties in with the per-task,
counting code.

> [ Detail: these trace events return a return code, which the calling code can
>  use, that way event return values could be used sooner than syscall exit
>  points. IRQ code could make use of it as well, so for example this way we
>  could filter based early packet inspection, still in the IRQ code. ]
>
> Then what your feature would do is to simply open up the events you are
> interested in (just as counting events, without any buffers), and set 'active'
> filters in them to 'deny/allow' specific combinations of parameters only.
>
> Thirdly, you have added a separate facility to set/query filters via /proc,
> this lives mostly in /proc/<pid>/seccomp_filter. This seccomp specificity would
> go away altogether: it should be possible to query existing events and filters
> even outside of the seccomp facility, so if you want something explicit here
> beyond the current event ioctl() to set filters please improve the generic
> facility - which right now is poorer than what you have implemented so it's in
> good need to be improved! :-)

I haven't looked at the other interface, so I will.  I know about the
existence of the perf_event_open syscall, but that's about it.  Would
it make sense to use the same syscall in a unified-interface world?
Even if the right way, semantically it seems awkward.

> All in all, such a solution would enable us to reuse code in a much deeper
> fashion while still providing all the functionality you are interested in.
> These are all short-term concerns, not pie-in-the-sky future ideal LSM
> concerns.

Thanks for the concrete feedback! I'll follow up again once I better
understand the code you're citing as well as the existing userland
interface to it.

cheers!
will

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-16 15:29                                         ` Will Drewry
  0 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-05-16 15:29 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: linux-mips, linux-sh, Peter Zijlstra, Frederic Weisbecker,
	Heiko Carstens, Oleg Nesterov, David Howells, Paul Mackerras,
	Eric Paris, H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390,
	Russell King, x86, James Morris, Linus Torvalds, Ingo Molnar,
	kees.cook, Serge E. Hallyn, Peter Zijlstra, Steven Rostedt,
	Tejun Heo, Thomas Gleixner, linux-arm-kernel, Michal Marek,
	Michal Simek, linuxppc-dev, linux-kernel, Ralf Baechle,
	Paul Mundt, Martin Schwidefsky, linux390, Andrew Morton, agl,
	David S. Miller

On Mon, May 16, 2011 at 7:43 AM, Ingo Molnar <mingo@elte.hu> wrote:
>
> * Will Drewry <wad@chromium.org> wrote:
>
>> > Note, i'm not actually asking for the moon, a pony and more.
>> >
>> > I fully submit that we are yet far away from being able to do a full LSM
>> > via this mechanism.
>> >
>> > What i'm asking for is that because the syscall point steps taken by Will
>> > look very promising so it would be nice to do *that* in a slightly more
>> > flexible scheme that does not condemn it to be limited to the syscall
>> > boundary and such ...
>>
>> What do you suggest here?
>>
>> From my brief exploration of the ftrace/perf (and seccomp) code, I don't see
>> a clean way of integrating over the existing interfaces to the ftrace
>> framework (e.g., the global perf event pump seems to be a mismatch), but I
>> may be missing something obvious. [...]
>
> Well, there's no global perf event pump. Here we'd obviously want to use
> buffer-less events that do no output (pumping) whatsoever - i.e. just counting
> events with filters attached.

Cool - I missed these entirely.  I'll get reading :)

> What i suggest is to:
>
>  - share syscall events        # you are fine with that as your patch makes use of them
>  - share the scripting engine  # you are fine with that as your patch makes use of it
>
>  - share *any* other event to do_exit() a process at syscall exit time
>
>  - share any other active event that kernel developers specifically enable
>   for active use to impact security-relevant execution even sooner than
>   syscall exit time - not just system calls
>
>  - share the actual facility that manages (sets/gets) filters

These make sense to me at a high level.  I'll throw in a few initial
comments, but I'll be back for a round-two once I catch up on the rest
of the perf code.

> So right now you have this structure for your new feature:
>
>  Documentation/trace/seccomp_filter.txt |   75 +++++
>  arch/x86/kernel/ldt.c                  |    5
>  arch/x86/kernel/tls.c                  |    7
>  fs/proc/array.c                        |   21 +
>  fs/proc/base.c                         |   25 +
>  include/linux/ftrace_event.h           |    9
>  include/linux/seccomp.h                |   98 +++++++
>  include/linux/syscalls.h               |   54 ++--
>  include/trace/syscall.h                |    6
>  kernel/Makefile                        |    1
>  kernel/fork.c                          |    8
>  kernel/perf_event.c                    |    7
>  kernel/seccomp.c                       |  144 ++++++++++-
>  kernel/seccomp_filter.c                |  428 +++++++++++++++++++++++++++++++++
>  kernel/sys.c                           |   11
>  kernel/trace/Kconfig                   |   10
>  kernel/trace/trace_events_filter.c     |   60 ++--
>  kernel/trace/trace_syscalls.c          |   96 ++++++-
>  18 files changed, 986 insertions(+), 79 deletions(-)
>
> Firstly, one specific problem i can see is that kernel/seccomp_filter.c
> hardcodes to the system call boundary. Which is fine to a prototype
> implementation (and obviously fine for something that builds upon seccomp) but
> not fine in terms of a flexible Linux security feature :-)
>
> You have hardcoded these syscall assumptions via:
>
>  struct seccomp_filter {
>         struct list_head list;
>         struct rcu_head rcu;
>         int syscall_nr;
>         struct syscall_metadata *data;
>         struct event_filter *event_filter;
>  };

(This structure is a bit different in the new rev of the patch, but
nothing relevant to this specific part of the discussion :)

> Which comes just because you chose to enumerate only syscall events - instead
> of enumerating all events.

While I'm willing to live with the tradeoff, using ftrace event
numbers from FTRACE_SYSCALLS means that the filter will be unable to
hook a fair number of syscalls: execve, clone, etc  (all the ptregs
fixup syscalls on x86) and anything that returns int instead of long
(on x86).  Though the last two patches in the initial patch series
provided a proposed clean up for the latter :/

The current revision of the seccomp filter patch can function in a
bitmask-like state when CONFIG_FTRACE is unset or
CONFIG_FTRACE_SYSCALLS is unset.  This also means any platform with
CONFIG_SECCOMP support can start using this right away, but I realize
that is more of a short-term gain rather than a long-term one.

> Instead of that please bind to all events instead - syscalls are just one of
> the many events we have. Type 'perf list' and see how many event types we have,
> and quite a few could be enabled for 'active feedback' sandboxing as well.
>
> Secondly, and this is really a variant of the first problem you have, the way
> you process event filter 'failures' is pretty limited. You utilize the regular
> seccomp method which works by calling into __secure_computing() and silently
> accepting syscalls or generating a hard do_exit() on even the slightest of
> filter failures.
>
> Instead of that what we'd want to have is to have regular syscall events
> registered, *and* enabled for such active filtering purposes. The moment the
> filter hits such an 'active' event would set the TIF_NOTIFY_RESUME flag and
> some other attribute in the task and the kernel would do a do_exit() at the
> earliest of opportunities before calling the syscall or at the
> return-from-syscall point latest.
>
> Note that no seccomp specific code would have to execute here, we can already
> generate events both at syscall entry and at syscall exit points, the only new
> bit we'd need is for the 'kill the task' [or abort the syscall] policy action.
>
> This is *far* more generic still yields the same short-term end result as far
> as your sandboxing is concerned.

Almost :/  I still need to review the code you've pointed out, but, at
present, the ftrace hooks occur after the seccomp and syscall auditing
hooks.  This means that that code is exposed no matter what in this
model.  To trim the exposed surface to userspace, we really need those
early hooks.  While I can see both hacky and less hacky approaches
around this, it still strikes me that the seccomp thread flag and
early interception are good to reuse.  One option might be to allow
seccomp to be a secure-syscall event source, but I suspect that lands
more on the hack-y side of the fence :)

Anyway, I'll read up on the other filters and try to understand
exactly how per-task, counting perf events are being handled.

> What we'd need for this is a way to mark existing TRACE_EVENT()s as 'active'.
> We'd mark all syscall events as 'active' straight away.

It seems like we'll still end up special casing system calls no matter
what as they are the userland vector into the kernel.  Marking
syscalls active right away sounds roughly equivalent to calling
prctl(PR_SET_SECCOMP, 2).  This could easily be done following the
model of syscall_nr_to_meta() since that enumerates every system call
meta data entry which yields both entry and exit event numbers. Of
course, I'd need to understand how that ties in with the per-task,
counting code.

> [ Detail: these trace events return a return code, which the calling code can
>  use, that way event return values could be used sooner than syscall exit
>  points. IRQ code could make use of it as well, so for example this way we
>  could filter based early packet inspection, still in the IRQ code. ]
>
> Then what your feature would do is to simply open up the events you are
> interested in (just as counting events, without any buffers), and set 'active'
> filters in them to 'deny/allow' specific combinations of parameters only.
>
> Thirdly, you have added a separate facility to set/query filters via /proc,
> this lives mostly in /proc/<pid>/seccomp_filter. This seccomp specificity would
> go away altogether: it should be possible to query existing events and filters
> even outside of the seccomp facility, so if you want something explicit here
> beyond the current event ioctl() to set filters please improve the generic
> facility - which right now is poorer than what you have implemented so it's in
> good need to be improved! :-)

I haven't looked at the other interface, so I will.  I know about the
existence of the perf_event_open syscall, but that's about it.  Would
it make sense to use the same syscall in a unified-interface world?
Even if the right way, semantically it seems awkward.

> All in one such a solution would enable us to reuse code in a lot more deeper
> fashion while still providing all the functionality you are interested in.
> These are all short-term concerns, not pie-in-the-sky future ideal LSM
> concerns.

Thanks for the concrete feedback! I'll follow up again once I better
understand the code you're citing as well as the existing userland
interface to it.

cheers!
will

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-16 15:29                                         ` Will Drewry
  0 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-05-16 15:29 UTC (permalink / raw)
  To: linux-arm-kernel

On Mon, May 16, 2011 at 7:43 AM, Ingo Molnar <mingo@elte.hu> wrote:
>
> * Will Drewry <wad@chromium.org> wrote:
>
>> > Note, i'm not actually asking for the moon, a pony and more.
>> >
>> > I fully submit that we are yet far away from being able to do a full LSM
>> > via this mechanism.
>> >
>> > What i'm asking for is that because the syscall point steps taken by Will
>> > look very promising so it would be nice to do *that* in a slightly more
>> > flexible scheme that does not condemn it to be limited to the syscall
>> > boundary and such ...
>>
>> What do you suggest here?
>>
>> From my brief exploration of the ftrace/perf (and seccomp) code, I don't see
>> a clean way of integrating over the existing interfaces to the ftrace
>> framework (e.g., the global perf event pump seems to be a mismatch), but I
>> may be missing something obvious. [...]
>
> Well, there's no global perf event pump. Here we'd obviously want to use
> buffer-less events that do no output (pumping) whatsoever - i.e. just counting
> events with filters attached.

Cool - I missed these entirely.  I'll get reading :)

> What i suggest is to:
>
>  - share syscall events        # you are fine with that as your patch makes use of them
>  - share the scripting engine  # you are fine with that as your patch makes use of it
>
>  - share *any* other event to do_exit() a process at syscall exit time
>
>  - share any other active event that kernel developers specifically enable
>   for active use to impact security-relevant execution even sooner than
>   syscall exit time - not just system calls
>
>  - share the actual facility that manages (sets/gets) filters

These make sense to me at a high level.  I'll throw in a few initial
comments, but I'll be back for a round-two once I catch up on the rest
of the perf code.

> So right now you have this structure for your new feature:
>
>  Documentation/trace/seccomp_filter.txt |   75 +++++
>  arch/x86/kernel/ldt.c                  |    5
>  arch/x86/kernel/tls.c                  |    7
>  fs/proc/array.c                        |   21 +
>  fs/proc/base.c                         |   25 +
>  include/linux/ftrace_event.h           |    9
>  include/linux/seccomp.h                |   98 +++++++
>  include/linux/syscalls.h               |   54 ++--
>  include/trace/syscall.h                |    6
>  kernel/Makefile                        |    1
>  kernel/fork.c                          |    8
>  kernel/perf_event.c                    |    7
>  kernel/seccomp.c                       |  144 ++++++++++-
>  kernel/seccomp_filter.c                |  428 +++++++++++++++++++++++++++++++++
>  kernel/sys.c                           |   11
>  kernel/trace/Kconfig                   |   10
>  kernel/trace/trace_events_filter.c     |   60 ++--
>  kernel/trace/trace_syscalls.c          |   96 ++++++-
>  18 files changed, 986 insertions(+), 79 deletions(-)
>
> Firstly, one specific problem i can see is that kernel/seccomp_filter.c
> hardcodes to the system call boundary. Which is fine to a prototype
> implementation (and obviously fine for something that builds upon seccomp) but
> not fine in terms of a flexible Linux security feature :-)
>
> You have hardcoded these syscall assumptions via:
>
>  struct seccomp_filter {
>         struct list_head list;
>         struct rcu_head rcu;
>         int syscall_nr;
>         struct syscall_metadata *data;
>         struct event_filter *event_filter;
>  };

(This structure is a bit different in the new rev of the patch, but
nothing relevant to this specific part of the discussion :)

> Which comes just because you chose to enumerate only syscall events - instead
> of enumerating all events.

While I'm willing to live with the tradeoff, using ftrace event
numbers from FTRACE_SYSCALLS means that the filter will be unable to
hook a fair number of syscalls: execve, clone, etc  (all the ptregs
fixup syscalls on x86) and anything that returns int instead of long
(on x86).  Though the last two patches in the initial patch series
provided a proposed clean up for the latter :/

The current revision of the seccomp filter patch can function in a
bitmask-like state when CONFIG_FTRACE is unset or
CONFIG_FTRACE_SYSCALLS is unset.  This also means any platform with
CONFIG_SECCOMP support can start using this right away, but I realize
that is more of a short-term gain rather than a long-term one.

> Instead of that please bind to all events instead - syscalls are just one of
> the many events we have. Type 'perf list' and see how many event types we have,
> and quite a few could be enabled for 'active feedback' sandboxing as well.
>
> Secondly, and this is really a variant of the first problem you have, the way
> you process event filter 'failures' is pretty limited. You utilize the regular
> seccomp method which works by calling into __secure_computing() and silently
> accepting syscalls or generating a hard do_exit() on even the slightest of
> filter failures.
>
> Instead of that what we'd want to have is to have regular syscall events
> registered, *and* enabled for such active filtering purposes. The moment the
> filter hits such an 'active' event would set the TIF_NOTIFY_RESUME flag and
> some other attribute in the task and the kernel would do a do_exit() at the
> earliest of opportunities before calling the syscall or at the
> return-from-syscall point latest.
>
> Note that no seccomp specific code would have to execute here, we can already
> generate events both at syscall entry and at syscall exit points, the only new
> bit we'd need is for the 'kill the task' [or abort the syscall] policy action.
>
> This is *far* more generic still yields the same short-term end result as far
> as your sandboxing is concerned.

Almost :/  I still need to review the code you've pointed out, but, at
present, the ftrace hooks occur after the seccomp and syscall auditing
hooks.  This means that that code is exposed no matter what in this
model.  To trim the exposed surface to userspace, we really need those
early hooks.  While I can see both hacky and less hacky approaches
around this, it still strikes me that the seccomp thread flag and
early interception are good to reuse.  One option might be to allow
seccomp to be a secure-syscall event source, but I suspect that lands
more on the hack-y side of the fence :)

Anyway, I'll read up on the other filters and try to understand
exactly how per-task, counting perf events are being handled.

> What we'd need for this is a way to mark existing TRACE_EVENT()s as 'active'.
> We'd mark all syscall events as 'active' straight away.

It seems like we'll still end up special casing system calls no matter
what as they are the userland vector into the kernel.  Marking
syscalls active right away sounds roughly equivalent to calling
prctl(PR_SET_SECCOMP, 2).  This could easily be done following the
model of syscall_nr_to_meta() since that enumerates every system call
meta data entry which yields both entry and exit event numbers. Of
course, I'd need to understand how that ties in with the per-task,
counting code.

> [ Detail: these trace events return a return code, which the calling code can
>  use, that way event return values could be used sooner than syscall exit
>  points. IRQ code could make use of it as well, so for example this way we
>  could filter based early packet inspection, still in the IRQ code. ]
>
> Then what your feature would do is to simply open up the events you are
> interested in (just as counting events, without any buffers), and set 'active'
> filters in them to 'deny/allow' specific combinations of parameters only.
>
> Thirdly, you have added a separate facility to set/query filters via /proc,
> this lives mostly in /proc/<pid>/seccomp_filter. This seccomp specificity would
> go away altogether: it should be possible to query existing events and filters
> even outside of the seccomp facility, so if you want something explicit here
> beyond the current event ioctl() to set filters please improve the generic
> facility - which right now is poorer than what you have implemented so it's in
> good need to be improved! :-)

I haven't looked at the other interface, so I will.  I know about the
existence of the perf_event_open syscall, but that's about it.  Would
it make sense to use the same syscall in a unified-interface world?
Even if the right way, semantically it seems awkward.

> All in one such a solution would enable us to reuse code in a lot more deeper
> fashion while still providing all the functionality you are interested in.
> These are all short-term concerns, not pie-in-the-sky future ideal LSM
> concerns.

Thanks for the concrete feedback! I'll follow up again once I better
understand the code you're citing as well as the existing userland
interface to it.

cheers!
will

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-13 14:57                                                 ` Ingo Molnar
  (?)
@ 2011-05-16 16:23                                                   ` Steven Rostedt
  -1 siblings, 0 replies; 406+ messages in thread
From: Steven Rostedt @ 2011-05-16 16:23 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Peter Zijlstra, James Morris, Will Drewry, linux-kernel,
	Frederic Weisbecker, Eric Paris, kees.cook, agl, Serge E. Hallyn,
	Ingo Molnar, Andrew Morton, Tejun Heo, Michal Marek,
	Oleg Nesterov, Roland McGrath, Jiri Slaby, David Howells,
	Russell King, Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, Thomas Gleixner, H. Peter Anvin,
	x86, linux-arm-kernel, linux-mips, linuxppc-dev, linux-s390,
	linux-sh, sparclinux, Linus Torvalds

On Fri, 2011-05-13 at 16:57 +0200, Ingo Molnar wrote:

> > > > Then there's the whole indirection argument, if you don't need
> > > > indirection, its often better to not use it, I myself much prefer code
> > > > to look like:
> > > > 
> > > >    foo1(bar);
> > > >    foo2(bar);
> > > >    foo3(bar);
> > > > 
> > > > Than:
> > > > 
> > > >    foo_notifier(bar);
> > > > 
> > > > Simply because its much clearer who all are involved without me having
> > > > to grep around to see who registers for foo_notifier and wth they do
> > > > with it. It also makes it much harder to sneak in another user, whereas
> > > > its nearly impossible to find new notifier users.
> > > > 
> > > > Its also much faster, no extra memory accesses, no indirect function
> > > > calls, no other muck.
> > > 
> > > But i suspect this question has been settled, given the fact that even pure 
> > > observer events need and already process a chain of events? Am i missing 
> > > something about your argument?
> > 
> > I'm saying that there's reasons to not use notifiers passive or active.
> > 
> > Mostly the whole notifier/indirection muck comes up once you want
> > modules to make use of the thing, because then you need dynamic
> > management of the callback list.
> 
> But your argument assumes that we'd have a chain of functions to call, like 
> regular notifiers.
> 
> While the natural model here would be to have a list of registered event 
> structs for that point, with different filters but basically the same callback 
> mechanism (a call into the filter engine in essence).
> 
> Also note that the common case would be no event registered - and we'd 
> automatically optimize that case via the existing jump labels optimization.

I agree that I prefer the "notifier" type over having direct function
calls. Yes, it's easier to read and figure out what functions are
called, but notifiers can be optimized for the default case where
nothing is called (jump-label nop).

> 
> > (Then again, I'm fairly glad we don't have explicit callbacks in kernel/cpu.c 
> > for all the cpu-hotplug callbacks :-)
> > 
> > Anyway, I oppose for the existing events to gain an active role.
> 
> Why if 'being active' is optional and useful?

I'm a bit nervous about the 'active' role of (trace_)events, because of
the way multiple callbacks can be registered. How would:

	err = event_x();
	if (err == -EACCESS) {

be handled? Would we need a way to prioritize which call back gets the
return value? One way I guess would be to add a check_event option,
where you pass in an ENUM of the event you want:

	event_x();
	err = check_event_x(MYEVENT);

If something registered itself as "MYEVENT" to event_x, then you get the
return code of MYEVENT. If the MYEVENT was not registered, a -ENODEV or
something could be returned. I'm sure we could even optimize it such a
way if no active events have been registered to event_x, that
check_event_x() will return -ENODEV without any branches.

-- Steve

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-16 16:23                                                   ` Steven Rostedt
  0 siblings, 0 replies; 406+ messages in thread
From: Steven Rostedt @ 2011-05-16 16:23 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: linux-mips, linux-sh, Peter Zijlstra, Frederic Weisbecker,
	Heiko Carstens, Oleg Nesterov, David Howells, Paul Mackerras,
	Ralf Baechle, H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390,
	Russell King, x86, James Morris, Linus Torvalds, Ingo Molnar,
	linux-arm-kernel, kees.cook, Serge E. Hallyn, Martin Schwidefsky,
	Thomas Gleixner, Roland McGrath, Michal Marek, Michal Simek,
	Will Drewry, linuxppc-dev, linux-kernel, Eric Paris, Paul Mundt,
	Tejun Heo, linux390, Andrew Morton, agl, David S. Miller

On Fri, 2011-05-13 at 16:57 +0200, Ingo Molnar wrote:

> > > > Then there's the whole indirection argument, if you don't need
> > > > indirection, its often better to not use it, I myself much prefer code
> > > > to look like:
> > > > 
> > > >    foo1(bar);
> > > >    foo2(bar);
> > > >    foo3(bar);
> > > > 
> > > > Than:
> > > > 
> > > >    foo_notifier(bar);
> > > > 
> > > > Simply because its much clearer who all are involved without me having
> > > > to grep around to see who registers for foo_notifier and wth they do
> > > > with it. It also makes it much harder to sneak in another user, whereas
> > > > its nearly impossible to find new notifier users.
> > > > 
> > > > Its also much faster, no extra memory accesses, no indirect function
> > > > calls, no other muck.
> > > 
> > > But i suspect this question has been settled, given the fact that even pure 
> > > observer events need and already process a chain of events? Am i missing 
> > > something about your argument?
> > 
> > I'm saying that there's reasons to not use notifiers passive or active.
> > 
> > Mostly the whole notifier/indirection muck comes up once you want
> > modules to make use of the thing, because then you need dynamic
> > management of the callback list.
> 
> But your argument assumes that we'd have a chain of functions to call, like 
> regular notifiers.
> 
> While the natural model here would be to have a list of registered event 
> structs for that point, with different filters but basically the same callback 
> mechanism (a call into the filter engine in essence).
> 
> Also note that the common case would be no event registered - and we'd 
> automatically optimize that case via the existing jump labels optimization.

I agree that I prefer the "notifier" type over having direct function
calls. Yes, it's easier to read and figure out what functions are
called, but notifiers can be optimized for the default case where
nothing is called (jump-label nop).

> 
> > (Then again, I'm fairly glad we don't have explicit callbacks in kernel/cpu.c 
> > for all the cpu-hotplug callbacks :-)
> > 
> > Anyway, I oppose for the existing events to gain an active role.
> 
> Why if 'being active' is optional and useful?

I'm a bit nervous about the 'active' role of (trace_)events, because of
the way multiple callbacks can be registered. How would:

	err = event_x();
	if (err == -EACCESS) {

be handled? Would we need a way to prioritize which call back gets the
return value? One way I guess would be to add a check_event option,
where you pass in an ENUM of the event you want:

	event_x();
	err = check_event_x(MYEVENT);

If something registered itself as "MYEVENT" to event_x, then you get the
return code of MYEVENT. If the MYEVENT was not registered, a -ENODEV or
something could be returned. I'm sure we could even optimize it such a
way if no active events have been registered to event_x, that
check_event_x() will return -ENODEV without any branches.

-- Steve

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-16 16:23                                                   ` Steven Rostedt
  0 siblings, 0 replies; 406+ messages in thread
From: Steven Rostedt @ 2011-05-16 16:23 UTC (permalink / raw)
  To: linux-arm-kernel

On Fri, 2011-05-13 at 16:57 +0200, Ingo Molnar wrote:

> > > > Then there's the whole indirection argument, if you don't need
> > > > indirection, its often better to not use it, I myself much prefer code
> > > > to look like:
> > > > 
> > > >    foo1(bar);
> > > >    foo2(bar);
> > > >    foo3(bar);
> > > > 
> > > > Than:
> > > > 
> > > >    foo_notifier(bar);
> > > > 
> > > > Simply because its much clearer who all are involved without me having
> > > > to grep around to see who registers for foo_notifier and wth they do
> > > > with it. It also makes it much harder to sneak in another user, whereas
> > > > its nearly impossible to find new notifier users.
> > > > 
> > > > Its also much faster, no extra memory accesses, no indirect function
> > > > calls, no other muck.
> > > 
> > > But i suspect this question has been settled, given the fact that even pure 
> > > observer events need and already process a chain of events? Am i missing 
> > > something about your argument?
> > 
> > I'm saying that there's reasons to not use notifiers passive or active.
> > 
> > Mostly the whole notifier/indirection muck comes up once you want
> > modules to make use of the thing, because then you need dynamic
> > management of the callback list.
> 
> But your argument assumes that we'd have a chain of functions to call, like 
> regular notifiers.
> 
> While the natural model here would be to have a list of registered event 
> structs for that point, with different filters but basically the same callback 
> mechanism (a call into the filter engine in essence).
> 
> Also note that the common case would be no event registered - and we'd 
> automatically optimize that case via the existing jump labels optimization.

I agree that I prefer the "notifier" type over having direct function
calls. Yes, it's easier to read and figure out what functions are
called, but notifiers can be optimized for the default case where
nothing is called (jump-label nop).

> 
> > (Then again, I'm fairly glad we don't have explicit callbacks in kernel/cpu.c 
> > for all the cpu-hotplug callbacks :-)
> > 
> > Anyway, I oppose for the existing events to gain an active role.
> 
> Why if 'being active' is optional and useful?

I'm a bit nervous about the 'active' role of (trace_)events, because of
the way multiple callbacks can be registered. How would:

	err = event_x();
	if (err == -EACCESS) {

be handled? Would we need a way to prioritize which call back gets the
return value? One way I guess would be to add a check_event option,
where you pass in an ENUM of the event you want:

	event_x();
	err = check_event_x(MYEVENT);

If something registered itself as "MYEVENT" to event_x, then you get the
return code of MYEVENT. If the MYEVENT was not registered, a -ENODEV or
something could be returned. I'm sure we could even optimize it such a
way if no active events have been registered to event_x, that
check_event_x() will return -ENODEV without any branches.

-- Steve

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-16 16:23                                                   ` Steven Rostedt
  (?)
@ 2011-05-16 16:52                                                     ` Ingo Molnar
  -1 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-16 16:52 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Peter Zijlstra, James Morris, Will Drewry, linux-kernel,
	Frederic Weisbecker, Eric Paris, kees.cook, agl, Serge E. Hallyn,
	Ingo Molnar, Andrew Morton, Tejun Heo, Michal Marek,
	Oleg Nesterov, Roland McGrath, Jiri Slaby, David Howells,
	Russell King, Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, Thomas Gleixner, H. Peter Anvin,
	x86, linux-arm-kernel, linux-mips, linuxppc-dev, linux-s390,
	linux-sh, sparclinux, Linus Torvalds


* Steven Rostedt <rostedt@goodmis.org> wrote:

> I'm a bit nervous about the 'active' role of (trace_)events, because of the 
> way multiple callbacks can be registered. How would:
> 
> 	err = event_x();
> 	if (err == -EACCESS) {
> 
> be handled? [...]

The default behavior would be something obvious: to trigger all callbacks and 
use the first non-zero return value.

> [...] Would we need a way to prioritize which call back gets the return 
> value? One way I guess would be to add a check_event option, where you pass 
> in an ENUM of the event you want:
> 
> 	event_x();
> 	err = check_event_x(MYEVENT);
> 
> If something registered itself as "MYEVENT" to event_x, then you get the 
> return code of MYEVENT. If the MYEVENT was not registered, a -ENODEV or 
> something could be returned. I'm sure we could even optimize it such a way if 
> no active events have been registered to event_x, that check_event_x() will 
> return -ENODEV without any branches.

I would keep it simple and extensible - that way we can complicate it when the 
need arises! :)

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-16 16:52                                                     ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-16 16:52 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: linux-mips, linux-sh, Peter Zijlstra, Frederic Weisbecker,
	Heiko Carstens, Oleg Nesterov, David Howells, Paul Mackerras,
	Ralf Baechle, H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390,
	Russell King, x86, James Morris, Linus Torvalds, Ingo Molnar,
	linux-arm-kernel, kees.cook, Serge E. Hallyn, Martin Schwidefsky,
	Thomas Gleixner, Roland McGrath, Michal Marek, Michal Simek,
	Will Drewry, linuxppc-dev, linux-kernel, Eric Paris, Paul Mundt,
	Tejun Heo, linux390, Andrew Morton, agl, David S. Miller


* Steven Rostedt <rostedt@goodmis.org> wrote:

> I'm a bit nervous about the 'active' role of (trace_)events, because of the 
> way multiple callbacks can be registered. How would:
> 
> 	err = event_x();
> 	if (err == -EACCESS) {
> 
> be handled? [...]

The default behavior would be something obvious: to trigger all callbacks and 
use the first non-zero return value.

> [...] Would we need a way to prioritize which call back gets the return 
> value? One way I guess would be to add a check_event option, where you pass 
> in an ENUM of the event you want:
> 
> 	event_x();
> 	err = check_event_x(MYEVENT);
> 
> If something registered itself as "MYEVENT" to event_x, then you get the 
> return code of MYEVENT. If the MYEVENT was not registered, a -ENODEV or 
> something could be returned. I'm sure we could even optimize it such a way if 
> no active events have been registered to event_x, that check_event_x() will 
> return -ENODEV without any branches.

I would keep it simple and extensible - that way we can complicate it when the 
need arises! :)

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-16 16:52                                                     ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-16 16:52 UTC (permalink / raw)
  To: linux-arm-kernel


* Steven Rostedt <rostedt@goodmis.org> wrote:

> I'm a bit nervous about the 'active' role of (trace_)events, because of the 
> way multiple callbacks can be registered. How would:
> 
> 	err = event_x();
> 	if (err == -EACCESS) {
> 
> be handled? [...]

The default behavior would be something obvious: to trigger all callbacks and 
use the first non-zero return value.

> [...] Would we need a way to prioritize which call back gets the return 
> value? One way I guess would be to add a check_event option, where you pass 
> in an ENUM of the event you want:
> 
> 	event_x();
> 	err = check_event_x(MYEVENT);
> 
> If something registered itself as "MYEVENT" to event_x, then you get the 
> return code of MYEVENT. If the MYEVENT was not registered, a -ENODEV or 
> something could be returned. I'm sure we could even optimize it such a way if 
> no active events have been registered to event_x, that check_event_x() will 
> return -ENODEV without any branches.

I would keep it simple and extensible - that way we can complicate it when the 
need arises! :)

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-16 16:52                                                     ` Ingo Molnar
  (?)
@ 2011-05-16 17:03                                                       ` Steven Rostedt
  -1 siblings, 0 replies; 406+ messages in thread
From: Steven Rostedt @ 2011-05-16 17:03 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Peter Zijlstra, James Morris, Will Drewry, linux-kernel,
	Frederic Weisbecker, Eric Paris, kees.cook, agl, Serge E. Hallyn,
	Ingo Molnar, Andrew Morton, Tejun Heo, Michal Marek,
	Oleg Nesterov, Roland McGrath, Jiri Slaby, David Howells,
	Russell King, Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, Thomas Gleixner, H. Peter Anvin,
	x86, linux-arm-kernel, linux-mips, linuxppc-dev, linux-s390,
	linux-sh, sparclinux, Linus Torvalds

On Mon, 2011-05-16 at 18:52 +0200, Ingo Molnar wrote:
> * Steven Rostedt <rostedt@goodmis.org> wrote:
> 
> > I'm a bit nervous about the 'active' role of (trace_)events, because of the 
> > way multiple callbacks can be registered. How would:
> > 
> > 	err = event_x();
> > 	if (err == -EACCESS) {
> > 
> > be handled? [...]
> 
> The default behavior would be something obvious: to trigger all callbacks and 
> use the first non-zero return value.


But how do we know which callback that was from? There's no ordering of
what callbacks are called first.

> 
> > [...] Would we need a way to prioritize which call back gets the return 
> > value? One way I guess would be to add a check_event option, where you pass 
> > in an ENUM of the event you want:
> > 
> > 	event_x();
> > 	err = check_event_x(MYEVENT);
> > 
> > If something registered itself as "MYEVENT" to event_x, then you get the 
> > return code of MYEVENT. If the MYEVENT was not registered, a -ENODEV or 
> > something could be returned. I'm sure we could even optimize it such a way if 
> > no active events have been registered to event_x, that check_event_x() will 
> > return -ENODEV without any branches.
> 
> I would keep it simple and extensible - that way we can complicate it when the 
> need arises! :)

The above is rather trivial to implement. I don't think it complicates
anything.

-- Steve

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-16 17:03                                                       ` Steven Rostedt
  0 siblings, 0 replies; 406+ messages in thread
From: Steven Rostedt @ 2011-05-16 17:03 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: linux-mips, linux-sh, Peter Zijlstra, Frederic Weisbecker,
	Heiko Carstens, Oleg Nesterov, David Howells, Paul Mackerras,
	Ralf Baechle, H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390,
	Russell King, x86, James Morris, Linus Torvalds, Ingo Molnar,
	linux-arm-kernel, kees.cook, Serge E. Hallyn, Martin Schwidefsky,
	Thomas Gleixner, Roland McGrath, Michal Marek, Michal Simek,
	Will Drewry, linuxppc-dev, linux-kernel, Eric Paris, Paul Mundt,
	Tejun Heo, linux390, Andrew Morton, agl, David S. Miller

On Mon, 2011-05-16 at 18:52 +0200, Ingo Molnar wrote:
> * Steven Rostedt <rostedt@goodmis.org> wrote:
> 
> > I'm a bit nervous about the 'active' role of (trace_)events, because of the 
> > way multiple callbacks can be registered. How would:
> > 
> > 	err = event_x();
> > 	if (err == -EACCESS) {
> > 
> > be handled? [...]
> 
> The default behavior would be something obvious: to trigger all callbacks and 
> use the first non-zero return value.


But how do we know which callback that was from? There's no ordering of
what callbacks are called first.

> 
> > [...] Would we need a way to prioritize which call back gets the return 
> > value? One way I guess would be to add a check_event option, where you pass 
> > in an ENUM of the event you want:
> > 
> > 	event_x();
> > 	err = check_event_x(MYEVENT);
> > 
> > If something registered itself as "MYEVENT" to event_x, then you get the 
> > return code of MYEVENT. If the MYEVENT was not registered, a -ENODEV or 
> > something could be returned. I'm sure we could even optimize it such a way if 
> > no active events have been registered to event_x, that check_event_x() will 
> > return -ENODEV without any branches.
> 
> I would keep it simple and extensible - that way we can complicate it when the 
> need arises! :)

The above is rather trivial to implement. I don't think it complicates
anything.

-- Steve

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-16 17:03                                                       ` Steven Rostedt
  0 siblings, 0 replies; 406+ messages in thread
From: Steven Rostedt @ 2011-05-16 17:03 UTC (permalink / raw)
  To: linux-arm-kernel

On Mon, 2011-05-16 at 18:52 +0200, Ingo Molnar wrote:
> * Steven Rostedt <rostedt@goodmis.org> wrote:
> 
> > I'm a bit nervous about the 'active' role of (trace_)events, because of the 
> > way multiple callbacks can be registered. How would:
> > 
> > 	err = event_x();
> > 	if (err == -EACCESS) {
> > 
> > be handled? [...]
> 
> The default behavior would be something obvious: to trigger all callbacks and 
> use the first non-zero return value.


But how do we know which callback that was from? There's no ordering of
what callbacks are called first.

> 
> > [...] Would we need a way to prioritize which call back gets the return 
> > value? One way I guess would be to add a check_event option, where you pass 
> > in an ENUM of the event you want:
> > 
> > 	event_x();
> > 	err = check_event_x(MYEVENT);
> > 
> > If something registered itself as "MYEVENT" to event_x, then you get the 
> > return code of MYEVENT. If the MYEVENT was not registered, a -ENODEV or 
> > something could be returned. I'm sure we could even optimize it such a way if 
> > no active events have been registered to event_x, that check_event_x() will 
> > return -ENODEV without any branches.
> 
> I would keep it simple and extensible - that way we can complicate it when the 
> need arises! :)

The above is rather trivial to implement. I don't think it complicates
anything.

-- Steve

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-16 15:08                                   ` Ingo Molnar
  (?)
@ 2011-05-17  2:24                                     ` James Morris
  -1 siblings, 0 replies; 406+ messages in thread
From: James Morris @ 2011-05-17  2:24 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Will Drewry, linux-kernel, Steven Rostedt, Frederic Weisbecker,
	Eric Paris, kees.cook, agl, Peter Zijlstra, Serge E. Hallyn,
	Ingo Molnar, Andrew Morton, Tejun Heo, Michal Marek,
	Oleg Nesterov, Jiri Slaby, Russell King, Michal Simek,
	Ralf Baechle, Benjamin Herrenschmidt, Paul Mackerras,
	Martin Schwidefsky, Heiko Carstens, linux390, Paul Mundt,
	David S. Miller, Thomas Gleixner, H. Peter Anvin, x86,
	Peter Zijlstra, linux-arm-kernel, linux-mips, linuxppc-dev,
	linux-s390, linux-sh, sparclinux, Linus Torvalds

On Mon, 16 May 2011, Ingo Molnar wrote:

> > Not really.
> > 
> > Firstly, what is the security goal of these restrictions? [...]
> 
> To do what i described above? Namely:
> 
>  " Sandboxed code should only be allowed to open files in /home/sandbox/, /lib/
>    and /usr/lib/ "

These are access rules, they don't really describe a high-level security 
goal.  How do you know it's ok to open everything in these directories?


- James
-- 
James Morris
<jmorris@namei.org>

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-17  2:24                                     ` James Morris
  0 siblings, 0 replies; 406+ messages in thread
From: James Morris @ 2011-05-17  2:24 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: linux-mips, linux-sh, Peter Zijlstra, Frederic Weisbecker,
	Heiko Carstens, Oleg Nesterov, Paul Mackerras, Eric Paris,
	H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390, Russell King,
	x86, Linus Torvalds, Ingo Molnar, kees.cook, Serge E. Hallyn,
	Peter Zijlstra, Steven Rostedt, Tejun Heo, Thomas Gleixner,
	linux-arm-kernel, Michal Marek, Michal Simek, Will Drewry,
	linuxppc-dev, linux-kernel, Ralf Baechle, Paul Mundt,
	Martin Schwidefsky, linux390, Andrew Morton, agl,
	David S. Miller

On Mon, 16 May 2011, Ingo Molnar wrote:

> > Not really.
> > 
> > Firstly, what is the security goal of these restrictions? [...]
> 
> To do what i described above? Namely:
> 
>  " Sandboxed code should only be allowed to open files in /home/sandbox/, /lib/
>    and /usr/lib/ "

These are access rules, they don't really describe a high-level security 
goal.  How do you know it's ok to open everything in these directories?


- James
-- 
James Morris
<jmorris@namei.org>

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-17  2:24                                     ` James Morris
  0 siblings, 0 replies; 406+ messages in thread
From: James Morris @ 2011-05-17  2:24 UTC (permalink / raw)
  To: linux-arm-kernel

On Mon, 16 May 2011, Ingo Molnar wrote:

> > Not really.
> > 
> > Firstly, what is the security goal of these restrictions? [...]
> 
> To do what i described above? Namely:
> 
>  " Sandboxed code should only be allowed to open files in /home/sandbox/, /lib/
>    and /usr/lib/ "

These are access rules, they don't really describe a high-level security 
goal.  How do you know it's ok to open everything in these directories?


- James
-- 
James Morris
<jmorris@namei.org>

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-16 17:03                                                       ` Steven Rostedt
  (?)
@ 2011-05-17 12:42                                                         ` Ingo Molnar
  -1 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-17 12:42 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Peter Zijlstra, James Morris, Will Drewry, linux-kernel,
	Frederic Weisbecker, Eric Paris, kees.cook, agl, Serge E. Hallyn,
	Ingo Molnar, Andrew Morton, Tejun Heo, Michal Marek,
	Oleg Nesterov, Roland McGrath, Jiri Slaby, David Howells,
	Russell King, Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, Thomas Gleixner, H. Peter Anvin,
	x86, linux-arm-kernel, linux-mips, linuxppc-dev, linux-s390,
	linux-sh, sparclinux, Linus Torvalds


* Steven Rostedt <rostedt@goodmis.org> wrote:

> On Mon, 2011-05-16 at 18:52 +0200, Ingo Molnar wrote:
> > * Steven Rostedt <rostedt@goodmis.org> wrote:
> > 
> > > I'm a bit nervous about the 'active' role of (trace_)events, because of the 
> > > way multiple callbacks can be registered. How would:
> > > 
> > > 	err = event_x();
> > > 	if (err == -EACCESS) {
> > > 
> > > be handled? [...]
> > 
> > The default behavior would be something obvious: to trigger all callbacks and 
> > use the first non-zero return value.
> 
> But how do we know which callback that was from? There's no ordering of what 
> callbacks are called first.

We do not have to know that - nor do the calling sites care in general. Do you 
have some specific usecase in mind where the identity of the callback that 
generates a match matters?

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-17 12:42                                                         ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-17 12:42 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: linux-mips, linux-sh, Peter Zijlstra, Frederic Weisbecker,
	Heiko Carstens, Oleg Nesterov, David Howells, Paul Mackerras,
	Ralf Baechle, H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390,
	Russell King, x86, James Morris, Linus Torvalds, Ingo Molnar,
	linux-arm-kernel, kees.cook, Serge E. Hallyn, Martin Schwidefsky,
	Thomas Gleixner, Roland McGrath, Michal Marek, Michal Simek,
	Will Drewry, linuxppc-dev, linux-kernel, Eric Paris, Paul Mundt,
	Tejun Heo, linux390, Andrew Morton, agl, David S. Miller


* Steven Rostedt <rostedt@goodmis.org> wrote:

> On Mon, 2011-05-16 at 18:52 +0200, Ingo Molnar wrote:
> > * Steven Rostedt <rostedt@goodmis.org> wrote:
> > 
> > > I'm a bit nervous about the 'active' role of (trace_)events, because of the 
> > > way multiple callbacks can be registered. How would:
> > > 
> > > 	err = event_x();
> > > 	if (err == -EACCESS) {
> > > 
> > > be handled? [...]
> > 
> > The default behavior would be something obvious: to trigger all callbacks and 
> > use the first non-zero return value.
> 
> But how do we know which callback that was from? There's no ordering of what 
> callbacks are called first.

We do not have to know that - nor do the calling sites care in general. Do you 
have some specific usecase in mind where the identity of the callback that 
generates a match matters?

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-17 12:42                                                         ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-17 12:42 UTC (permalink / raw)
  To: linux-arm-kernel


* Steven Rostedt <rostedt@goodmis.org> wrote:

> On Mon, 2011-05-16 at 18:52 +0200, Ingo Molnar wrote:
> > * Steven Rostedt <rostedt@goodmis.org> wrote:
> > 
> > > I'm a bit nervous about the 'active' role of (trace_)events, because of the 
> > > way multiple callbacks can be registered. How would:
> > > 
> > > 	err = event_x();
> > > 	if (err == -EACCESS) {
> > > 
> > > be handled? [...]
> > 
> > The default behavior would be something obvious: to trigger all callbacks and 
> > use the first non-zero return value.
> 
> But how do we know which callback that was from? There's no ordering of what 
> callbacks are called first.

We do not have to know that - nor do the calling sites care in general. Do you 
have some specific usecase in mind where the identity of the callback that 
generates a match matters?

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-16 15:29                                         ` Will Drewry
  (?)
@ 2011-05-17 12:57                                           ` Ingo Molnar
  -1 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-17 12:57 UTC (permalink / raw)
  To: Will Drewry
  Cc: Eric Paris, James Morris, linux-kernel, Steven Rostedt,
	Frederic Weisbecker, kees.cook, agl, Peter Zijlstra,
	Serge E. Hallyn, Ingo Molnar, Andrew Morton, Tejun Heo,
	Michal Marek, Oleg Nesterov, Jiri Slaby, David Howells,
	Russell King, Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, Thomas Gleixner, H. Peter Anvin,
	x86, Peter Zijlstra, linux-arm-kernel, linux-mips, linuxppc-dev,
	linux-s390, linux-sh, sparclinux, Linus Torvalds


* Will Drewry <wad@chromium.org> wrote:

> > This is *far* more generic still yields the same short-term end result as 
> > far as your sandboxing is concerned.
> 
> Almost :/ [...]

Hey that's a pretty good result from a subsystem that was not written with your 
usecase in mind *at all* ;-)

> [...]  I still need to review the code you've pointed out, but, at present, 
> the ftrace hooks occur after the seccomp and syscall auditing hooks.  This 
> means that that code is exposed no matter what in this model.  To trim the 
> exposed surface to userspace, we really need those early hooks.  While I can 
> see both hacky and less hacky approaches around this, it still strikes me 
> that the seccomp thread flag and early interception are good to reuse.  One 
> option might be to allow seccomp to be a secure-syscall event source, but I 
> suspect that lands more on the hack-y side of the fence :)

Agreed, there should be no security compromise imposed on your usecase, at all.

You could move the event callback sooner into the syscall-entry sequence to 
make sure it's the highest priority thing to process?

There's no semantic dependency on its current location so this can be changed 
AFAICS.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-17 12:57                                           ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-17 12:57 UTC (permalink / raw)
  To: Will Drewry
  Cc: linux-mips, linux-sh, Peter Zijlstra, Frederic Weisbecker,
	Heiko Carstens, Oleg Nesterov, David Howells, Paul Mackerras,
	Eric Paris, H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390,
	Russell King, x86, James Morris, Linus Torvalds, Ingo Molnar,
	kees.cook, Serge E. Hallyn, Peter Zijlstra, Steven Rostedt,
	Tejun Heo, Thomas Gleixner, linux-arm-kernel, Michal Marek,
	Michal Simek, linuxppc-dev, linux-kernel, Ralf Baechle,
	Paul Mundt, Martin Schwidefsky, linux390, Andrew Morton, agl,
	David S. Miller


* Will Drewry <wad@chromium.org> wrote:

> > This is *far* more generic still yields the same short-term end result as 
> > far as your sandboxing is concerned.
> 
> Almost :/ [...]

Hey that's a pretty good result from a subsystem that was not written with your 
usecase in mind *at all* ;-)

> [...]  I still need to review the code you've pointed out, but, at present, 
> the ftrace hooks occur after the seccomp and syscall auditing hooks.  This 
> means that that code is exposed no matter what in this model.  To trim the 
> exposed surface to userspace, we really need those early hooks.  While I can 
> see both hacky and less hacky approaches around this, it still strikes me 
> that the seccomp thread flag and early interception are good to reuse.  One 
> option might be to allow seccomp to be a secure-syscall event source, but I 
> suspect that lands more on the hack-y side of the fence :)

Agreed, there should be no security compromise imposed on your usecase, at all.

You could move the event callback sooner into the syscall-entry sequence to 
make sure it's the highest priority thing to process?

There's no semantic dependency on its current location so this can be changed 
AFAICS.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-17 12:57                                           ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-17 12:57 UTC (permalink / raw)
  To: linux-arm-kernel


* Will Drewry <wad@chromium.org> wrote:

> > This is *far* more generic still yields the same short-term end result as 
> > far as your sandboxing is concerned.
> 
> Almost :/ [...]

Hey that's a pretty good result from a subsystem that was not written with your 
usecase in mind *at all* ;-)

> [...]  I still need to review the code you've pointed out, but, at present, 
> the ftrace hooks occur after the seccomp and syscall auditing hooks.  This 
> means that that code is exposed no matter what in this model.  To trim the 
> exposed surface to userspace, we really need those early hooks.  While I can 
> see both hacky and less hacky approaches around this, it still strikes me 
> that the seccomp thread flag and early interception are good to reuse.  One 
> option might be to allow seccomp to be a secure-syscall event source, but I 
> suspect that lands more on the hack-y side of the fence :)

Agreed, there should be no security compromise imposed on your usecase, at all.

You could move the event callback sooner into the syscall-entry sequence to 
make sure it's the highest priority thing to process?

There's no semantic dependency on its current location so this can be changed 
AFAICS.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-17 12:42                                                         ` Ingo Molnar
  (?)
@ 2011-05-17 13:05                                                           ` Steven Rostedt
  -1 siblings, 0 replies; 406+ messages in thread
From: Steven Rostedt @ 2011-05-17 13:05 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Peter Zijlstra, James Morris, Will Drewry, linux-kernel,
	Frederic Weisbecker, Eric Paris, kees.cook, agl, Serge E. Hallyn,
	Ingo Molnar, Andrew Morton, Tejun Heo, Michal Marek,
	Oleg Nesterov, Roland McGrath, Jiri Slaby, David Howells,
	Russell King, Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, Thomas Gleixner, H. Peter Anvin,
	x86, linux-arm-kernel, linux-mips, linuxppc-dev, linux-s390,
	linux-sh, sparclinux, Linus Torvalds

On Tue, 2011-05-17 at 14:42 +0200, Ingo Molnar wrote:
> * Steven Rostedt <rostedt@goodmis.org> wrote:
> 
> > On Mon, 2011-05-16 at 18:52 +0200, Ingo Molnar wrote:
> > > * Steven Rostedt <rostedt@goodmis.org> wrote:
> > > 
> > > > I'm a bit nervous about the 'active' role of (trace_)events, because of the 
> > > > way multiple callbacks can be registered. How would:
> > > > 
> > > > 	err = event_x();
> > > > 	if (err == -EACCESS) {
> > > > 
> > > > be handled? [...]
> > > 
> > > The default behavior would be something obvious: to trigger all callbacks and 
> > > use the first non-zero return value.
> > 
> > But how do we know which callback that was from? There's no ordering of what 
> > callbacks are called first.
> 
> We do not have to know that - nor do the calling sites care in general. Do you 
> have some specific usecase in mind where the identity of the callback that 
> generates a match matters?

Maybe I'm confused. I was thinking that these event_*() are what we
currently call trace_*(), but the event_*(), I assume, can return a
value if a call back returns one.

Thus, we now have the ability to dynamically attach function calls to
arbitrary points in the kernel that can have an effect on the code that
called it. Right now, we only have the ability to attach function calls
to these locations that have passive effects (tracing/profiling).

But you say, "nor do the calling sites care in general". Then what do
these calling sites do with the return code? Are we limiting these
actions to security only? Or can we have some other feature? I can
envision that we can make the Linux kernel quite dynamic here with "self
modifying code". That is, anywhere we have "hooks", perhaps we could
replace them with dynamic switches (jump labels). Maybe events would not
be the best use, but they could be a generic one.

Knowing what callback returned the result would be beneficial. Right
now, you are saying if the call back returns anything, just abort the
call, not knowing what callback was called.

-- Steve

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-17 13:05                                                           ` Steven Rostedt
  0 siblings, 0 replies; 406+ messages in thread
From: Steven Rostedt @ 2011-05-17 13:05 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: linux-mips, linux-sh, Peter Zijlstra, Frederic Weisbecker,
	Heiko Carstens, Oleg Nesterov, David Howells, Paul Mackerras,
	Ralf Baechle, H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390,
	Russell King, x86, James Morris, Linus Torvalds, Ingo Molnar,
	linux-arm-kernel, kees.cook, Serge E. Hallyn, Martin Schwidefsky,
	Thomas Gleixner, Roland McGrath, Michal Marek, Michal Simek,
	Will Drewry, linuxppc-dev, linux-kernel, Eric Paris, Paul Mundt,
	Tejun Heo, linux390, Andrew Morton, agl, David S. Miller

On Tue, 2011-05-17 at 14:42 +0200, Ingo Molnar wrote:
> * Steven Rostedt <rostedt@goodmis.org> wrote:
> 
> > On Mon, 2011-05-16 at 18:52 +0200, Ingo Molnar wrote:
> > > * Steven Rostedt <rostedt@goodmis.org> wrote:
> > > 
> > > > I'm a bit nervous about the 'active' role of (trace_)events, because of the 
> > > > way multiple callbacks can be registered. How would:
> > > > 
> > > > 	err = event_x();
> > > > 	if (err == -EACCESS) {
> > > > 
> > > > be handled? [...]
> > > 
> > > The default behavior would be something obvious: to trigger all callbacks and 
> > > use the first non-zero return value.
> > 
> > But how do we know which callback that was from? There's no ordering of what 
> > callbacks are called first.
> 
> We do not have to know that - nor do the calling sites care in general. Do you 
> have some specific usecase in mind where the identity of the callback that 
> generates a match matters?

Maybe I'm confused. I was thinking that these event_*() are what we
currently call trace_*(), but the event_*(), I assume, can return a
value if a call back returns one.

Thus, we now have the ability to dynamically attach function calls to
arbitrary points in the kernel that can have an effect on the code that
called it. Right now, we only have the ability to attach function calls
to these locations that have passive effects (tracing/profiling).

But you say, "nor do the calling sites care in general". Then what do
these calling sites do with the return code? Are we limiting these
actions to security only? Or can we have some other feature? I can
envision that we can make the Linux kernel quite dynamic here with "self
modifying code". That is, anywhere we have "hooks", perhaps we could
replace them with dynamic switches (jump labels). Maybe events would not
be the best use, but they could be a generic one.

Knowing what callback returned the result would be beneficial. Right
now, you are saying if the call back returns anything, just abort the
call, not knowing what callback was called.

-- Steve

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-17 13:05                                                           ` Steven Rostedt
  0 siblings, 0 replies; 406+ messages in thread
From: Steven Rostedt @ 2011-05-17 13:05 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, 2011-05-17 at 14:42 +0200, Ingo Molnar wrote:
> * Steven Rostedt <rostedt@goodmis.org> wrote:
> 
> > On Mon, 2011-05-16 at 18:52 +0200, Ingo Molnar wrote:
> > > * Steven Rostedt <rostedt@goodmis.org> wrote:
> > > 
> > > > I'm a bit nervous about the 'active' role of (trace_)events, because of the 
> > > > way multiple callbacks can be registered. How would:
> > > > 
> > > > 	err = event_x();
> > > > 	if (err == -EACCESS) {
> > > > 
> > > > be handled? [...]
> > > 
> > > The default behavior would be something obvious: to trigger all callbacks and 
> > > use the first non-zero return value.
> > 
> > But how do we know which callback that was from? There's no ordering of what 
> > callbacks are called first.
> 
> We do not have to know that - nor do the calling sites care in general. Do you 
> have some specific usecase in mind where the identity of the callback that 
> generates a match matters?

Maybe I'm confused. I was thinking that these event_*() are what we
currently call trace_*(), but the event_*(), I assume, can return a
value if a call back returns one.

Thus, we now have the ability to dynamically attach function calls to
arbitrary points in the kernel that can have an effect on the code that
called it. Right now, we only have the ability to attach function calls
to these locations that have passive effects (tracing/profiling).

But you say, "nor do the calling sites care in general". Then what do
these calling sites do with the return code? Are we limiting these
actions to security only? Or can we have some other feature? I can
envision that we can make the Linux kernel quite dynamic here with "self
modifying code". That is, anywhere we have "hooks", perhaps we could
replace them with dynamic switches (jump labels). Maybe events would not
be the best use, but they could be a generic one.

Knowing what callback returned the result would be beneficial. Right
now, you are saying if the call back returns anything, just abort the
call, not knowing what callback was called.

-- Steve

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-17  2:24                                     ` James Morris
  (?)
@ 2011-05-17 13:10                                       ` Ingo Molnar
  -1 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-17 13:10 UTC (permalink / raw)
  To: James Morris
  Cc: Will Drewry, linux-kernel, Steven Rostedt, Frederic Weisbecker,
	Eric Paris, kees.cook, agl, Peter Zijlstra, Serge E. Hallyn,
	Ingo Molnar, Andrew Morton, Tejun Heo, Michal Marek,
	Oleg Nesterov, Jiri Slaby, Russell King, Michal Simek,
	Ralf Baechle, Benjamin Herrenschmidt, Paul Mackerras,
	Martin Schwidefsky, Heiko Carstens, linux390, Paul Mundt,
	David S. Miller, Thomas Gleixner, H. Peter Anvin, x86,
	Peter Zijlstra, linux-arm-kernel, linux-mips, linuxppc-dev,
	linux-s390, linux-sh, sparclinux, Linus Torvalds


* James Morris <jmorris@namei.org> wrote:

> On Mon, 16 May 2011, Ingo Molnar wrote:
> 
> > > Not really.
> > > 
> > > Firstly, what is the security goal of these restrictions? [...]
> > 
> > To do what i described above? Namely:
> > 
> >  " Sandboxed code should only be allowed to open files in /home/sandbox/, /lib/
> >    and /usr/lib/ "
> 
> These are access rules, they don't really describe a high-level security 
> goal. [...]

Restricting sandboxed code to only open files within a given VFS namespace 
boundary sure sounds like a high-level security goal to me.

If implemented and set up correctly then it restricts sandboxed code to only be 
able to open files reachable via that VFS sub-namespace.

That is a rather meaningful high-level concept. What higher level concept do 
you want to argue?

> [...]  How do you know it's ok to open everything in these directories?

How do you know it's ok to open /etc/hosts? The sysadmin has configured the 
system that way.

How do you know that it's ok for sandboxed code to open files in 
/home/sandbox/? The sandbox developer has configured the system that way.

I'm not sure i get your point.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-17 13:10                                       ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-17 13:10 UTC (permalink / raw)
  To: James Morris
  Cc: linux-mips, linux-sh, Peter Zijlstra, Frederic Weisbecker,
	Heiko Carstens, Oleg Nesterov, Paul Mackerras, Eric Paris,
	H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390, Russell King,
	x86, Linus Torvalds, Ingo Molnar, kees.cook, Serge E. Hallyn,
	Peter Zijlstra, Steven Rostedt, Tejun Heo, Thomas Gleixner,
	linux-arm-kernel, Michal Marek, Michal Simek, Will Drewry,
	linuxppc-dev, linux-kernel, Ralf Baechle, Paul Mundt,
	Martin Schwidefsky, linux390, Andrew Morton, agl,
	David S. Miller


* James Morris <jmorris@namei.org> wrote:

> On Mon, 16 May 2011, Ingo Molnar wrote:
> 
> > > Not really.
> > > 
> > > Firstly, what is the security goal of these restrictions? [...]
> > 
> > To do what i described above? Namely:
> > 
> >  " Sandboxed code should only be allowed to open files in /home/sandbox/, /lib/
> >    and /usr/lib/ "
> 
> These are access rules, they don't really describe a high-level security 
> goal. [...]

Restricting sandboxed code to only open files within a given VFS namespace 
boundary sure sounds like a high-level security goal to me.

If implemented and set up correctly then it restricts sandboxed code to only be 
able to open files reachable via that VFS sub-namespace.

That is a rather meaningful high-level concept. What higher level concept do 
you want to argue?

> [...]  How do you know it's ok to open everything in these directories?

How do you know it's ok to open /etc/hosts? The sysadmin has configured the 
system that way.

How do you know that it's ok for sandboxed code to open files in 
/home/sandbox/? The sandbox developer has configured the system that way.

I'm not sure i get your point.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-17 13:10                                       ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-17 13:10 UTC (permalink / raw)
  To: linux-arm-kernel


* James Morris <jmorris@namei.org> wrote:

> On Mon, 16 May 2011, Ingo Molnar wrote:
> 
> > > Not really.
> > > 
> > > Firstly, what is the security goal of these restrictions? [...]
> > 
> > To do what i described above? Namely:
> > 
> >  " Sandboxed code should only be allowed to open files in /home/sandbox/, /lib/
> >    and /usr/lib/ "
> 
> These are access rules, they don't really describe a high-level security 
> goal. [...]

Restricting sandboxed code to only open files within a given VFS namespace 
boundary sure sounds like a high-level security goal to me.

If implemented and set up correctly then it restricts sandboxed code to only be 
able to open files reachable via that VFS sub-namespace.

That is a rather meaningful high-level concept. What higher level concept do 
you want to argue?

> [...]  How do you know it's ok to open everything in these directories?

How do you know it's ok to open /etc/hosts? The sysadmin has configured the 
system that way.

How do you know that it's ok for sandboxed code to open files in 
/home/sandbox/? The sandbox developer has configured the system that way.

I'm not sure i get your point.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-17 13:05                                                           ` Steven Rostedt
  (?)
@ 2011-05-17 13:19                                                             ` Ingo Molnar
  -1 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-17 13:19 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Peter Zijlstra, James Morris, Will Drewry, linux-kernel,
	Frederic Weisbecker, Eric Paris, kees.cook, agl, Serge E. Hallyn,
	Ingo Molnar, Andrew Morton, Tejun Heo, Michal Marek,
	Oleg Nesterov, Roland McGrath, Jiri Slaby, David Howells,
	Russell King, Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, Thomas Gleixner, H. Peter Anvin,
	x86, linux-arm-kernel, linux-mips, linuxppc-dev, linux-s390,
	linux-sh, sparclinux, Linus Torvalds


* Steven Rostedt <rostedt@goodmis.org> wrote:

> On Tue, 2011-05-17 at 14:42 +0200, Ingo Molnar wrote:
> > * Steven Rostedt <rostedt@goodmis.org> wrote:
> > 
> > > On Mon, 2011-05-16 at 18:52 +0200, Ingo Molnar wrote:
> > > > * Steven Rostedt <rostedt@goodmis.org> wrote:
> > > > 
> > > > > I'm a bit nervous about the 'active' role of (trace_)events, because of the 
> > > > > way multiple callbacks can be registered. How would:
> > > > > 
> > > > > 	err = event_x();
> > > > > 	if (err == -EACCESS) {
> > > > > 
> > > > > be handled? [...]
> > > > 
> > > > The default behavior would be something obvious: to trigger all callbacks and 
> > > > use the first non-zero return value.
> > > 
> > > But how do we know which callback that was from? There's no ordering of what 
> > > callbacks are called first.
> > 
> > We do not have to know that - nor do the calling sites care in general. Do you 
> > have some specific usecase in mind where the identity of the callback that 
> > generates a match matters?
> 
> Maybe I'm confused. I was thinking that these event_*() are what we
> currently call trace_*(), but the event_*(), I assume, can return a
> value if a call back returns one.

Yeah - and the call site can treat it as:

 - Ugh, if i get an error i need to abort whatever i was about to do

or (more advanced future use):

 - If i get a positive value i need to re-evaluate the parameters that were 
   passed in, they were changed

> Thus, we now have the ability to dynamically attach function calls to 
> arbitrary points in the kernel that can have an effect on the code that 
> called it. Right now, we only have the ability to attach function calls to 
> these locations that have passive effects (tracing/profiling).

Well, they can only have the effect that the calling site accepts and handles. 
So the 'effect' is not arbitrary and not defined by the callbacks, it is 
controlled and handled by the calling code.

We do not want invisible side-effects, opaque hooks, etc.

Instead of that we want (this is the getname() example i cited in the thread) 
explicit effects, like:

 if (event_vfs_getname(result))
	return ERR_PTR(-EPERM);

> But you say, "nor do the calling sites care in general". Then what do
> these calling sites do with the return code? Are we limiting these
> actions to security only? Or can we have some other feature. [...]

Yeah, not just security. One other example that came up recently is whether to 
panic the box on certain (bad) events such as NMI errors. This too could be 
made flexible via the event filter code: we already capture many events, so 
places that might conceivably do some policy could do so based on a filter 
condition.

> [...] I can envision that we can make the Linux kernel quite dynamic here 
> with "self modifying code". That is, anywhere we have "hooks", perhaps we 
> could replace them with dynamic switches (jump labels). Maybe events would 
> not be the best use, but they could be a generic one.

events and explicit function calls and explicit side-effects are pretty much 
the only thing that are acceptable. We do not want opaque hooks and arbitrary 
side-effects.

> Knowing what callback returned the result would be beneficial. Right now, you 
> are saying if the call back returns anything, just abort the call, not knowing 
> what callback was called.

Yeah, and that's a feature: that way a number of conditions can be attached. 
Multiple security frameworks may have effect on a task or multiple tools might 
set policy action on a given event.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-17 13:19                                                             ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-17 13:19 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: linux-mips, linux-sh, Peter Zijlstra, Frederic Weisbecker,
	Heiko Carstens, Oleg Nesterov, David Howells, Paul Mackerras,
	Ralf Baechle, H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390,
	Russell King, x86, James Morris, Linus Torvalds, Ingo Molnar,
	linux-arm-kernel, kees.cook, Serge E. Hallyn, Martin Schwidefsky,
	Thomas Gleixner, Roland McGrath, Michal Marek, Michal Simek,
	Will Drewry, linuxppc-dev, linux-kernel, Eric Paris, Paul Mundt,
	Tejun Heo, linux390, Andrew Morton, agl, David S. Miller


* Steven Rostedt <rostedt@goodmis.org> wrote:

> On Tue, 2011-05-17 at 14:42 +0200, Ingo Molnar wrote:
> > * Steven Rostedt <rostedt@goodmis.org> wrote:
> > 
> > > On Mon, 2011-05-16 at 18:52 +0200, Ingo Molnar wrote:
> > > > * Steven Rostedt <rostedt@goodmis.org> wrote:
> > > > 
> > > > > I'm a bit nervous about the 'active' role of (trace_)events, because of the 
> > > > > way multiple callbacks can be registered. How would:
> > > > > 
> > > > > 	err = event_x();
> > > > > 	if (err == -EACCESS) {
> > > > > 
> > > > > be handled? [...]
> > > > 
> > > > The default behavior would be something obvious: to trigger all callbacks and 
> > > > use the first non-zero return value.
> > > 
> > > But how do we know which callback that was from? There's no ordering of what 
> > > callbacks are called first.
> > 
> > We do not have to know that - nor do the calling sites care in general. Do you 
> > have some specific usecase in mind where the identity of the callback that 
> > generates a match matters?
> 
> Maybe I'm confused. I was thinking that these event_*() are what we
> currently call trace_*(), but the event_*(), I assume, can return a
> value if a call back returns one.

Yeah - and the call site can treat it as:

 - Ugh, if i get an error i need to abort whatever i was about to do

or (more advanced future use):

 - If i get a positive value i need to re-evaluate the parameters that were 
   passed in, they were changed

> Thus, we now have the ability to dynamically attach function calls to 
> arbitrary points in the kernel that can have an effect on the code that 
> called it. Right now, we only have the ability to attach function calls to 
> these locations that have passive effects (tracing/profiling).

Well, they can only have the effect that the calling site accepts and handles. 
So the 'effect' is not arbitrary and not defined by the callbacks, it is 
controlled and handled by the calling code.

We do not want invisible side-effects, opaque hooks, etc.

Instead of that we want (this is the getname() example i cited in the thread) 
explicit effects, like:

 if (event_vfs_getname(result))
	return ERR_PTR(-EPERM);

> But you say, "nor do the calling sites care in general". Then what do
> these calling sites do with the return code? Are we limiting these
> actions to security only? Or can we have some other feature. [...]

Yeah, not just security. One other example that came up recently is whether to 
panic the box on certain (bad) events such as NMI errors. This too could be 
made flexible via the event filter code: we already capture many events, so 
places that might conceivably do some policy could do so based on a filter 
condition.

> [...] I can envision that we can make the Linux kernel quite dynamic here 
> with "self modifying code". That is, anywhere we have "hooks", perhaps we 
> could replace them with dynamic switches (jump labels). Maybe events would 
> not be the best use, but they could be a generic one.

events and explicit function calls and explicit side-effects are pretty much 
the only thing that are acceptable. We do not want opaque hooks and arbitrary 
side-effects.

> Knowing what callback returned the result would be beneficial. Right now, you 
> are saying if the call back returns anything, just abort the call, not knowing 
> what callback was called.

Yeah, and that's a feature: that way a number of conditions can be attached. 
Multiple security frameworks may have effect on a task or multiple tools might 
set policy action on a given event.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-17 13:19                                                             ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-17 13:19 UTC (permalink / raw)
  To: linux-arm-kernel


* Steven Rostedt <rostedt@goodmis.org> wrote:

> On Tue, 2011-05-17 at 14:42 +0200, Ingo Molnar wrote:
> > * Steven Rostedt <rostedt@goodmis.org> wrote:
> > 
> > > On Mon, 2011-05-16 at 18:52 +0200, Ingo Molnar wrote:
> > > > * Steven Rostedt <rostedt@goodmis.org> wrote:
> > > > 
> > > > > I'm a bit nervous about the 'active' role of (trace_)events, because of the 
> > > > > way multiple callbacks can be registered. How would:
> > > > > 
> > > > > 	err = event_x();
> > > > > 	if (err == -EACCESS) {
> > > > > 
> > > > > be handled? [...]
> > > > 
> > > > The default behavior would be something obvious: to trigger all callbacks and 
> > > > use the first non-zero return value.
> > > 
> > > But how do we know which callback that was from? There's no ordering of what 
> > > callbacks are called first.
> > 
> > We do not have to know that - nor do the calling sites care in general. Do you 
> > have some specific usecase in mind where the identity of the callback that 
> > generates a match matters?
> 
> Maybe I'm confused. I was thinking that these event_*() are what we
> currently call trace_*(), but the event_*(), I assume, can return a
> value if a call back returns one.

Yeah - and the call site can treat it as:

 - Ugh, if i get an error i need to abort whatever i was about to do

or (more advanced future use):

 - If i get a positive value i need to re-evaluate the parameters that were 
   passed in, they were changed

> Thus, we now have the ability to dynamically attach function calls to 
> arbitrary points in the kernel that can have an effect on the code that 
> called it. Right now, we only have the ability to attach function calls to 
> these locations that have passive effects (tracing/profiling).

Well, they can only have the effect that the calling site accepts and handles. 
So the 'effect' is not arbitrary and not defined by the callbacks, it is 
controlled and handled by the calling code.

We do not want invisible side-effects, opaque hooks, etc.

Instead of that we want (this is the getname() example i cited in the thread) 
explicit effects, like:

 if (event_vfs_getname(result))
	return ERR_PTR(-EPERM);

> But you say, "nor do the calling sites care in general". Then what do
> these calling sites do with the return code? Are we limiting these
> actions to security only? Or can we have some other feature. [...]

Yeah, not just security. One other example that came up recently is whether to 
panic the box on certain (bad) events such as NMI errors. This too could be 
made flexible via the event filter code: we already capture many events, so 
places that might conceivably do some policy could do so based on a filter 
condition.

> [...] I can envision that we can make the Linux kernel quite dynamic here 
> with "self modifying code". That is, anywhere we have "hooks", perhaps we 
> could replace them with dynamic switches (jump labels). Maybe events would 
> not be the best use, but they could be a generic one.

events and explicit function calls and explicit side-effects are pretty much 
the only thing that are acceptable. We do not want opaque hooks and arbitrary 
side-effects.

> Knowing what callback returned the result would be beneficial. Right now, you 
> are saying if the call back returns anything, just abort the call, not knowing 
> what callback was called.

Yeah, and that's a feature: that way a number of conditions can be attached. 
Multiple security frameworks may have effect on a task or multiple tools might 
set policy action on a given event.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-17 13:29                                         ` James Morris
  0 siblings, 0 replies; 406+ messages in thread
From: James Morris @ 2011-05-17 13:29 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Will Drewry, linux-kernel, Steven Rostedt, Frederic Weisbecker,
	Eric Paris, kees.cook, agl, Peter Zijlstra, Serge E. Hallyn,
	Ingo Molnar, Andrew Morton, Tejun Heo, Michal Marek,
	Oleg Nesterov, Jiri Slaby, Russell King, Michal Simek,
	Ralf Baechle, Benjamin Herrenschmidt, Paul Mackerras,
	Martin Schwidefsky, Heiko Carstens, linux390, Paul Mundt,
	David S. Miller, Thomas Gleixner, H. Peter Anvin, x86,
	Peter Zijlstra, linux-arm-kernel, linux-mips, linuxppc-dev,
	linux-s390, linux-sh, sparclinux, Linus Torvalds

On Tue, 17 May 2011, Ingo Molnar wrote:

> I'm not sure i get your point.

Your example was not complete as described.  After an apparently simple 
specification, you've since added several qualifiers and assumptions, and 
I still doubt that it's complete.

A higher level goal would look like

"Allow a sandbox app access only to approved resources, to contain the 
effects of flaws in the app", or similar.

Note that this includes a threat model (remote attacker taking control of 
the app) and a general and fully stated strategy for dealing with it.

From there, you can start to analyze how to implement the goal, at which 
point you'd start thinking about configuration, assumptions, filesystem 
access, namespaces, indirect access (e.g. via sockets, rpc, ipc, shared 
memory, invocation).

Anyway, this is getting off track from the main discussion, but you 
asked...



- James
-- 
James Morris
<jmorris@namei.org>

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-17 13:29                                         ` James Morris
  0 siblings, 0 replies; 406+ messages in thread
From: James Morris @ 2011-05-17 13:29 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Will Drewry, linux-kernel, Steven Rostedt, Frederic Weisbecker,
	Eric Paris, kees.cook, agl, Peter Zijlstra, Serge E. Hallyn,
	Ingo Molnar, Andrew Morton, Tejun Heo, Michal Marek,
	Oleg Nesterov, Jiri Slaby, Russell King, Michal Simek,
	Ralf Baechle, Benjamin Herrenschmidt, Paul Mackerras,
	Martin Schwidefsky, Heiko Carstens, linux390, Paul Mundt,
	David S. Miller, Thomas Gleixner, H. Peter Anvin, x86,
	Peter Zijlstra, linux-arm-kernel, linux-mips, linuxppc-dev,
	linux-s390, linux-sh, sparclinux, Linus Torvalds

On Tue, 17 May 2011, Ingo Molnar wrote:

> I'm not sure i get your point.

Your example was not complete as described.  After an apparently simple 
specification, you've since added several qualifiers and assumptions, and 
I still doubt that it's complete.

A higher level goal would look like

"Allow a sandbox app access only to approved resources, to contain the 
effects of flaws in the app", or similar.

Note that this includes a threat model (remote attacker taking control of 
the app) and a general and fully stated strategy for dealing with it.

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-17 13:29                                         ` James Morris
  0 siblings, 0 replies; 406+ messages in thread
From: James Morris @ 2011-05-17 13:29 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: linux-mips, linux-sh, Peter Zijlstra, Frederic Weisbecker,
	Heiko Carstens, Oleg Nesterov, Paul Mackerras, Eric Paris,
	H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390, Russell King,
	x86, Linus Torvalds, Ingo Molnar, kees.cook, Serge E. Hallyn,
	Peter Zijlstra, Steven Rostedt, Tejun Heo, Thomas Gleixner,
	linux-arm-kernel, Michal Marek, Michal Simek, Will Drewry,
	linuxppc-dev, linux-kernel, Ralf Baechle, Paul Mundt,
	Martin Schwidefsky, linux390, Andrew Morton, agl,
	David S. Miller

On Tue, 17 May 2011, Ingo Molnar wrote:

> I'm not sure i get your point.

Your example was not complete as described.  After an apparently simple 
specification, you've since added several qualifiers and assumptions, and 
I still doubt that it's complete.

A higher level goal would look like

"Allow a sandbox app access only to approved resources, to contain the 
effects of flaws in the app", or similar.

Note that this includes a threat model (remote attacker taking control of 
the app) and a general and fully stated strategy for dealing with it.

From there, you can start to analyze how to implement the goal, at which 
point you'd start thinking about configuration, assumptions, filesystem 
access, namespaces, indirect access (e.g. via sockets, rpc, ipc, shared 
memory, invocation).

Anyway, this is getting off track from the main discussion, but you 
asked...



- James
-- 
James Morris
<jmorris@namei.org>

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-17 13:29                                         ` James Morris
  0 siblings, 0 replies; 406+ messages in thread
From: James Morris @ 2011-05-17 13:29 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, 17 May 2011, Ingo Molnar wrote:

> I'm not sure i get your point.

Your example was not complete as described.  After an apparently simple 
specification, you've since added several qualifiers and assumptions, and 
I still doubt that it's complete.

A higher level goal would look like

"Allow a sandbox app access only to approved resources, to contain the 
effects of flaws in the app", or similar.

Note that this includes a threat model (remote attacker taking control of 
the app) and a general and fully stated strategy for dealing with it.

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-17 13:29                                         ` James Morris
  (?)
@ 2011-05-17 18:34                                           ` Ingo Molnar
  -1 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-17 18:34 UTC (permalink / raw)
  To: James Morris
  Cc: Will Drewry, linux-kernel, Steven Rostedt, Frederic Weisbecker,
	Eric Paris, kees.cook, agl, Peter Zijlstra, Serge E. Hallyn,
	Ingo Molnar, Andrew Morton, Tejun Heo, Michal Marek,
	Oleg Nesterov, Jiri Slaby, Russell King, Michal Simek,
	Ralf Baechle, Benjamin Herrenschmidt, Paul Mackerras,
	Martin Schwidefsky, Heiko Carstens, linux390, Paul Mundt,
	David S. Miller, Thomas Gleixner, H. Peter Anvin, x86,
	Peter Zijlstra, linux-arm-kernel, linux-mips, linuxppc-dev,
	linux-s390, linux-sh, sparclinux, Linus Torvalds


* James Morris <jmorris@namei.org> wrote:

> On Tue, 17 May 2011, Ingo Molnar wrote:
> 
> > I'm not sure i get your point.
> 
> Your example was not complete as described.  After an apparently simple 
> specification, you've since added several qualifiers and assumptions, [...]

I haven't added any qualifiers really (i added examples/description), the opt-in 
method i mentioned in my first mail should be pretty robust:

 | Firstly, using the filter code i deny the various link creation syscalls so 
 | that sandboxed code cannot escape for example by creating a symlink to 
 | outside the permitted VFS namespace. (Note: we opt-in to syscalls, that way 
 | new syscalls added by new kernels are denied by default. The current symlink 
 | creation syscalls are not opted in to.)

> [...] and I still doubt that it's complete.

I could too claim that i doubt that the SELinux kernel implementation is 
secure!

So how about we both come up with specific examples about how it's not secure, 
instead of going down the fear-uncertainty-and-doubt road? ;-)

> A higher level goal would look like
> 
> "Allow a sandbox app access only to approved resources, to contain the 
> effects of flaws in the app", or similar.

I see what you mean.

I really think that "restricting sandboxed code to only open files within a 
given VFS namespace boundary" is the most useful highlevel description here - 
which is really a subset of a "allow a sandbox app access only to an easily 
approved set of files" highlevel concept.

There's no "to contain ..." bit here: *all* of the sandboxed app code is 
untrusted, so there's no 'remote attacker' and we do not limit our threat to 
flaws in the app. We want to contain apps to within a small subset of Linux 
functionality, and we want to do that within regular apps (without having to be 
superuser), full stop.

> Note that this includes a threat model (remote attacker taking control of the 
> app) and a general and fully stated strategy for dealing with it.

Attacker does not have to be remote - most sandboxing concepts protect against 
locally installed plugins/apps/applets. In sandboxing the whole app is 
considered untrusted - not just some flaw in it, abused remotely.

> From there, you can start to analyze how to implement the goal, at which 
> point you'd start thinking about configuration, assumptions, filesystem 
> access, namespaces, indirect access (e.g. via sockets, rpc, ipc, shared 
> memory, invocation).

Sandboxed code generally does not have access to anything fancy like that - if 
it is added then all possible side effects have to be examined.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-17 18:34                                           ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-17 18:34 UTC (permalink / raw)
  To: James Morris
  Cc: linux-mips, linux-sh, Peter Zijlstra, Frederic Weisbecker,
	Heiko Carstens, Oleg Nesterov, Paul Mackerras, Eric Paris,
	H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390, Russell King,
	x86, Linus Torvalds, Ingo Molnar, kees.cook, Serge E. Hallyn,
	Peter Zijlstra, Steven Rostedt, Tejun Heo, Thomas Gleixner,
	linux-arm-kernel, Michal Marek, Michal Simek, Will Drewry,
	linuxppc-dev, linux-kernel, Ralf Baechle, Paul Mundt,
	Martin Schwidefsky, linux390, Andrew Morton, agl,
	David S. Miller


* James Morris <jmorris@namei.org> wrote:

> On Tue, 17 May 2011, Ingo Molnar wrote:
> 
> > I'm not sure i get your point.
> 
> Your example was not complete as described.  After an apparently simple 
> specification, you've since added several qualifiers and assumptions, [...]

I haven't added any qualifiers really (i added examples/description), the opt-in 
method i mentioned in my first mail should be pretty robust:

 | Firstly, using the filter code i deny the various link creation syscalls so 
 | that sandboxed code cannot escape for example by creating a symlink to 
 | outside the permitted VFS namespace. (Note: we opt-in to syscalls, that way 
 | new syscalls added by new kernels are denied by default. The current symlink 
 | creation syscalls are not opted in to.)

> [...] and I still doubt that it's complete.

I could too claim that i doubt that the SELinux kernel implementation is 
secure!

So how about we both come up with specific examples about how it's not secure, 
instead of going down the fear-uncertainty-and-doubt road? ;-)

> A higher level goal would look like
> 
> "Allow a sandbox app access only to approved resources, to contain the 
> effects of flaws in the app", or similar.

I see what you mean.

I really think that "restricting sandboxed code to only open files within a 
given VFS namespace boundary" is the most useful highlevel description here - 
which is really a subset of a "allow a sandbox app access only to an easily 
approved set of files" highlevel concept.

There's no "to contain ..." bit here: *all* of the sandboxed app code is 
untrusted, so there's no 'remote attacker' and we do not limit our threat to 
flaws in the app. We want to contain apps to within a small subset of Linux 
functionality, and we want to do that within regular apps (without having to be 
superuser), full stop.

> Note that this includes a threat model (remote attacker taking control of the 
> app) and a general and fully stated strategy for dealing with it.

Attacker does not have to be remote - most sandboxing concepts protect against 
locally installed plugins/apps/applets. In sandboxing the whole app is 
considered untrusted - not just some flaw in it, abused remotely.

> From there, you can start to analyze how to implement the goal, at which 
> point you'd start thinking about configuration, assumptions, filesystem 
> access, namespaces, indirect access (e.g. via sockets, rpc, ipc, shared 
> memory, invocation).

Sandboxed code generally does not have access to anything fancy like that - if 
it is added then all possible side effects have to be examined.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-17 18:34                                           ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-17 18:34 UTC (permalink / raw)
  To: linux-arm-kernel


* James Morris <jmorris@namei.org> wrote:

> On Tue, 17 May 2011, Ingo Molnar wrote:
> 
> > I'm not sure i get your point.
> 
> Your example was not complete as described.  After an apparently simple 
> specification, you've since added several qualifiers and assumptions, [...]

I haven't added any qualifiers really (i added examples/description), the opt-in 
method i mentioned in my first mail should be pretty robust:

 | Firstly, using the filter code i deny the various link creation syscalls so 
 | that sandboxed code cannot escape for example by creating a symlink to 
 | outside the permitted VFS namespace. (Note: we opt-in to syscalls, that way 
 | new syscalls added by new kernels are denied by default. The current symlink 
 | creation syscalls are not opted in to.)

> [...] and I still doubt that it's complete.

I could too claim that i doubt that the SELinux kernel implementation is 
secure!

So how about we both come up with specific examples about how it's not secure, 
instead of going down the fear-uncertainty-and-doubt road? ;-)

> A higher level goal would look like
> 
> "Allow a sandbox app access only to approved resources, to contain the 
> effects of flaws in the app", or similar.

I see what you mean.

I really think that "restricting sandboxed code to only open files within a 
given VFS namespace boundary" is the most useful highlevel description here - 
which is really a subset of a "allow a sandbox app access only to an easily 
approved set of files" highlevel concept.

There's no "to contain ..." bit here: *all* of the sandboxed app code is 
untrusted, so there's no 'remote attacker' and we do not limit our threat to 
flaws in the app. We want to contain apps to within a small subset of Linux 
functionality, and we want to do that within regular apps (without having to be 
superuser), full stop.

> Note that this includes a threat model (remote attacker taking control of the 
> app) and a general and fully stated strategy for dealing with it.

Attacker does not have to be remote - most sandboxing concepts protect against 
locally installed plugins/apps/applets. In sandboxing the whole app is 
considered untrusted - not just some flaw in it, abused remotely.

> From there, you can start to analyze how to implement the goal, at which 
> point you'd start thinking about configuration, assumptions, filesystem 
> access, namespaces, indirect access (e.g. via sockets, rpc, ipc, shared 
> memory, invocation).

Sandboxed code generally does not have access to anything fancy like that - if 
it is added then all possible side effects have to be examined.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-17 13:19                                                             ` Ingo Molnar
  (?)
@ 2011-05-19  4:07                                                               ` Will Drewry
  -1 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-05-19  4:07 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Steven Rostedt, Peter Zijlstra, James Morris, linux-kernel,
	Frederic Weisbecker, Eric Paris, kees.cook, agl, Serge E. Hallyn,
	Ingo Molnar, Andrew Morton, Tejun Heo, Michal Marek,
	Oleg Nesterov, Roland McGrath, Jiri Slaby, David Howells,
	Russell King, Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, Thomas Gleixner, H. Peter Anvin,
	x86, linux-arm-kernel, linux-mips, linuxppc-dev, linux-s390,
	linux-sh, sparclinux, Linus Torvalds

On Tue, May 17, 2011 at 6:19 AM, Ingo Molnar <mingo@elte.hu> wrote:
>
> * Steven Rostedt <rostedt@goodmis.org> wrote:
>
>> On Tue, 2011-05-17 at 14:42 +0200, Ingo Molnar wrote:
>> > * Steven Rostedt <rostedt@goodmis.org> wrote:
>> >
>> > > On Mon, 2011-05-16 at 18:52 +0200, Ingo Molnar wrote:
>> > > > * Steven Rostedt <rostedt@goodmis.org> wrote:
>> > > >
>> > > > > I'm a bit nervous about the 'active' role of (trace_)events, because of the
>> > > > > way multiple callbacks can be registered. How would:
>> > > > >
>> > > > >       err = event_x();
>> > > > >       if (err == -EACCESS) {
>> > > > >
>> > > > > be handled? [...]
>> > > >
>> > > > The default behavior would be something obvious: to trigger all callbacks and
>> > > > use the first non-zero return value.
>> > >
>> > > But how do we know which callback that was from? There's no ordering of what
>> > > callbacks are called first.
>> >
>> > We do not have to know that - nor do the calling sites care in general. Do you
>> > have some specific usecase in mind where the identity of the callback that
>> > generates a match matters?
>>
>> Maybe I'm confused. I was thinking that these event_*() are what we
>> currently call trace_*(), but the event_*(), I assume, can return a
>> value if a call back returns one.
>
> Yeah - and the call site can treat it as:
>
>  - Ugh, if i get an error i need to abort whatever i was about to do
>
> or (more advanced future use):
>
>  - If i get a positive value i need to re-evaluate the parameters that were
>   passed in, they were changed

Do event_* that return non-void exist in the tree at all now?  I've
looked at the various tracepoint macros as well as some of the other
handlers (trace_function, perf_tp_event, etc) and I'm not seeing any
places where a return value is honored nor could be.  At best, the
perf_tp_event can be short-circuited in the hlist_for_each, but
it'd still need a way to bubble up a failure and result in not calling
the trace/event that the hook precedes.

Am I missing something really obvious?  I don't feel I've gotten a
good handle on exactly how all the tracing code gets triggered, so
perhaps I'm still a level (or three) too shallow. (I can see the asm
hooks for trace functions and I can see where that translates to
registered calls - like trace_function - but I don't see how the
hooked calls can be trivially aborted).

As is, I'm not sure how the perf and ftrace infrastructure could be
reused cleanly without a fair number of hacks to the interface and a
good bit of reworking.  I can already see a number of challenges
around reusing the sys_perf_event_open interface and the fact that
reimplementing something even as simple as seccomp mode=1 seems to
require a fair amount of tweaking to avoid from being leaky.  (E.g.,
enabling all TRACE_EVENT()s for syscalls will miss unhooked syscalls
so either acceptance matching needs to be propagated up the stack
along with some seccomp-like task modality or seccomp-on-perf would
have to depend on sys_enter events with syscall number predicate
matching and fail when a filter discard applies to all active events.)

At present, I'm leaning back towards the v2 series (plus the requested
minor changes) for the benefit of code clarity and its fail-secure
behavior.  Even just considering the reduced case of seccomp mode 1
being implemented on the shared infrastructure, I feel like I'm missing
something that makes it viable.  Any clues?

If not, I don't think a seccomp mode 2 interface via prctl would be
intractable if the long term movement is to a ftrace/perf backend - it
just means that the in-kernel code would change to wrap whatever the
final design ended up being.

Thanks and sorry if I'm being dense!

>> Thus, we now have the ability to dynamically attach function calls to
>> arbitrary points in the kernel that can have an affect on the code that
>> called it. Right now, we only have the ability to attach function calls to
>> these locations that have passive affects (tracing/profiling).
>
> Well, they can only have the effect that the calling site accepts and handles.
> So the 'effect' is not arbitrary and not defined by the callbacks, it is
> controlled and handled by the calling code.
>
> We do not want invisible side-effects, opaque hooks, etc.
>
> Instead of that we want (this is the getname() example i cited in the thread)
> explicit effects, like:
>
>  if (event_vfs_getname(result))
>        return ERR_PTR(-EPERM);
>
>> But you say, "nor do the calling sites care in general". Then what do
>> these calling sites do with the return code? Are we limiting these
>> actions to security only? Or can we have some other feature. [...]
>
> Yeah, not just security. One other example that came up recently is whether to
> panic the box on certain (bad) events such as NMI errors. This too could be
> made flexible via the event filter code: we already capture many events, so
> places that might conceivably do some policy could do so based on a filter
> condition.

This sounds great - I just wish I could figure out how it'd work :)

>> [...] I can envision that we can make the Linux kernel quite dynamic here
>> with "self modifying code". That is, anywhere we have "hooks", perhaps we
>> could replace them with dynamic switches (jump labels). Maybe events would
>> not be the best use, but they could be a generic one.
>
> events and explicit function calls and explicit side-effects are pretty much
> the only thing that are acceptable. We do not want opaque hooks and arbitrary
> side-effects.
>
>> Knowing what callback returned the result would be beneficial. Right now, you
>> are saying if the call back return anything, just abort the call, not knowing
>> what callback was called.
>
> Yeah, and that's a feature: that way a number of conditions can be attached.
> Multiple security frameworks may have effect on a task or multiple tools might
> set policy action on a given event.
>
> Thanks,
>
>        Ingo
>

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-19  4:07                                                               ` Will Drewry
  0 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-05-19  4:07 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: linux-mips, linux-sh, Peter Zijlstra, Frederic Weisbecker,
	Heiko Carstens, Oleg Nesterov, David Howells, Paul Mackerras,
	Eric Paris, H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390,
	Russell King, x86, James Morris, Linus Torvalds, Ingo Molnar,
	linux-arm-kernel, kees.cook, Serge E. Hallyn, Steven Rostedt,
	Martin Schwidefsky, Thomas Gleixner, Roland McGrath,
	Michal Marek, Michal Simek, linuxppc-dev, linux-kernel,
	Ralf Baechle, Paul Mundt, Tejun Heo, linux390, Andrew Morton,
	agl, David S. Miller

On Tue, May 17, 2011 at 6:19 AM, Ingo Molnar <mingo@elte.hu> wrote:
>
> * Steven Rostedt <rostedt@goodmis.org> wrote:
>
>> On Tue, 2011-05-17 at 14:42 +0200, Ingo Molnar wrote:
>> > * Steven Rostedt <rostedt@goodmis.org> wrote:
>> >
>> > > On Mon, 2011-05-16 at 18:52 +0200, Ingo Molnar wrote:
>> > > > * Steven Rostedt <rostedt@goodmis.org> wrote:
>> > > >
>> > > > > I'm a bit nervous about the 'active' role of (trace_)events, because of the
>> > > > > way multiple callbacks can be registered. How would:
>> > > > >
>> > > > >       err = event_x();
>> > > > >       if (err == -EACCESS) {
>> > > > >
>> > > > > be handled? [...]
>> > > >
>> > > > The default behavior would be something obvious: to trigger all callbacks and
>> > > > use the first non-zero return value.
>> > >
>> > > But how do we know which callback that was from? There's no ordering of what
>> > > callbacks are called first.
>> >
>> > We do not have to know that - nor do the calling sites care in general. Do you
>> > have some specific usecase in mind where the identity of the callback that
>> > generates a match matters?
>>
>> Maybe I'm confused. I was thinking that these event_*() are what we
>> currently call trace_*(), but the event_*(), I assume, can return a
>> value if a call back returns one.
>
> Yeah - and the call site can treat it as:
>
>  - Ugh, if i get an error i need to abort whatever i was about to do
>
> or (more advanced future use):
>
>  - If i get a positive value i need to re-evaluate the parameters that were
>   passed in, they were changed

Do event_* that return non-void exist in the tree at all now?  I've
looked at the various tracepoint macros as well as some of the other
handlers (trace_function, perf_tp_event, etc) and I'm not seeing any
places where a return value is honored nor could be.  At best, the
perf_tp_event can be short-circuited in the hlist_for_each, but
it'd still need a way to bubble up a failure and result in not calling
the trace/event that the hook precedes.

Am I missing something really obvious?  I don't feel I've gotten a
good handle on exactly how all the tracing code gets triggered, so
perhaps I'm still a level (or three) too shallow. (I can see the asm
hooks for trace functions and I can see where that translates to
registered calls - like trace_function - but I don't see how the
hooked calls can be trivially aborted).

As is, I'm not sure how the perf and ftrace infrastructure could be
reused cleanly without a fair number of hacks to the interface and a
good bit of reworking.  I can already see a number of challenges
around reusing the sys_perf_event_open interface and the fact that
reimplementing something even as simple as seccomp mode=1 seems to
require a fair amount of tweaking to avoid from being leaky.  (E.g.,
enabling all TRACE_EVENT()s for syscalls will miss unhooked syscalls
so either acceptance matching needs to be propagated up the stack
along with some seccomp-like task modality or seccomp-on-perf would
have to depend on sys_enter events with syscall number predicate
matching and fail when a filter discard applies to all active events.)

At present, I'm leaning back towards the v2 series (plus the requested
minor changes) for the benefit of code clarity and its fail-secure
behavior.  Even just considering the reduced case of seccomp mode 1
being implemented on the shared infrastructure, I feel like I'm missing
something that makes it viable.  Any clues?

If not, I don't think a seccomp mode 2 interface via prctl would be
intractable if the long term movement is to a ftrace/perf backend - it
just means that the in-kernel code would change to wrap whatever the
final design ended up being.

Thanks and sorry if I'm being dense!

>> Thus, we now have the ability to dynamically attach function calls to
>> arbitrary points in the kernel that can have an affect on the code that
>> called it. Right now, we only have the ability to attach function calls to
>> these locations that have passive affects (tracing/profiling).
>
> Well, they can only have the effect that the calling site accepts and handles.
> So the 'effect' is not arbitrary and not defined by the callbacks, it is
> controlled and handled by the calling code.
>
> We do not want invisible side-effects, opaque hooks, etc.
>
> Instead of that we want (this is the getname() example i cited in the thread)
> explicit effects, like:
>
>  if (event_vfs_getname(result))
>        return ERR_PTR(-EPERM);
>
>> But you say, "nor do the calling sites care in general". Then what do
>> these calling sites do with the return code? Are we limiting these
>> actions to security only? Or can we have some other feature. [...]
>
> Yeah, not just security. One other example that came up recently is whether to
> panic the box on certain (bad) events such as NMI errors. This too could be
> made flexible via the event filter code: we already capture many events, so
> places that might conceivably do some policy could do so based on a filter
> condition.

This sounds great - I just wish I could figure out how it'd work :)

>> [...] I can envision that we can make the Linux kernel quite dynamic here
>> with "self modifying code". That is, anywhere we have "hooks", perhaps we
>> could replace them with dynamic switches (jump labels). Maybe events would
>> not be the best use, but they could be a generic one.
>
> events and explicit function calls and explicit side-effects are pretty much
> the only thing that are acceptable. We do not want opaque hooks and arbitrary
> side-effects.
>
>> Knowing what callback returned the result would be beneficial. Right now, you
>> are saying if the call back return anything, just abort the call, not knowing
>> what callback was called.
>
> Yeah, and that's a feature: that way a number of conditions can be attached.
> Multiple security frameworks may have effect on a task or multiple tools might
> set policy action on a given event.
>
> Thanks,
>
>        Ingo
>

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-19  4:07                                                               ` Will Drewry
  0 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-05-19  4:07 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, May 17, 2011 at 6:19 AM, Ingo Molnar <mingo@elte.hu> wrote:
>
> * Steven Rostedt <rostedt@goodmis.org> wrote:
>
>> On Tue, 2011-05-17 at 14:42 +0200, Ingo Molnar wrote:
>> > * Steven Rostedt <rostedt@goodmis.org> wrote:
>> >
>> > > On Mon, 2011-05-16 at 18:52 +0200, Ingo Molnar wrote:
>> > > > * Steven Rostedt <rostedt@goodmis.org> wrote:
>> > > >
>> > > > > I'm a bit nervous about the 'active' role of (trace_)events, because of the
>> > > > > way multiple callbacks can be registered. How would:
>> > > > >
>> > > > >       err = event_x();
>> > > > >       if (err == -EACCESS) {
>> > > > >
>> > > > > be handled? [...]
>> > > >
>> > > > The default behavior would be something obvious: to trigger all callbacks and
>> > > > use the first non-zero return value.
>> > >
>> > > But how do we know which callback that was from? There's no ordering of what
>> > > callbacks are called first.
>> >
>> > We do not have to know that - nor do the calling sites care in general. Do you
>> > have some specific usecase in mind where the identity of the callback that
>> > generates a match matters?
>>
>> Maybe I'm confused. I was thinking that these event_*() are what we
>> currently call trace_*(), but the event_*(), I assume, can return a
>> value if a call back returns one.
>
> Yeah - and the call site can treat it as:
>
>  - Ugh, if i get an error i need to abort whatever i was about to do
>
> or (more advanced future use):
>
>  - If i get a positive value i need to re-evaluate the parameters that were
>   passed in, they were changed

Do event_* that return non-void exist in the tree at all now?  I've
looked at the various tracepoint macros as well as some of the other
handlers (trace_function, perf_tp_event, etc) and I'm not seeing any
places where a return value is honored nor could be.  At best, the
perf_tp_event can be short-circuited in the hlist_for_each, but
it'd still need a way to bubble up a failure and result in not calling
the trace/event that the hook precedes.

Am I missing something really obvious?  I don't feel I've gotten a
good handle on exactly how all the tracing code gets triggered, so
perhaps I'm still a level (or three) too shallow. (I can see the asm
hooks for trace functions and I can see where that translates to
registered calls - like trace_function - but I don't see how the
hooked calls can be trivially aborted).

As is, I'm not sure how the perf and ftrace infrastructure could be
reused cleanly without a fair number of hacks to the interface and a
good bit of reworking.  I can already see a number of challenges
around reusing the sys_perf_event_open interface and the fact that
reimplementing something even as simple as seccomp mode=1 seems to
require a fair amount of tweaking to avoid from being leaky.  (E.g.,
enabling all TRACE_EVENT()s for syscalls will miss unhooked syscalls
so either acceptance matching needs to be propagated up the stack
along with some seccomp-like task modality or seccomp-on-perf would
have to depend on sys_enter events with syscall number predicate
matching and fail when a filter discard applies to all active events.)

At present, I'm leaning back towards the v2 series (plus the requested
minor changes) for the benefit of code clarity and its fail-secure
behavior.  Even just considering the reduced case of seccomp mode 1
being implemented on the shared infrastructure, I feel like I'm missing
something that makes it viable.  Any clues?

If not, I don't think a seccomp mode 2 interface via prctl would be
intractable if the long term movement is to a ftrace/perf backend - it
just means that the in-kernel code would change to wrap whatever the
final design ended up being.

Thanks and sorry if I'm being dense!

>> Thus, we now have the ability to dynamically attach function calls to
>> arbitrary points in the kernel that can have an affect on the code that
>> called it. Right now, we only have the ability to attach function calls to
>> these locations that have passive affects (tracing/profiling).
>
> Well, they can only have the effect that the calling site accepts and handles.
> So the 'effect' is not arbitrary and not defined by the callbacks, it is
> controlled and handled by the calling code.
>
> We do not want invisible side-effects, opaque hooks, etc.
>
> Instead of that we want (this is the getname() example i cited in the thread)
> explicit effects, like:
>
>  if (event_vfs_getname(result))
>        return ERR_PTR(-EPERM);
>
>> But you say, "nor do the calling sites care in general". Then what do
>> these calling sites do with the return code? Are we limiting these
>> actions to security only? Or can we have some other feature. [...]
>
> Yeah, not just security. One other example that came up recently is whether to
> panic the box on certain (bad) events such as NMI errors. This too could be
> made flexible via the event filter code: we already capture many events, so
> places that might conceivably do some policy could do so based on a filter
> condition.

This sounds great - I just wish I could figure out how it'd work :)

>> [...] I can envision that we can make the Linux kernel quite dynamic here
>> with "self modifying code". That is, anywhere we have "hooks", perhaps we
>> could replace them with dynamic switches (jump labels). Maybe events would
>> not be the best use, but they could be a generic one.
>
> events and explicit function calls and explicit side-effects are pretty much
> the only thing that are acceptable. We do not want opaque hooks and arbitrary
> side-effects.
>
>> Knowing what callback returned the result would be beneficial. Right now, you
>> are saying if the call back return anything, just abort the call, not knowing
>> what callback was called.
>
> Yeah, and that's a feature: that way a number of conditions can be attached.
> Multiple security frameworks may have effect on a task or multiple tools might
> set policy action on a given event.
>
> Thanks,
>
>        Ingo
>

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-19  4:07                                                               ` Will Drewry
  (?)
@ 2011-05-19 12:22                                                                 ` Steven Rostedt
  -1 siblings, 0 replies; 406+ messages in thread
From: Steven Rostedt @ 2011-05-19 12:22 UTC (permalink / raw)
  To: Will Drewry
  Cc: Ingo Molnar, Peter Zijlstra, James Morris, linux-kernel,
	Frederic Weisbecker, Eric Paris, kees.cook, agl, Serge E. Hallyn,
	Ingo Molnar, Andrew Morton, Tejun Heo, Michal Marek,
	Oleg Nesterov, Roland McGrath, Jiri Slaby, David Howells,
	Russell King, Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, Thomas Gleixner, H. Peter Anvin,
	x86, linux-arm-kernel, linux-mips, linuxppc-dev, linux-s390,
	linux-sh, sparclinux, Linus Torvalds

On Wed, 2011-05-18 at 21:07 -0700, Will Drewry wrote:

> Do event_* that return non-void exist in the tree at all now?  I've
> looked at the various tracepoint macros as well as some of the other
> handlers (trace_function, perf_tp_event, etc) and I'm not seeing any
> places where a return value is honored nor could be.  At best, the
> perf_tp_event can be short-circuited it in the hlist_for_each, but
> it'd still need a way to bubble up a failure and result in not calling
> the trace/event that the hook precedes.

No, none of the current trace hooks have return values. That was what I
was talking about how to implement in my previous emails.

-- Steve

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-19 12:22                                                                 ` Steven Rostedt
  0 siblings, 0 replies; 406+ messages in thread
From: Steven Rostedt @ 2011-05-19 12:22 UTC (permalink / raw)
  To: Will Drewry
  Cc: linux-mips, linux-sh, Peter Zijlstra, Frederic Weisbecker,
	Heiko Carstens, Oleg Nesterov, David Howells, Paul Mackerras,
	Ralf Baechle, H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390,
	Russell King, x86, James Morris, Linus Torvalds, Ingo Molnar,
	linux-arm-kernel, Ingo Molnar, Serge E. Hallyn,
	Martin Schwidefsky, Thomas Gleixner, kees.cook, Roland McGrath,
	Michal Marek, Michal Simek, linuxppc-dev, linux-kernel,
	Eric Paris, Paul Mundt, Tejun Heo, linux390, Andrew Morton, agl,
	David S. Miller

On Wed, 2011-05-18 at 21:07 -0700, Will Drewry wrote:

> Do event_* that return non-void exist in the tree at all now?  I've
> looked at the various tracepoint macros as well as some of the other
> handlers (trace_function, perf_tp_event, etc) and I'm not seeing any
> places where a return value is honored nor could be.  At best, the
> perf_tp_event can be short-circuited it in the hlist_for_each, but
> it'd still need a way to bubble up a failure and result in not calling
> the trace/event that the hook precedes.

No, none of the current trace hooks have return values. That was what I
was talking about how to implement in my previous emails.

-- Steve

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-19 12:22                                                                 ` Steven Rostedt
  0 siblings, 0 replies; 406+ messages in thread
From: Steven Rostedt @ 2011-05-19 12:22 UTC (permalink / raw)
  To: linux-arm-kernel

On Wed, 2011-05-18 at 21:07 -0700, Will Drewry wrote:

> Do event_* that return non-void exist in the tree at all now?  I've
> looked at the various tracepoint macros as well as some of the other
> handlers (trace_function, perf_tp_event, etc) and I'm not seeing any
> places where a return value is honored nor could be.  At best, the
> perf_tp_event can be short-circuited it in the hlist_for_each, but
> it'd still need a way to bubble up a failure and result in not calling
> the trace/event that the hook precedes.

No, none of the current trace hooks have return values. That was what I
was talking about how to implement in my previous emails.

-- Steve

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-19 12:22                                                                 ` Steven Rostedt
  (?)
@ 2011-05-19 21:05                                                                   ` Will Drewry
  -1 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-05-19 21:05 UTC (permalink / raw)
  To: Steven Rostedt, Ingo Molnar, Peter Zijlstra, Frederic Weisbecker
  Cc: James Morris, linux-kernel, Eric Paris, kees.cook, agl,
	Serge E. Hallyn, Ingo Molnar, Andrew Morton, Tejun Heo,
	Michal Marek, Oleg Nesterov, Jiri Slaby, David Howells,
	Russell King, Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, Thomas Gleixner, H. Peter Anvin,
	x86, linux-arm-kernel, linux-mips, linuxppc-dev, linux-s390,
	linux-sh, sparclinux, Linus Torvalds

On Thu, May 19, 2011 at 7:22 AM, Steven Rostedt <rostedt@goodmis.org> wrote:
> On Wed, 2011-05-18 at 21:07 -0700, Will Drewry wrote:
>
>> Do event_* that return non-void exist in the tree at all now?  I've
>> looked at the various tracepoint macros as well as some of the other
>> handlers (trace_function, perf_tp_event, etc) and I'm not seeing any
>> places where a return value is honored nor could be.  At best, the
>> perf_tp_event can be short-circuited it in the hlist_for_each, but
>> it'd still need a way to bubble up a failure and result in not calling
>> the trace/event that the hook precedes.
>
> No, none of the current trace hooks have return values. That was what I
> was talking about how to implement in my previous emails.

Led on by complete ignorance, I think I'm finally beginning to untwist
the different pieces of the tracing infrastructure.  Unfortunately,
that means I took a few wrong turns along the way...

I think function tracing looks something like this:

ftrace_call has been injected at a specific callsite.  Upon hit:
1. ftrace_call triggers
2. does some checks then calls ftrace_trace_function (via mcount instrs)
3. ftrace_trace_function may be a single func or a list. For a list it
will be: ftrace_list_func
4. ftrace_list_func calls each registered hook for that function in a
while loop ignoring return values
5. registered hook funcs may then track the call, farm it out to
specific sub-handlers, etc.

This seems to be a red herring for my use case :/ though this helped
me understand your back and forth (Ingo & Steve) regarding dynamic
versus explicit events.


System call tracing is done via kernel/tracepoint.c events fed in via
arch/[arch]/kernel/ptrace.c where it calls trace_sys_enter.  This
yields direct sys_enter and sys_exit event sources (and an event class
to hook up per-system call events).  This means that
ftrace_syscall_enter() does the event prep prior to doing a filter
check comparing the ftrace_event object for the given syscall_nr to
the event data.  perf_syscall_enter() is similar but it pushes the
info over to perf_tp_event to be matched not against the global
syscall event entry, but against any sub-event in the linked list on
that syscall's event.

Is that roughly an accurate representation of the two?  I wish I
hadn't gotten distracted along the function path, but at least I
learned something (and it is relevant to the larger scope of this
thread's discussion).


After doing that digging, it looks like providing hook call
pre-emption and return value propagation will be a unique and fun task
for each tracer and event subsystem.  If I look solely at tracepoints,
a generic change would be to make the trace_##subsys function return
an int (which I think was the event_vfs_getname proposal?).  The other
option is to change the trace_sys_enter proto to include a 'int
*retcode'.

That change would allow the propagation of some sort of policy
information.  To put it to use, seccomp mode 1 could be implemented on
top of trace_syscalls.  The following changes would need to happen:
1. dummy metadata should be inserted for all unhooked system calls
2. perf_trace_buf_submit would need to return an int or a new
TRACE_REG_SECCOMP_REGISTER handler would need to be setup in
syscall_enter_register.
3. If perf is abused, a "kill/fail_on_discard" bit would be added to
event->attrs.
4. perf_trace_buf_submit/perf_tp_event will return 0 for no matches,
'n' for the number of event matches, and -EACCES/? if a
'fail_on_discard' event is seen.
5. perf_syscall_enter would set *retcode = perf_trace_buf_submit()'s retcode
6. trace_sys_enter() would need to be moved to be the first entry in
arch/../kernel/ptrace.c for incoming syscalls
7. if trace_sys_enter() yields a negative return code, then
do_exit(SIGKILL) the process and return.

Entering into seccomp mode 1 would require adding a  "0" filter for
every system call number (which is why we need a dummy event call for
them since failing to check the bitmask can't be flagged
fail_on_discard) with the fail_on_discard bit.  For the three calls
that are allowed, a '1' filter would be set.

That would roughly implement seccomp mode 1.  It's pretty ugly and the
fact that every system call that's disallowed has to be blacklisted is
not ideal.  An alternate model would be to just use the seccomp mode
as we do today and let secure_computing() handle the return code of "#
of matches".  If the # of matches is 0, it terminates. A
'fail_on_discard' bit then would only be good to stop further
tracepoint callback evaluation.  This approach would also *almost* nix
the need to provide dummy syscall hooks.  (Since sigreturn isn't
hooked on x86 because it uses ptregs fixup, a dummy would still be
needed to apply a "1" filter to.)

Even with that tweak to move to a whitelist model, the perf event
evaluation and tracepoint callback ordering is still not guaranteed.
Without changing tracepoint itself, all other TPs will still execute.
And for perf events, it'll be first-come-first-serve until a
fail_on_discard is hit.

After using seccomp mode=1 as the sample case to reduce scope, it's
possible to ignore it for now :) and look at the seccomp-filter/mode=2
case.  The same mechanism could be used to inject more expressive
filters.  This would be done over the sys_perf_event_open() interface
assuming the new attr is added to stop perf event list enumeration.
Assuming a whitelist model, a call to prctl(PR_SET_SECCOMP, 2) would
be needed (maybe with the ON_EXEC flag option too to mirror the
event->attr on-exec bit). That would yield the ability to register
perf events for system calls then use ioctl() to SET_FILTER on them.

Reducing the privileges of the filters after installation could be
done with another attribute bit like 'set_filter_ands'.  If that is
also set on the event, and a filter is installed to ensure that
sys_perf_event_open is blocked, then it should be reasonably sane.

I'd need to add a PERF_EVENT_IOC_GET_FILTER handler to allow
extracting the settings.

Clearly, I haven't written the code for that yet, though making the
change for a single platform shouldn't be too much code.

So that leaves me with some questions:
- Is this the type of reuse that was being encouraged?
- Does it really make sense to cram this through the perf interface
and events?  While the changed attributes are innocuous and
potentially reusable, it seems that a large amount of the perf
facilities are exposed that could have weird side effects, but I'm
sure I still don't fully grok the perf infrastructure.
- Does extending one tracepoint to provide return values via a pointer
make sense? I'd hesitate to make all tracepoint hooks return an int by
default.
- If all tracepoints returned an int, what would the standard value
look like - or would it need to be per tracepoint impl?
- How is ambiguity resolved if multiple perf_events are registered for
a syscall with different filters?  Maybe a 'stop_on_match'? though
ordering is still a problem then.
- Is there a way to affect a task-wide change without a seccomp flag
(or new task_struct entry) via the existing sys_perf_event_open
syscall?  I considered suggesting an attr bit called 'secure_computing'
that when an event with the bit is enabled, it automagically forces
the task into seccomp enforcement mode, but that, again, seemed
hackish.

While I like the idea of sharing the tracepoints infrastructure and
the trace_syscalls hooks as well as using a pre-existing interface
with very minor changes, I'm worried that the complexity of the
interface and of the infrastructure might undermine the ability to
continue meeting the desired security goals.  I had originally stayed
very close to the seccomp world because of how trivial it is to review
the code and verify its accuracy/effectiveness.  This approach leaves
a lot of gaps for kernel code to seep through and a fair amount of
ambiguity in what locked down syscall filters might look like.

To me, the best part of the above is that it shows that even if we go
with a prctl SET_SECCOMP_FILTER-style interface, it is completely
certain that if a perf/ftrace-based security infrastructure is on our
future, it will be entirely compatible -- even if the prctl()
interface is just the "simpler" interface at that point somewhere down
the road.


Regardless, I'll hack up a proof of concept based on the outline
above. Perhaps something more elegant will emerge once I start
tweaking the source, but I'm still seeing too many gaps to be
comfortable so far.


[[There is a whole other approach to this too. We could continue with
the prctl() interface and mirror the trace_sys_enter model for
secure_computing().   Instead of keeping a seccomp_t-based hlist of
events, they could be stored in a new hlist for seccomp_events in
struct ftrace_event_call.  The ftrace filters would be installed there
and the seccomp_syscall_enter() function could do the checks and pass
up some state data on the task's seccomp_t that indicates it needs to
do_exit().  That would likely reduce the amount of code in
seccomp_filter.c pretty seriously (though not entirely
beneficially).]]


Thanks again for all the feedback and insights! I really hope we can
come to an agreeable approach for implementing kernel attack surface
reduction.
will

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-19 21:05                                                                   ` Will Drewry
  0 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-05-19 21:05 UTC (permalink / raw)
  To: Steven Rostedt, Ingo Molnar, Peter Zijlstra, Frederic Weisbecker
  Cc: linux-mips, linux-sh, Heiko Carstens, Oleg Nesterov,
	David Howells, Paul Mackerras, Ralf Baechle, H. Peter Anvin,
	sparclinux, Jiri Slaby, linux-s390, Russell King, x86,
	James Morris, Linus Torvalds, Ingo Molnar, kees.cook,
	Serge E. Hallyn, Tejun Heo, Thomas Gleixner, linux-arm-kernel,
	Michal Marek, Michal Simek, linuxppc-dev, linux-kernel,
	Eric Paris, Paul Mundt, Martin Schwidefsky, linux390,
	Andrew Morton, agl, David S. Miller

On Thu, May 19, 2011 at 7:22 AM, Steven Rostedt <rostedt@goodmis.org> wrote:
> On Wed, 2011-05-18 at 21:07 -0700, Will Drewry wrote:
>
>> Do event_* that return non-void exist in the tree at all now?  I've
>> looked at the various tracepoint macros as well as some of the other
>> handlers (trace_function, perf_tp_event, etc) and I'm not seeing any
>> places where a return value is honored nor could be.  At best, the
>> perf_tp_event can be short-circuited it in the hlist_for_each, but
>> it'd still need a way to bubble up a failure and result in not calling
>> the trace/event that the hook precedes.
>
> No, none of the current trace hooks have return values. That was what I
> was talking about how to implement in my previous emails.

Led on by complete ignorance, I think I'm finally beginning to untwist
the different pieces of the tracing infrastructure.  Unfortunately,
that means I took a few wrong turns along the way...

I think function tracing looks something like this:

ftrace_call has been injected at a specific callsite.  Upon hit:
1. ftrace_call triggers
2. does some checks then calls ftrace_trace_function (via mcount instrs)
3. ftrace_trace_function may be a single func or a list. For a list it
will be: ftrace_list_func
4. ftrace_list_func calls each registered hook for that function in a
while loop ignoring return values
5. registered hook funcs may then track the call, farm it out to
specific sub-handlers, etc.

This seems to be a red herring for my use case :/ though this helped
me understand your back and forth (Ingo & Steve) regarding dynamic
versus explicit events.


System call tracing is done via kernel/tracepoint.c events fed in via
arch/[arch]/kernel/ptrace.c where it calls trace_sys_enter.  This
yields direct sys_enter and sys_exit event sources (and an event class
to hook up per-system call events).  This means that
ftrace_syscall_enter() does the event prep prior to doing a filter
check comparing the ftrace_event object for the given syscall_nr to
the event data.  perf_sysenter_enter() is similar but it pushes the
info over to perf_tp_event to be matched not against the global
syscall event entry, but against any sub-event in the linked list on
that syscall's event.

Is that roughly an accurate representation of the two?  I wish I
hadn't gotten distracted along the function path, but at least I
learned something (and it is relevant to the larger scope of this
thread's discussion).


After doing that digging, it looks like providing hook call
pre-emption and return value propagation will be a unique and fun task
for each tracer and event subsystem.  If I look solely at tracepoints,
a generic change would be to make the trace_##subsys function return
an int (which I think was the event_vfs_getname proposal?).  The other
option is to change the trace_sys_enter proto to include a 'int
*retcode'.

That change would allow the propagation of some sort of policy
information.  To put it to use, seccomp mode 1 could be implemented on
top of trace_syscalls.  The following changes would need to happen:
1. dummy metadata should be inserted for all unhooked system calls
2. perf_trace_buf_submit would need to return an int or a new
TRACE_REG_SECCOMP_REGISTER handler would need to be setup in
syscall_enter_register.
3. If perf is abused, a "kill/fail_on_discard" bit would be added to
event->attrs.
4. perf_trace_buf_submit/perf_tp_event will return 0 for no matches,
'n' for the number of event matches, and -EACCES/? if a
'fail_on_discard' event is seen.
5. perf_syscall_enter would set *retcode = perf_trace_buf_submit()'s retcode
6. trace_sys_enter() would need to be moved to be the first entry
arch/../kernel/ptrace.c for incoming syscalls
7. if trace_sys_enter() yields a negative return code, then
do_exit(SIGKILL) the process and return.

Entering into seccomp mode 1 would require adding a  "0" filter for
every system call number (which is why we need a dummy event call for
them since failing to check the bitmask can't be flagged
fail_on_discard) with the fail_on_discard bit.  For the three calls
that are allowed, a '1' filter would be set.

That would roughly implement seccomp mode 1.  It's pretty ugly and the
fact that every system call that's disallowed has to be blacklisted is
not ideal.  An alternate model would be to just use the seccomp mode
as we do today and let secure_computing() handle the return code of "#
of matches".  If the # of matches is 0, it terminates. A
'fail_on_discard' bit then would only be good to stop further
tracepoint callback evaluation.  This approach would also *almost* nix
the need to provide dummy syscall hooks.  (Since sigreturn isn't
hooked on x86 because it uses ptregs fixup, a dummy would still be
needed to apply a "1" filter to.)

Even with that tweak to move to a whitelist model, the perf event
evaluation and tracepoint callback ordering is still not guaranteed.
Without changing tracepoint itself, all other TPs will still execute.
And for perf events, it'll be first-come-first-serve until a
fail_on_discard is hit.

After using seccomp mode=1 as the sample case to reduce scope, it's
possible to ignore it for now :) and look at the seccomp-filter/mode=2
case.  The same mechanism could be used to inject more expressive
filters.  This would be done over the sys_perf_event_open() interface
assuming the new attr is added to stop perf event list enumeration.
Assuming a whitelist model, a call to prctl(PR_SET_SECCOMP, 2) would
be needed (maybe with the ON_EXEC flag option too to mirror the
event->attr on-exec bit). That would yield the ability to register
perf events for system calls then use ioctl() to SET_FILTER on them.

Reducing the privileges of the filters after installation could be
done with another attribute bit like 'set_filter_ands'.  If that is
also set on the event, and a filter is installed to ensure that
sys_perf_event_open is blocked, then it should be reasonably sane.

I'd need to add a PERF_EVENT_IOC_GET_FILTER handler to allow
extracting the settings.

Clearly, I haven't written the code for that yet, though making the
change for a single platform shouldn't be too much code.

So that leaves me with some questions:
- Is this the type of reuse that was being encouraged?
- Does it really make sense to cram this through the perf interface
and events?  While the changed attributes are innocuous and
potentially reusable, it seems that a large amount of the perf
facilities are exposed that could have weird side effects, but I'm
sure I still don't fully grok the perf infrastructure.
- Does extending one tracepoint to provide return values via a pointer
make sense? I'd hesitate to make all tracepoint hooks return an int by
default.
- If all tracepoints returned an int, what would the standard value
look like - or would it need to be per tracepoint impl?
- How is ambiguity resolved if multiple perf_events are registered for
a syscall with different filters?  Maybe a 'stop_on_match'? though
ordering is still a problem then.
- Is there a way to affect a task-wide change without a seccomp flag
(or new task_struct entry) via the existing sys_perf_event_open
syscall?  I considered suggesting an attr bit called 'secure_computing'
that when an event with the bit is enabled, it automagically forces
the task into seccomp enforcement mode, but that, again, seemed
hackish.

While I like the idea of sharing the tracepoints infrastructure and
the trace_syscalls hooks as well as using a pre-existing interface
with very minor changes, I'm worried that the complexity of the
interface and of the infrastructure might undermine the ability to
continue meeting the desired security goals.  I had originally stayed
very close to the seccomp world because of how trivial it is to review
the code and verify its accuracy/effectiveness.  This approach leaves
a lot of gaps for kernel code to seep through and a fair amount of
ambiguity in what locked down syscall filters might look like.

To me, the best part of the above is that it shows that even if we go
with a prctl SET_SECCOMP_FILTER-style interface, it is completely
certain that if a perf/ftrace-based security infrastructure is on our
future, it will be entirely compatible -- even if the prctl()
interface is just the "simpler" interface at that point somewhere down
the road.


Regardless, I'll hack up a proof of concept based on the outline
above. Perhaps something more elegant will emerge once I start
tweaking the source, but I'm still seeing too many gaps to be
comfortable so far.


[[There is a whole other approach to this too. We could continue with
the prctl() interface and mirror the trace_sys_enter model for
secure_computing().   Instead of keeping a seccomp_t-based hlist of
events, they could be stored in a new hlist for seccomp_events in
struct ftrace_event_call.  The ftrace filters would be installed there
and the seccomp_syscall_enter() function could do the checks and pass
up some state data on the task's seccomp_t that indicates it needs to
do_exit().  That would likely reduce the amount of code in
seccomp_filter.c pretty seriously (though not entirely
beneficially).]]


Thanks again for all the feedback and insights! I really hope we can
come to an agreeable approach for implementing kernel attack surface
reduction.
will

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-19 21:05                                                                   ` Will Drewry
  0 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-05-19 21:05 UTC (permalink / raw)
  To: linux-arm-kernel

On Thu, May 19, 2011 at 7:22 AM, Steven Rostedt <rostedt@goodmis.org> wrote:
> On Wed, 2011-05-18 at 21:07 -0700, Will Drewry wrote:
>
>> Do event_* that return non-void exist in the tree at all now?  I've
>> looked at the various tracepoint macros as well as some of the other
>> handlers (trace_function, perf_tp_event, etc) and I'm not seeing any
>> places where a return value is honored nor could be.  At best, the
>> perf_tp_event can be short-circuited it in the hlist_for_each, but
>> it'd still need a way to bubble up a failure and result in not calling
>> the trace/event that the hook precedes.
>
> No, none of the current trace hooks have return values. That was what I
> was talking about how to implement in my previous emails.

Led on by complete ignorance, I think I'm finally beginning to untwist
the different pieces of the tracing infrastructure.  Unfortunately,
that means I took a few wrong turns along the way...

I think function tracing looks something like this:

ftrace_call has been injected at a specific callsite.  Upon hit:
1. ftrace_call triggers
2. does some checks then calls ftrace_trace_function (via mcount instrs)
3. ftrace_trace_function may be a single func or a list. For a list it
will be: ftrace_list_func
4. ftrace_list_func calls each registered hook for that function in a
while loop ignoring return values
5. registered hook funcs may then track the call, farm it out to
specific sub-handlers, etc.

This seems to be a red herring for my use case :/ though this helped
me understand your back and forth (Ingo & Steve) regarding dynamic
versus explicit events.


System call tracing is done via kernel/tracepoint.c events fed in via
arch/[arch]/kernel/ptrace.c where it calls trace_sys_enter.  This
yields direct sys_enter and sys_exit event sources (and an event class
to hook up per-system call events).  This means that
ftrace_syscall_enter() does the event prep prior to doing a filter
check comparing the ftrace_event object for the given syscall_nr to
the event data.  perf_sysenter_enter() is similar but it pushes the
info over to perf_tp_event to be matched not against the global
syscall event entry, but against any sub-event in the linked list on
that syscall's event.

Is that roughly an accurate representation of the two?  I wish I
hadn't gotten distracted along the function path, but at least I
learned something (and it is relevant to the larger scope of this
thread's discussion).


After doing that digging, it looks like providing hook call
pre-emption and return value propagation will be a unique and fun task
for each tracer and event subsystem.  If I look solely at tracepoints,
a generic change would be to make the trace_##subsys function return
an int (which I think was the event_vfs_getname proposal?).  The other
option is to change the trace_sys_enter proto to include a 'int
*retcode'.

That change would allow the propagation of some sort of policy
information.  To put it to use, seccomp mode 1 could be implemented on
top of trace_syscalls.  The following changes would need to happen:
1. dummy metadata should be inserted for all unhooked system calls
2. perf_trace_buf_submit would need to return an int or a new
TRACE_REG_SECCOMP_REGISTER handler would need to be setup in
syscall_enter_register.
3. If perf is abused, a "kill/fail_on_discard" bit would be added to
event->attrs.
4. perf_trace_buf_submit/perf_tp_event will return 0 for no matches,
'n' for the number of event matches, and -EACCES/? if a
'fail_on_discard' event is seen.
5. perf_syscall_enter would set *retcode = perf_trace_buf_submit()'s retcode
6. trace_sys_enter() would need to be moved to be the first entry
arch/../kernel/ptrace.c for incoming syscalls
7. if trace_sys_enter() yields a negative return code, then
do_exit(SIGKILL) the process and return.

Entering into seccomp mode 1 would require adding a  "0" filter for
every system call number (which is why we need a dummy event call for
them since failing to check the bitmask can't be flagged
fail_on_discard) with the fail_on_discard bit.  For the three calls
that are allowed, a '1' filter would be set.

That would roughly implement seccomp mode 1.  It's pretty ugly and the
fact that every system call that's disallowed has to be blacklisted is
not ideal.  An alternate model would be to just use the seccomp mode
as we do today and let secure_computing() handle the return code of "#
of matches".  If the # of matches is 0, it terminates. A
'fail_on_discard' bit then would only be good to stop further
tracepoint callback evaluation.  This approach would also *almost* nix
the need to provide dummy syscall hooks.  (Since sigreturn isn't
hooked on x86 because it uses ptregs fixup, a dummy would still be
needed to apply a "1" filter to.)

Even with that tweak to move to a whitelist model, the perf event
evaluation and tracepoint callback ordering is still not guaranteed.
Without changing tracepoint itself, all other TPs will still execute.
And for perf events, it'll be first-come-first-serve until a
fail_on_discard is hit.

After using seccomp mode=1 as the sample case to reduce scope, it's
possible to ignore it for now :) and look at the seccomp-filter/mode=2
case.  The same mechanism could be used to inject more expressive
filters.  This would be done over the sys_perf_event_open() interface
assuming the new attr is added to stop perf event list enumeration.
Assuming a whitelist model, a call to prctl(PR_SET_SECCOMP, 2) would
be needed (maybe with the ON_EXEC flag option too to mirror the
event->attr on-exec bit). That would yield the ability to register
perf events for system calls then use ioctl() to SET_FILTER on them.

Reducing the privileges of the filters after installation could be
done with another attribute bit like 'set_filter_ands'.  If that is
also set on the event, and a filter is installed to ensure that
sys_perf_event_open is blocked, then it should be reasonably sane.

I'd need to add a PERF_EVENT_IOC_GET_FILTER handler to allow
extracting the settings.

Clearly, I haven't written the code for that yet, though making the
change for a single platform shouldn't be too much code.

So that leaves me with some questions:
- Is this the type of reuse that was being encouraged?
- Does it really make sense to cram this through the perf interface
and events?  While the changed attributes are innocuous and
potentially reusable, it seems that a large amount of the perf
facilities are exposed that could have weird side effects, but I'm
sure I still don't fully grok the perf infrastructure.
- Does extending one tracepoint to provide return values via a pointer
make sense? I'd hesitate to make all tracepoint hooks return an int by
default.
- If all tracepoints returned an int, what would the standard value
look like - or would it need to be per tracepoint impl?
- How is ambiguity resolved if multiple perf_events are registered for
a syscall with different filters?  Maybe a 'stop_on_match'? though
ordering is still a problem then.
- Is there a way to affect a task-wide change without a seccomp flag
(or new task_struct entry) via the existing sys_perf_event_open
syscall?  I considered suggesting an attr bit called 'secure_computing'
that when an event with the bit is enabled, it automagically forces
the task into seccomp enforcement mode, but that, again, seemed
hackish.

While I like the idea of sharing the tracepoints infrastructure and
the trace_syscalls hooks as well as using a pre-existing interface
with very minor changes, I'm worried that the complexity of the
interface and of the infrastructure might undermine the ability to
continue meeting the desired security goals.  I had originally stayed
very close to the seccomp world because of how trivial it is to review
the code and verify its accuracy/effectiveness.  This approach leaves
a lot of gaps for kernel code to seep through and a fair amount of
ambiguity in what locked down syscall filters might look like.

To me, the best part of the above is that it shows that even if we go
with a prctl SET_SECCOMP_FILTER-style interface, it is completely
certain that if a perf/ftrace-based security infrastructure is on our
future, it will be entirely compatible -- even if the prctl()
interface is just the "simpler" interface at that point somewhere down
the road.


Regardless, I'll hack up a proof of concept based on the outline
above. Perhaps something more elegant will emerge once I start
tweaking the source, but I'm still seeing too many gaps to be
comfortable so far.


[[There is a whole other approach to this too. We could continue with
the prctl() interface and mirror the trace_sys_enter model for
secure_computing().   Instead of keeping a seccomp_t-based hlist of
events, they could be stored in a new hlist for seccomp_events in
struct ftrace_event_call.  The ftrace filters would be installed there
and the seccomp_syscall_enter() function could do the checks and pass
up some state data on the task's seccomp_t that indicates it needs to
do_exit().  That would likely reduce the amount of code in
seccomp_filter.c pretty seriously (though not entirely
beneficially).]]


Thanks again for all the feedback and insights! I really hope we can
come to an agreeable approach for implementing kernel attack surface
reduction.
will

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-19 21:05                                                                   ` Will Drewry
  (?)
@ 2011-05-24 15:59                                                                     ` Will Drewry
  -1 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-05-24 15:59 UTC (permalink / raw)
  To: Steven Rostedt, Ingo Molnar, Peter Zijlstra, Frederic Weisbecker
  Cc: James Morris, linux-kernel, Eric Paris, kees.cook, agl,
	Serge E. Hallyn, Ingo Molnar, Andrew Morton, Tejun Heo,
	Michal Marek, Oleg Nesterov, Jiri Slaby, David Howells,
	Russell King, Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, Thomas Gleixner, H. Peter Anvin,
	x86, linux-arm-kernel, linux-mips, linuxppc-dev, linux-s390,
	linux-sh, sparclinux, Linus Torvalds

On Thu, May 19, 2011 at 4:05 PM, Will Drewry <wad@chromium.org> wrote:
> On Thu, May 19, 2011 at 7:22 AM, Steven Rostedt <rostedt@goodmis.org> wrote:
>> On Wed, 2011-05-18 at 21:07 -0700, Will Drewry wrote:
>>
>>> Do event_* that return non-void exist in the tree at all now?  I've
>>> looked at the various tracepoint macros as well as some of the other
>>> handlers (trace_function, perf_tp_event, etc) and I'm not seeing any
>>> places where a return value is honored nor could be.  At best, the
>>> perf_tp_event can be short-circuited it in the hlist_for_each, but
>>> it'd still need a way to bubble up a failure and result in not calling
>>> the trace/event that the hook precedes.
>>
>> No, none of the current trace hooks have return values. That was what I
>> was talking about how to implement in my previous emails.
>
> Led on by complete ignorance, I think I'm finally beginning to untwist
> the different pieces of the tracing infrastructure.  Unfortunately,
> that means I took a few wrong turns along the way...
>
> I think function tracing looks something like this:
>
> ftrace_call has been injected into at a specific callsite.  Upon hit:
> 1. ftrace_call triggers
> 2. does some checks then calls ftrace_trace_function (via mcount instrs)
> 3. ftrace_trace_function may be a single func or a list. For a list it
> will be: ftrace_list_func
> 4. ftrace_list_func calls each registered hook for that function in a
> while loop ignoring return values
> 5. registered hook funcs may then track the call, farm it out to
> specific sub-handlers, etc.
>
> This seems to be a red herring for my use case :/ though this helped
> me understand your back and forth (Ingo & Steve) regarding dynamic
> versus explicit events.
>
>
> System call tracing is done via kernel/tracepoint.c events fed in via
> arch/[arch]/kernel/ptrace.c where it calls trace_sys_enter.  This
> yields direct sys_enter and sys_exit event sources (and an event class
> to hook up per-system call events).  This means that
> ftrace_syscall_enter() does the event prep prior to doing a filter
> check comparing the ftrace_event object for the given syscall_nr to
> the event data.  perf_sysenter_enter() is similar but it pushes the
> info over to perf_tp_event to be matched not against the global
> syscall event entry, but against any sub-event in the linked list on
> that syscall's event.
>
> Is that roughly an accurate representation of the two?  I wish I
> hadn't gotten distracted along the function path, but at least I
> learned something (and it is relevant to the larger scope of this
> thread's discussion).
>
>
> After doing that digging, it looks like providing hook call
> pre-emption and return value propagation will be a unique and fun task
> for each tracer and event subsystem.  If I look solely at tracepoints,
> a generic change would be to make the trace_##subsys function return
> an int (which I think was the event_vfs_getname proposal?).  The other
> option is to change the trace_sys_enter proto to include a 'int
> *retcode'.
>
> That change would allow the propagation of some sort of policy
> information.  To put it to use, seccomp mode 1 could be implemented on
> top of trace_syscalls.  The following changes would need to happen:
> 1. dummy metadata should be inserted for all unhooked system calls
> 2. perf_trace_buf_submit would need to return an int or a new
> TRACE_REG_SECCOMP_REGISTER handler would need to be setup in
> syscall_enter_register.
> 3. If perf is abused, a "kill/fail_on_discard" bit would be added to
> event->attrs.
> 4. perf_trace_buf_submit/perf_tp_event will return 0 for no matches,
> 'n' for the number of event matches, and -EACCES/? if a
> 'fail_on_discard' event is seen.
> 5. perf_syscall_enter would set *retcode = perf_trace_buf_submit()'s retcode
> 6. trace_sys_enter() would need to be moved to be the first entry
> arch/../kernel/ptrace.c for incoming syscalls
> 7. if trace_sys_enter() yields a negative return code, then
> do_exit(SIGKILL) the process and return.
>
> Entering into seccomp mode 1 would require adding a  "0" filter for
> every system call number (which is why we need a dummy event call for
> them since failing to check the bitmask can't be flagged
> fail_on_discard) with the fail_on_discard bit.  For the three calls
> that are allowed, a '1' filter would be set.
>
> That would roughly implement seccomp mode 1.  It's pretty ugly and the
> fact that every system call that's disallowed has to be blacklisted is
> not ideal.  An alternate model would be to just use the seccomp mode
> as we do today and let secure_computing() handle the return code of "#
> of matches".  If it the # of matches is 0, it terminates. A
> 'fail_on_discard' bit then would only be good to stop further
> tracepoint callback evaluation.  This approach would also *almost* nix
> the need to provide dummy syscall hooks.  (Since sigreturn isn't
> hooked on x86 because it uses ptregs fixup, a dummy would still be
> needed to apply a "1" filter to.)
>
> Even with that tweak to move to a whitelist model, the perf event
> evaluation and tracepoint callback ordering is still not guaranteed.
> Without changing tracepoint itself, all other TPs will still execute.
> And for perf events, it'll be first-come-first-serve until a
> fail_on_discard is hit.
>
> After using seccomp mode=1 as the sample case to reduce scope, it's
> possible to ignore it for now :) and look at the seccomp-filter/mode=2
> case.  The same mechanism could be used to inject more expressive
> filters.  This would be done over the sys_perf_event_open() interface
> assuming the new attr is added to stop perf event list enumeration.
> Assuming a whitelist model, a call to prctl(PR_SET_SECCOMP, 2) would
> be needed (maybe with the ON_EXEC flag option too to mirror the
> event->attr on-exec bit). That would yield the ability to register
> perf events for system calls then use ioctl() to SET_FILTER on them.
>
> Reducing the privileges of the filters after installation could be
> done with another attribute bit like 'set_filter_ands'.  If that is
> also set on the event, and a filter is installed to ensure that
> sys_perf_event_open is blocked, then it should be reasonably sane.
>
> I'd need to add a PERF_EVENT_IOC_GET_FILTER handler to allow
> extracting the settings.
>
> Clearly, I haven't written the code for that yet, though making the
> change for a single platform shouldn't be too much code.
>
> So that leaves me with some questions:
> - Is this the type of reuse that was being encouraged?
> - Does it really make sense to cram this through the perf interface
> and events?  While the changed attributes are innocuous and
> potentially reusable, it seems that a large amount of the perf
> facilities are exposed that could have weird side effects, but I'm
> sure I still don't fully grok the perf infrastructure.
> - Does extending one tracepoint to provide return values via a pointer
> make sense? I'd hesitate to make all tracepoint hooks return an int by
> default.
> - If all tracepoints returned an int, what would the standard value
> look like - or would it need to be per tracepoint impl?
> - How is ambiguity resolved if multiple perf_events are registered for
> a syscall with different filters?  Maybe a 'stop_on_match'? though
> ordering is still a problem then.
> - Is there a way to affect a task-wide change without a seccomp flag
> (or new task_struct entry) via the existing sys_perf_event_open
> syscall?  I considered suggesting a attr bit call 'secure_computing'
> that when an event with the bit is enabled, it automagically forces
> the task into seccomp enforcement mode, but that, again, seemed
> hackish.
>
> While I like the idea of sharing the tracepoints infrastructure and
> the trace_syscalls hooks as well as using a pre-existing interface
> with very minor changes, I'm worried that the complexity of the
> interface and of the infrastructure might undermine the ability to
> continue meeting the desired security goals.  I had originally stayed
> very close to the seccomp world because of how trivial it is to review
> the code and verify its accuracy/effectiveness.  This approach leaves
> a lot of gaps for kernel code to seep through and a fair amount of
> ambiguity in what locked down syscall filters might look like.
>
> To me, the best part of the above is that it shows that even if we go
> with a prctl SET_SECCOMP_FILTER-style interface, it is completely
> certain that if a perf/ftrace-based security infrastructure is on our
> future, it will be entirely compatible -- even if the prctl()
> interface is just the "simpler" interface at that point somewhere down
> the road.
>
>
> Regardless, I'll hack up a proof of concept based on the outline
> above. Perhaps something more elegant will emerge once I start
> tweaking the source, but I'm still seeing too many gaps to be
> comfortable so far.
>
>
> [[There is a whole other approach to this too. We could continue with
> the prctl() interface and mirror the trace_sys_enter model for
> secure_computing().   Instead of keeping a seccomp_t-based hlist of
> events, they could be stored in a new hlist for seccomp_events in
> struct ftrace_event_call.  The ftrace filters would be installed there
> and the seccomp_syscall_enter() function could do the checks and pass
> up some state data on the task's seccomp_t that indicates it needs to
> do_exit().  That would likely reduce the amount of code in
> seccomp_filter.c pretty seriously (though not entirely
> beneficially).]]
>
>
> Thanks again for all the feedback and insights! I really hope we can
> come to an agreeable approach for implementing kernel attack surface
> reduction.

Just a quick follow up.  I have a PoC of the perf-based system call
filtering code in place which uses seccomp.mode as a task_context
flag, adds require_secure and err_on_discard perf_event_attrs, and
enforces these new events terminate the process in perf_syscall_enter
via a return value on perf_buf_submit.  This lets a call like:

LD_LIBRARY_PATH=/host/home/wad/kernel.uml/tests/
/host/home/wad/kernel.uml/tests/perf record \
-S \
-e 'syscalls:sys_enter_access' \
-e 'syscalls:sys_enter_brk' \
-e 'syscalls:sys_enter_close' \
-e 'syscalls:sys_enter_exit_group' \
-e 'syscalls:sys_enter_fcntl64' \
-e 'syscalls:sys_enter_fstat64' \
-e 'syscalls:sys_enter_getdents64' \
-e 'syscalls:sys_enter_getpid' \
-e 'syscalls:sys_enter_getuid' \
-e 'syscalls:sys_enter_ioctl' \
-e 'syscalls:sys_enter_lstat64' \
-e 'syscalls:sys_enter_mmap_pgoff' \
-e 'syscalls:sys_enter_mprotect' \
-e 'syscalls:sys_enter_munmap' \
-e 'syscalls:sys_enter_open' \
-e 'syscalls:sys_enter_read' \
-e 'syscalls:sys_enter_stat64' \
-e 'syscalls:sys_enter_time' \
-e 'syscalls:sys_enter_newuname' \
-e 'syscalls:sys_enter_write' --filter "fd == 1" \
/bin/ls

run successfully while omitting a syscall or passing in --filter "fd
== 0" properly terminates it. (ignoring the need for execve and
set_thread_area for PoC purposes).

The change avoids defining a new trace call type or a huge number of
internal changes and hides seccomp.mode=2 from ABI-exposure in prctl,
but the attack surface is non-trivial to verify, and I'm not sure if
this ABI change makes sense. It amounts to:

 include/linux/ftrace_event.h  |    4 +-
 include/linux/perf_event.h    |   10 +++++---
 kernel/perf_event.c           |   49 +++++++++++++++++++++++++++++++++++++---
 kernel/seccomp.c              |    8 ++++++
 kernel/trace/trace_syscalls.c |   27 +++++++++++++++++-----
 5 files changed, 82 insertions(+), 16 deletions(-)

And can be found here: http://static.dataspill.org/perf_secure/v1/

If there is any interest at all, I can post it properly to this giant
CC list.  However,  it's unclear to me where this thread stands.  I
also see a completely independent path for implementing system call
filtering now, but I'll hold off posting that patch until I see where
this goes.

Any guidance will be appreciated - thanks again!
will

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-24 15:59                                                                     ` Will Drewry
  0 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-05-24 15:59 UTC (permalink / raw)
  To: Steven Rostedt, Ingo Molnar, Peter Zijlstra, Frederic Weisbecker
  Cc: linux-mips, linux-sh, Heiko Carstens, Oleg Nesterov,
	David Howells, Paul Mackerras, Ralf Baechle, H. Peter Anvin,
	sparclinux, Jiri Slaby, linux-s390, Russell King, x86,
	James Morris, Linus Torvalds, Ingo Molnar, kees.cook,
	Serge E. Hallyn, Tejun Heo, Thomas Gleixner, linux-arm-kernel,
	Michal Marek, Michal Simek, linuxppc-dev, linux-kernel,
	Eric Paris, Paul Mundt, Martin Schwidefsky, linux390,
	Andrew Morton, agl, David S. Miller

On Thu, May 19, 2011 at 4:05 PM, Will Drewry <wad@chromium.org> wrote:
> On Thu, May 19, 2011 at 7:22 AM, Steven Rostedt <rostedt@goodmis.org> wrote:
>> On Wed, 2011-05-18 at 21:07 -0700, Will Drewry wrote:
>>
>>> Do event_* that return non-void exist in the tree at all now?  I've
>>> looked at the various tracepoint macros as well as some of the other
>>> handlers (trace_function, perf_tp_event, etc) and I'm not seeing any
>>> places where a return value is honored nor could be.  At best, the
>>> perf_tp_event can be short-circuited it in the hlist_for_each, but
>>> it'd still need a way to bubble up a failure and result in not calling
>>> the trace/event that the hook precedes.
>>
>> No, none of the current trace hooks have return values. That was what I
>> was talking about how to implement in my previous emails.
>
> Led on by complete ignorance, I think I'm finally beginning to untwist
> the different pieces of the tracing infrastructure.  Unfortunately,
> that means I took a few wrong turns along the way...
>
> I think function tracing looks something like this:
>
> ftrace_call has been injected into at a specific callsite.  Upon hit:
> 1. ftrace_call triggers
> 2. does some checks then calls ftrace_trace_function (via mcount instrs)
> 3. ftrace_trace_function may be a single func or a list. For a list it
> will be: ftrace_list_func
> 4. ftrace_list_func calls each registered hook for that function in a
> while loop ignoring return values
> 5. registered hook funcs may then track the call, farm it out to
> specific sub-handlers, etc.
>
> This seems to be a red herring for my use case :/ though this helped
> me understand your back and forth (Ingo & Steve) regarding dynamic
> versus explicit events.
>
>
> System call tracing is done via kernel/tracepoint.c events fed in via
> arch/[arch]/kernel/ptrace.c where it calls trace_sys_enter.  This
> yields direct sys_enter and sys_exit event sources (and an event class
> to hook up per-system call events).  This means that
> ftrace_syscall_enter() does the event prep prior to doing a filter
> check comparing the ftrace_event object for the given syscall_nr to
> the event data.  perf_sysenter_enter() is similar but it pushes the
> info over to perf_tp_event to be matched not against the global
> syscall event entry, but against any sub-event in the linked list on
> that syscall's event.
>
> Is that roughly an accurate representation of the two?  I wish I
> hadn't gotten distracted along the function path, but at least I
> learned something (and it is relevant to the larger scope of this
> thread's discussion).
>
>
> After doing that digging, it looks like providing hook call
> pre-emption and return value propagation will be a unique and fun task
> for each tracer and event subsystem.  If I look solely at tracepoints,
> a generic change would be to make the trace_##subsys function return
> an int (which I think was the event_vfs_getname proposal?).  The other
> option is to change the trace_sys_enter proto to include a 'int
> *retcode'.
>
> That change would allow the propagation of some sort of policy
> information.  To put it to use, seccomp mode 1 could be implemented on
> top of trace_syscalls.  The following changes would need to happen:
> 1. dummy metadata should be inserted for all unhooked system calls
> 2. perf_trace_buf_submit would need to return an int or a new
> TRACE_REG_SECCOMP_REGISTER handler would need to be setup in
> syscall_enter_register.
> 3. If perf is abused, a "kill/fail_on_discard" bit would be added to
> event->attrs.
> 4. perf_trace_buf_submit/perf_tp_event will return 0 for no matches,
> 'n' for the number of event matches, and -EACCES/? if a
> 'fail_on_discard' event is seen.
> 5. perf_syscall_enter would set *retcode = perf_trace_buf_submit()'s retcode
> 6. trace_sys_enter() would need to be moved to be the first entry
> arch/../kernel/ptrace.c for incoming syscalls
> 7. if trace_sys_enter() yields a negative return code, then
> do_exit(SIGKILL) the process and return.
>
> Entering into seccomp mode 1 would require adding a  "0" filter for
> every system call number (which is why we need a dummy event call for
> them since failing to check the bitmask can't be flagged
> fail_on_discard) with the fail_on_discard bit.  For the three calls
> that are allowed, a '1' filter would be set.
>
> That would roughly implement seccomp mode 1.  It's pretty ugly and the
> fact that every system call that's disallowed has to be blacklisted is
> not ideal.  An alternate model would be to just use the seccomp mode
> as we do today and let secure_computing() handle the return code of "#
> of matches".  If it the # of matches is 0, it terminates. A
> 'fail_on_discard' bit then would only be good to stop further
> tracepoint callback evaluation.  This approach would also *almost* nix
> the need to provide dummy syscall hooks.  (Since sigreturn isn't
> hooked on x86 because it uses ptregs fixup, a dummy would still be
> needed to apply a "1" filter to.)
>
> Even with that tweak to move to a whitelist model, the perf event
> evaluation and tracepoint callback ordering is still not guaranteed.
> Without changing tracepoint itself, all other TPs will still execute.
> And for perf events, it'll be first-come-first-serve until a
> fail_on_discard is hit.
>
> After using seccomp mode=1 as the sample case to reduce scope, it's
> possible to ignore it for now :) and look at the seccomp-filter/mode=2
> case.  The same mechanism could be used to inject more expressive
> filters.  This would be done over the sys_perf_event_open() interface
> assuming the new attr is added to stop perf event list enumeration.
> Assuming a whitelist model, a call to prctl(PR_SET_SECCOMP, 2) would
> be needed (maybe with the ON_EXEC flag option too to mirror the
> event->attr on-exec bit). That would yield the ability to register
> perf events for system calls then use ioctl() to SET_FILTER on them.
>
> Reducing the privileges of the filters after installation could be
> done with another attribute bit like 'set_filter_ands'.  If that is
> also set on the event, and a filter is installed to ensure that
> sys_perf_event_open is blocked, then it should be reasonably sane.
>
> I'd need to add a PERF_EVENT_IOC_GET_FILTER handler to allow
> extracting the settings.
>
> Clearly, I haven't written the code for that yet, though making the
> change for a single platform shouldn't be too much code.
>
> So that leaves me with some questions:
> - Is this the type of reuse that was being encouraged?
> - Does it really make sense to cram this through the perf interface
> and events?  While the changed attributes are innocuous and
> potentially reusable, it seems that a large amount of the perf
> facilities are exposed that could have weird side effects, but I'm
> sure I still don't fully grok the perf infrastructure.
> - Does extending one tracepoint to provide return values via a pointer
> make sense? I'd hesitate to make all tracepoint hooks return an int by
> default.
> - If all tracepoints returned an int, what would the standard value
> look like - or would it need to be per tracepoint impl?
> - How is ambiguity resolved if multiple perf_events are registered for
> a syscall with different filters?  Maybe a 'stop_on_match'? though
> ordering is still a problem then.
> - Is there a way to affect a task-wide change without a seccomp flag
> (or new task_struct entry) via the existing sys_perf_event_open
> syscall?  I considered suggesting a attr bit call 'secure_computing'
> that when an event with the bit is enabled, it automagically forces
> the task into seccomp enforcement mode, but that, again, seemed
> hackish.
>
> While I like the idea of sharing the tracepoints infrastructure and
> the trace_syscalls hooks as well as using a pre-existing interface
> with very minor changes, I'm worried that the complexity of the
> interface and of the infrastructure might undermine the ability to
> continue meeting the desired security goals.  I had originally stayed
> very close to the seccomp world because of how trivial it is to review
> the code and verify its accuracy/effectiveness.  This approach leaves
> a lot of gaps for kernel code to seep through and a fair amount of
> ambiguity in what locked down syscall filters might look like.
>
> To me, the best part of the above is that it shows that even if we go
> with a prctl SET_SECCOMP_FILTER-style interface, it is completely
> certain that if a perf/ftrace-based security infrastructure is on our
> future, it will be entirely compatible -- even if the prctl()
> interface is just the "simpler" interface at that point somewhere down
> the road.
>
>
> Regardless, I'll hack up a proof of concept based on the outline
> above. Perhaps something more elegant will emerge once I start
> tweaking the source, but I'm still seeing too many gaps to be
> comfortable so far.
>
>
> [[There is a whole other approach to this too. We could continue with
> the prctl() interface and mirror the trace_sys_enter model for
> secure_computing().   Instead of keeping a seccomp_t-based hlist of
> events, they could be stored in a new hlist for seccomp_events in
> struct ftrace_event_call.  The ftrace filters would be installed there
> and the seccomp_syscall_enter() function could do the checks and pass
> up some state data on the task's seccomp_t that indicates it needs to
> do_exit().  That would likely reduce the amount of code in
> seccomp_filter.c pretty seriously (though not entirely
> beneficially).]]
>
>
> Thanks again for all the feedback and insights! I really hope we can
> come to an agreeable approach for implementing kernel attack surface
> reduction.

Just a quick follow up.  I have a PoC of the perf-based system call
filtering code in place which uses seccomp.mode as a task_context
flag, adds require_secure and err_on_discard perf_event_attrs, and
enforces these new events terminate the process in perf_syscall_enter
via a return value on perf_buf_submit.  This lets a call like:

LD_LIBRARY_PATH=/host/home/wad/kernel.uml/tests/
/host/home/wad/kernel.uml/tests/perf record \
-S \
-e 'syscalls:sys_enter_access' \
-e 'syscalls:sys_enter_brk' \
-e 'syscalls:sys_enter_close' \
-e 'syscalls:sys_enter_exit_group' \
-e 'syscalls:sys_enter_fcntl64' \
-e 'syscalls:sys_enter_fstat64' \
-e 'syscalls:sys_enter_getdents64' \
-e 'syscalls:sys_enter_getpid' \
-e 'syscalls:sys_enter_getuid' \
-e 'syscalls:sys_enter_ioctl' \
-e 'syscalls:sys_enter_lstat64' \
-e 'syscalls:sys_enter_mmap_pgoff' \
-e 'syscalls:sys_enter_mprotect' \
-e 'syscalls:sys_enter_munmap' \
-e 'syscalls:sys_enter_open' \
-e 'syscalls:sys_enter_read' \
-e 'syscalls:sys_enter_stat64' \
-e 'syscalls:sys_enter_time' \
-e 'syscalls:sys_enter_newuname' \
-e 'syscalls:sys_enter_write' --filter "fd == 1" \
/bin/ls

run successfully while omitting a syscall or passing in --filter "fd
== 0" properly terminates it. (ignoring the need for execve and
set_thread_area for PoC purposes).

The change avoids defining a new trace call type or a huge number of
internal changes and hides seccomp.mode=2 from ABI-exposure in prctl,
but the attack surface is non-trivial to verify, and I'm not sure if
this ABI change makes sense. It amounts to:

 include/linux/ftrace_event.h  |    4 +-
 include/linux/perf_event.h    |   10 +++++---
 kernel/perf_event.c           |   49 +++++++++++++++++++++++++++++++++++++---
 kernel/seccomp.c              |    8 ++++++
 kernel/trace/trace_syscalls.c |   27 +++++++++++++++++-----
 5 files changed, 82 insertions(+), 16 deletions(-)

And can be found here: http://static.dataspill.org/perf_secure/v1/

If there is any interest at all, I can post it properly to this giant
CC list.  However,  it's unclear to me where this thread stands.  I
also see a completely independent path for implementing system call
filtering now, but I'll hold off posting that patch until I see where
this goes.

Any guidance will be appreciated - thanks again!
will

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-24 15:59                                                                     ` Will Drewry
  0 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-05-24 15:59 UTC (permalink / raw)
  To: linux-arm-kernel

On Thu, May 19, 2011 at 4:05 PM, Will Drewry <wad@chromium.org> wrote:
> On Thu, May 19, 2011 at 7:22 AM, Steven Rostedt <rostedt@goodmis.org> wrote:
>> On Wed, 2011-05-18 at 21:07 -0700, Will Drewry wrote:
>>
>>> Do event_* that return non-void exist in the tree at all now?  I've
>>> looked at the various tracepoint macros as well as some of the other
>>> handlers (trace_function, perf_tp_event, etc) and I'm not seeing any
>>> places where a return value is honored nor could be.  At best, the
>>> perf_tp_event can be short-circuited it in the hlist_for_each, but
>>> it'd still need a way to bubble up a failure and result in not calling
>>> the trace/event that the hook precedes.
>>
>> No, none of the current trace hooks have return values. That was what I
>> was talking about how to implement in my previous emails.
>
> Led on by complete ignorance, I think I'm finally beginning to untwist
> the different pieces of the tracing infrastructure.  Unfortunately,
> that means I took a few wrong turns along the way...
>
> I think function tracing looks something like this:
>
> ftrace_call has been injected into at a specific callsite.  Upon hit:
> 1. ftrace_call triggers
> 2. does some checks then calls ftrace_trace_function (via mcount instrs)
> 3. ftrace_trace_function may be a single func or a list. For a list it
> will be: ftrace_list_func
> 4. ftrace_list_func calls each registered hook for that function in a
> while loop ignoring return values
> 5. registered hook funcs may then track the call, farm it out to
> specific sub-handlers, etc.
>
> This seems to be a red herring for my use case :/ though this helped
> me understand your back and forth (Ingo & Steve) regarding dynamic
> versus explicit events.
>
>
> System call tracing is done via kernel/tracepoint.c events fed in via
> arch/[arch]/kernel/ptrace.c where it calls trace_sys_enter.  This
> yields direct sys_enter and sys_exit event sources (and an event class
> to hook up per-system call events).  This means that
> ftrace_syscall_enter() does the event prep prior to doing a filter
> check comparing the ftrace_event object for the given syscall_nr to
> the event data.  perf_sysenter_enter() is similar but it pushes the
> info over to perf_tp_event to be matched not against the global
> syscall event entry, but against any sub-event in the linked list on
> that syscall's event.
>
> Is that roughly an accurate representation of the two?  I wish I
> hadn't gotten distracted along the function path, but at least I
> learned something (and it is relevant to the larger scope of this
> thread's discussion).
>
>
> After doing that digging, it looks like providing hook call
> pre-emption and return value propagation will be a unique and fun task
> for each tracer and event subsystem.  If I look solely at tracepoints,
> a generic change would be to make the trace_##subsys function return
> an int (which I think was the event_vfs_getname proposal?).  The other
> option is to change the trace_sys_enter proto to include a 'int
> *retcode'.
>
> That change would allow the propagation of some sort of policy
> information.  To put it to use, seccomp mode 1 could be implemented on
> top of trace_syscalls.  The following changes would need to happen:
> 1. dummy metadata should be inserted for all unhooked system calls
> 2. perf_trace_buf_submit would need to return an int or a new
> TRACE_REG_SECCOMP_REGISTER handler would need to be setup in
> syscall_enter_register.
> 3. If perf is abused, a "kill/fail_on_discard" bit would be added to
> event->attrs.
> 4. perf_trace_buf_submit/perf_tp_event will return 0 for no matches,
> 'n' for the number of event matches, and -EACCES/? if a
> 'fail_on_discard' event is seen.
> 5. perf_syscall_enter would set *retcode = perf_trace_buf_submit()'s retcode
> 6. trace_sys_enter() would need to be moved to be the first entry
> arch/../kernel/ptrace.c for incoming syscalls
> 7. if trace_sys_enter() yields a negative return code, then
> do_exit(SIGKILL) the process and return.
>
> Entering into seccomp mode 1 would require adding a  "0" filter for
> every system call number (which is why we need a dummy event call for
> them since failing to check the bitmask can't be flagged
> fail_on_discard) with the fail_on_discard bit.  For the three calls
> that are allowed, a '1' filter would be set.
>
> That would roughly implement seccomp mode 1.  It's pretty ugly and the
> fact that every system call that's disallowed has to be blacklisted is
> not ideal.  An alternate model would be to just use the seccomp mode
> as we do today and let secure_computing() handle the return code of "#
> of matches".  If it the # of matches is 0, it terminates. A
> 'fail_on_discard' bit then would only be good to stop further
> tracepoint callback evaluation.  This approach would also *almost* nix
> the need to provide dummy syscall hooks.  (Since sigreturn isn't
> hooked on x86 because it uses ptregs fixup, a dummy would still be
> needed to apply a "1" filter to.)
>
> Even with that tweak to move to a whitelist model, the perf event
> evaluation and tracepoint callback ordering is still not guaranteed.
> Without changing tracepoint itself, all other TPs will still execute.
> And for perf events, it'll be first-come-first-serve until a
> fail_on_discard is hit.
>
> After using seccomp mode=1 as the sample case to reduce scope, it's
> possible to ignore it for now :) and look at the seccomp-filter/mode=2
> case.  The same mechanism could be used to inject more expressive
> filters.  This would be done over the sys_perf_event_open() interface
> assuming the new attr is added to stop perf event list enumeration.
> Assuming a whitelist model, a call to prctl(PR_SET_SECCOMP, 2) would
> be needed (maybe with the ON_EXEC flag option too to mirror the
> event->attr on-exec bit). That would yield the ability to register
> perf events for system calls then use ioctl() to SET_FILTER on them.
>
> Reducing the privileges of the filters after installation could be
> done with another attribute bit like 'set_filter_ands'.  If that is
> also set on the event, and a filter is installed to ensure that
> sys_perf_event_open is blocked, then it should be reasonably sane.
>
> I'd need to add a PERF_EVENT_IOC_GET_FILTER handler to allow
> extracting the settings.
>
> Clearly, I haven't written the code for that yet, though making the
> change for a single platform shouldn't be too much code.
>
> So that leaves me with some questions:
> - Is this the type of reuse that was being encouraged?
> - Does it really make sense to cram this through the perf interface
> and events?  While the changed attributes are innocuous and
> potentially reusable, it seems that a large amount of the perf
> facilities are exposed that could have weird side effects, but I'm
> sure I still don't fully grok the perf infrastructure.
> - Does extending one tracepoint to provide return values via a pointer
> make sense? I'd hesitate to make all tracepoint hooks return an int by
> default.
> - If all tracepoints returned an int, what would the standard value
> look like - or would it need to be per tracepoint impl?
> - How is ambiguity resolved if multiple perf_events are registered for
> a syscall with different filters?  Maybe a 'stop_on_match'? though
> ordering is still a problem then.
> - Is there a way to affect a task-wide change without a seccomp flag
> (or new task_struct entry) via the existing sys_perf_event_open
> syscall?  I considered suggesting a attr bit call 'secure_computing'
> that when an event with the bit is enabled, it automagically forces
> the task into seccomp enforcement mode, but that, again, seemed
> hackish.
>
> While I like the idea of sharing the tracepoints infrastructure and
> the trace_syscalls hooks as well as using a pre-existing interface
> with very minor changes, I'm worried that the complexity of the
> interface and of the infrastructure might undermine the ability to
> continue meeting the desired security goals.  I had originally stayed
> very close to the seccomp world because of how trivial it is to review
> the code and verify its accuracy/effectiveness.  This approach leaves
> a lot of gaps for kernel code to seep through and a fair amount of
> ambiguity in what locked down syscall filters might look like.
>
> To me, the best part of the above is that it shows that even if we go
> with a prctl SET_SECCOMP_FILTER-style interface, it is completely
> certain that if a perf/ftrace-based security infrastructure is on our
> future, it will be entirely compatible -- even if the prctl()
> interface is just the "simpler" interface at that point somewhere down
> the road.
>
>
> Regardless, I'll hack up a proof of concept based on the outline
> above. Perhaps something more elegant will emerge once I start
> tweaking the source, but I'm still seeing too many gaps to be
> comfortable so far.
>
>
> [[There is a whole other approach to this too. We could continue with
> the prctl() interface and mirror the trace_sys_enter model for
> secure_computing().   Instead of keeping a seccomp_t-based hlist of
> events, they could be stored in a new hlist for seccomp_events in
> struct ftrace_event_call.  The ftrace filters would be installed there
> and the seccomp_syscall_enter() function could do the checks and pass
> up some state data on the task's seccomp_t that indicates it needs to
> do_exit().  That would likely reduce the amount of code in
> seccomp_filter.c pretty seriously (though not entirely
> beneficially).]]
>
>
> Thanks again for all the feedback and insights! I really hope we can
> come to an agreeable approach for implementing kernel attack surface
> reduction.

Just a quick follow up.  I have a PoC of the perf-based system call
filtering code in place which uses seccomp.mode as a task_context
flag, adds require_secure and err_on_discard perf_event_attrs, and
enforces these new events terminate the process in perf_syscall_enter
via a return value on perf_buf_submit.  This lets a call like:

LD_LIBRARY_PATH=/host/home/wad/kernel.uml/tests/
/host/home/wad/kernel.uml/tests/perf record \
-S \
-e 'syscalls:sys_enter_access' \
-e 'syscalls:sys_enter_brk' \
-e 'syscalls:sys_enter_close' \
-e 'syscalls:sys_enter_exit_group' \
-e 'syscalls:sys_enter_fcntl64' \
-e 'syscalls:sys_enter_fstat64' \
-e 'syscalls:sys_enter_getdents64' \
-e 'syscalls:sys_enter_getpid' \
-e 'syscalls:sys_enter_getuid' \
-e 'syscalls:sys_enter_ioctl' \
-e 'syscalls:sys_enter_lstat64' \
-e 'syscalls:sys_enter_mmap_pgoff' \
-e 'syscalls:sys_enter_mprotect' \
-e 'syscalls:sys_enter_munmap' \
-e 'syscalls:sys_enter_open' \
-e 'syscalls:sys_enter_read' \
-e 'syscalls:sys_enter_stat64' \
-e 'syscalls:sys_enter_time' \
-e 'syscalls:sys_enter_newuname' \
-e 'syscalls:sys_enter_write' --filter "fd == 1" \
/bin/ls

run successfully while omitting a syscall or passing in --filter "fd
== 0" properly terminates it. (ignoring the need for execve and
set_thread_area for PoC purposes).

The change avoids defining a new trace call type or a huge number of
internal changes and hides seccomp.mode=2 from ABI-exposure in prctl,
but the attack surface is non-trivial to verify, and I'm not sure if
this ABI change makes sense. It amounts to:

 include/linux/ftrace_event.h  |    4 +-
 include/linux/perf_event.h    |   10 +++++---
 kernel/perf_event.c           |   49 +++++++++++++++++++++++++++++++++++++---
 kernel/seccomp.c              |    8 ++++++
 kernel/trace/trace_syscalls.c |   27 +++++++++++++++++-----
 5 files changed, 82 insertions(+), 16 deletions(-)

And can be found here: http://static.dataspill.org/perf_secure/v1/

If there is any interest at all, I can post it properly to this giant
CC list.  However,  it's unclear to me where this thread stands.  I
also see a completely independent path for implementing system call
filtering now, but I'll hold off posting that patch until I see where
this goes.

Any guidance will be appreciated - thanks again!
will

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-24 15:59                                                                     ` Will Drewry
  (?)
@ 2011-05-24 16:20                                                                       ` Peter Zijlstra
  -1 siblings, 0 replies; 406+ messages in thread
From: Peter Zijlstra @ 2011-05-24 16:20 UTC (permalink / raw)
  To: Will Drewry
  Cc: Steven Rostedt, Ingo Molnar, Frederic Weisbecker, James Morris,
	linux-kernel, Eric Paris, kees.cook, agl, Serge E. Hallyn,
	Ingo Molnar, Andrew Morton, Tejun Heo, Michal Marek,
	Oleg Nesterov, Jiri Slaby, David Howells, Russell King,
	Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, Thomas Gleixner, H. Peter Anvin,
	x86, linux-arm-kernel, linux-mips, linuxppc-dev, linux-s390,
	linux-sh, sparclinux, Linus Torvalds

On Tue, 2011-05-24 at 10:59 -0500, Will Drewry wrote:
>  include/linux/ftrace_event.h  |    4 +-
>  include/linux/perf_event.h    |   10 +++++---
>  kernel/perf_event.c           |   49 +++++++++++++++++++++++++++++++++++++---
>  kernel/seccomp.c              |    8 ++++++
>  kernel/trace/trace_syscalls.c |   27 +++++++++++++++++-----
>  5 files changed, 82 insertions(+), 16 deletions(-) 

I strongly oppose to the perf core being mixed with any sekurity voodoo
(or any other active role for that matter).

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-24 16:20                                                                       ` Peter Zijlstra
  0 siblings, 0 replies; 406+ messages in thread
From: Peter Zijlstra @ 2011-05-24 16:20 UTC (permalink / raw)
  To: Will Drewry
  Cc: linux-mips, linux-sh, Frederic Weisbecker, Heiko Carstens,
	Oleg Nesterov, David Howells, Paul Mackerras, Eric Paris,
	H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390, Russell King,
	x86, James Morris, Linus Torvalds, Ingo Molnar, Ingo Molnar,
	Serge E. Hallyn, Steven Rostedt, Tejun Heo, Thomas Gleixner,
	kees.cook, linux-arm-kernel, Michal Marek, Michal Simek,
	linuxppc-dev, linux-kernel, Ralf Baechle, Paul Mundt,
	Martin Schwidefsky, linux390, Andrew Morton, agl,
	David S. Miller

On Tue, 2011-05-24 at 10:59 -0500, Will Drewry wrote:
>  include/linux/ftrace_event.h  |    4 +-
>  include/linux/perf_event.h    |   10 +++++---
>  kernel/perf_event.c           |   49 +++++++++++++++++++++++++++++++++++++---
>  kernel/seccomp.c              |    8 ++++++
>  kernel/trace/trace_syscalls.c |   27 +++++++++++++++++-----
>  5 files changed, 82 insertions(+), 16 deletions(-)

I strongly oppose to the perf core being mixed with any sekurity voodoo
(or any other active role for that matter).

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-24 16:20                                                                       ` Peter Zijlstra
  0 siblings, 0 replies; 406+ messages in thread
From: Peter Zijlstra @ 2011-05-24 16:20 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, 2011-05-24 at 10:59 -0500, Will Drewry wrote:
>  include/linux/ftrace_event.h  |    4 +-
>  include/linux/perf_event.h    |   10 +++++---
>  kernel/perf_event.c           |   49 +++++++++++++++++++++++++++++++++++++---
>  kernel/seccomp.c              |    8 ++++++
>  kernel/trace/trace_syscalls.c |   27 +++++++++++++++++-----
>  5 files changed, 82 insertions(+), 16 deletions(-) 

I strongly oppose to the perf core being mixed with any sekurity voodoo
(or any other active role for that matter).

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-24 16:20                                                                       ` Peter Zijlstra
  (?)
@ 2011-05-24 16:25                                                                         ` Thomas Gleixner
  -1 siblings, 0 replies; 406+ messages in thread
From: Thomas Gleixner @ 2011-05-24 16:25 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Will Drewry, Steven Rostedt, Ingo Molnar, Frederic Weisbecker,
	James Morris, linux-kernel, Eric Paris, kees.cook, agl,
	Serge E. Hallyn, Ingo Molnar, Andrew Morton, Tejun Heo,
	Michal Marek, Oleg Nesterov, Jiri Slaby, David Howells,
	Russell King, Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, H. Peter Anvin, x86,
	linux-arm-kernel, linux-mips, linuxppc-dev, linux-s390, linux-sh,
	sparclinux, Linus Torvalds

On Tue, 24 May 2011, Peter Zijlstra wrote:

> On Tue, 2011-05-24 at 10:59 -0500, Will Drewry wrote:
> >  include/linux/ftrace_event.h  |    4 +-
> >  include/linux/perf_event.h    |   10 +++++---
> >  kernel/perf_event.c           |   49 +++++++++++++++++++++++++++++++++++++---
> >  kernel/seccomp.c              |    8 ++++++
> >  kernel/trace/trace_syscalls.c |   27 +++++++++++++++++-----
> >  5 files changed, 82 insertions(+), 16 deletions(-) 
> 
> I strongly oppose to the perf core being mixed with any sekurity voodoo
> (or any other active role for that matter).

Amen. We have enough crap to cleanup in perf/ftrace already, so we
really do not need security magic added to it.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-24 16:25                                                                         ` Thomas Gleixner
  0 siblings, 0 replies; 406+ messages in thread
From: Thomas Gleixner @ 2011-05-24 16:25 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-mips, linux-sh, Frederic Weisbecker, Heiko Carstens,
	Oleg Nesterov, David Howells, Paul Mackerras, Eric Paris,
	H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390, Russell King,
	x86, James Morris, Linus Torvalds, Ingo Molnar, Ingo Molnar,
	Serge E. Hallyn, Steven Rostedt, Martin Schwidefsky, kees.cook,
	linux-arm-kernel, Michal Marek, Michal Simek, Will Drewry,
	linuxppc-dev, linux-kernel, Ralf Baechle, Paul Mundt, Tejun Heo,
	linux390, Andrew Morton, agl, David S. Miller

On Tue, 24 May 2011, Peter Zijlstra wrote:

> On Tue, 2011-05-24 at 10:59 -0500, Will Drewry wrote:
> >  include/linux/ftrace_event.h  |    4 +-
> >  include/linux/perf_event.h    |   10 +++++---
> >  kernel/perf_event.c           |   49 +++++++++++++++++++++++++++++++++++++---
> >  kernel/seccomp.c              |    8 ++++++
> >  kernel/trace/trace_syscalls.c |   27 +++++++++++++++++-----
> >  5 files changed, 82 insertions(+), 16 deletions(-) 
> 
> I strongly oppose to the perf core being mixed with any sekurity voodoo
> (or any other active role for that matter).

Amen. We have enough crap to cleanup in perf/ftrace already, so we
really do not need security magic added to it.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-24 16:25                                                                         ` Thomas Gleixner
  0 siblings, 0 replies; 406+ messages in thread
From: Thomas Gleixner @ 2011-05-24 16:25 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, 24 May 2011, Peter Zijlstra wrote:

> On Tue, 2011-05-24 at 10:59 -0500, Will Drewry wrote:
> >  include/linux/ftrace_event.h  |    4 +-
> >  include/linux/perf_event.h    |   10 +++++---
> >  kernel/perf_event.c           |   49 +++++++++++++++++++++++++++++++++++++---
> >  kernel/seccomp.c              |    8 ++++++
> >  kernel/trace/trace_syscalls.c |   27 +++++++++++++++++-----
> >  5 files changed, 82 insertions(+), 16 deletions(-) 
> 
> I strongly oppose to the perf core being mixed with any sekurity voodoo
> (or any other active role for that matter).

Amen. We have enough crap to cleanup in perf/ftrace already, so we
really do not need security magic added to it.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-24 16:25                                                                         ` Thomas Gleixner
  (?)
@ 2011-05-24 19:00                                                                           ` Will Drewry
  -1 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-05-24 19:00 UTC (permalink / raw)
  To: Thomas Gleixner, Peter Zijlstra, Steven Rostedt, Ingo Molnar,
	Frederic Weisbecker
  Cc: James Morris, linux-kernel, Eric Paris, kees.cook, agl,
	Serge E. Hallyn, Ingo Molnar, Andrew Morton, Tejun Heo,
	Michal Marek, Oleg Nesterov, Jiri Slaby, David Howells,
	Russell King, Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, H. Peter Anvin, x86,
	linux-arm-kernel, linux-mips, linuxppc-dev, linux-s390, linux-sh,
	sparclinux, Linus Torvalds

On Tue, May 24, 2011 at 11:25 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
> On Tue, 24 May 2011, Peter Zijlstra wrote:
>
>> On Tue, 2011-05-24 at 10:59 -0500, Will Drewry wrote:
>> >  include/linux/ftrace_event.h  |    4 +-
>> >  include/linux/perf_event.h    |   10 +++++---
>> >  kernel/perf_event.c           |   49 +++++++++++++++++++++++++++++++++++++---
>> >  kernel/seccomp.c              |    8 ++++++
>> >  kernel/trace/trace_syscalls.c |   27 +++++++++++++++++-----
>> >  5 files changed, 82 insertions(+), 16 deletions(-)
>>
>> I strongly oppose to the perf core being mixed with any sekurity voodoo
>> (or any other active role for that matter).
>
> Amen. We have enough crap to cleanup in perf/ftrace already, so we
> really do not need security magic added to it.

Thanks for the quick responses!

I agree, but I'm left a little bit lost now w.r.t. the comments around
reusing the ABI.  If perf doesn't make sense (which certainly seems
wrong from a security interface perspective), then the preexisting
ABIs I know of for this case are as follows:
- /sys/kernel/debug/tracing/*
- prctl(PR_SET_SECCOMP* (or /proc/...)

Both would require expansion.  The latter was reused by the original
patch series.  The former doesn't expose much in the way of per-task
event filtering -- ftrace_pids doesn't translate well to
ftrace_syscall_enter-based enforcement.  I'd expect we'd need
ftrace_event_call->task_events (like ->perf_events), and either
explore them in ftrace_syscall_enter or add a new tracepoint handler,
ftrace_task_syscall_enter, via something like TRACE_REG_TASK_REGISTER.
 It could then do whatever it wanted with the successful or
unsuccessful matching against predicates, stacking or not, which could
be used for a seccomp-like mechanism.  However, bubbling that change
up to the non-existent interfaces in debug/tracing could be a
challenge too (Registration would require an alternate flow like perf
to call TRACE_REG_*? Do they become
tracing/events/subsystem/event/task/<tid>/<filter_string_N>? ...?).

This is all just a matter of programming... but at this point, I'm not
seeing the clear shared path forward.  Even with per-task ftrace
access in debug/tracing, that would introduce a reasonably large
change to the system and add a new ABI, albeit in debug/tracing.  If
the above (or whatever the right approach is) comes into existence,
then any prctl(PR_SET_SECCOMP) ABI could have the backend
implementation to modify the same data.  I'm not putting it like this
to say that I'm designing to be obsolete, but to show that the defined
interface wouldn't conflict if ftrace does overlap more in the future.
 Given the importance of a clearly defined interface for security
functionality, I'd be surprised to see all the pieces come together in
the near future in such a way that a transition would be immediately
possible -- I'm not even sure what the ftrace roadmap really is!

Would it be more desirable to put a system call filtering interface on
a miscdev (like /dev/syscall_filter) instead of in /proc or prctl (and
not reuse seccomp at all)?  I'm not clear what the onus is to justify
a change in the different ABI areas, but I see system call filtering
as an important piece of system security and would like to determine
if there is a viable path forward, or if this will need to be
revisited in another 2 years.

thanks again!
will

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-24 19:00                                                                           ` Will Drewry
  0 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-05-24 19:00 UTC (permalink / raw)
  To: Thomas Gleixner, Peter Zijlstra, Steven Rostedt, Ingo Molnar,
	Frederic Weisbecker
  Cc: linux-mips, linux-sh, Heiko Carstens, Oleg Nesterov,
	David Howells, Paul Mackerras, Ralf Baechle, H. Peter Anvin,
	sparclinux, Jiri Slaby, linux-s390, Russell King, x86,
	James Morris, Linus Torvalds, Ingo Molnar, kees.cook,
	Serge E. Hallyn, Tejun Heo, linux-arm-kernel, Michal Marek,
	Michal Simek, linuxppc-dev, linux-kernel, Eric Paris, Paul Mundt,
	Martin Schwidefsky, linux390, Andrew Morton, agl,
	David S. Miller

On Tue, May 24, 2011 at 11:25 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
> On Tue, 24 May 2011, Peter Zijlstra wrote:
>
>> On Tue, 2011-05-24 at 10:59 -0500, Will Drewry wrote:
>> >  include/linux/ftrace_event.h  |    4 +-
>> >  include/linux/perf_event.h    |   10 +++++---
>> >  kernel/perf_event.c           |   49 +++++++++++++++++++++++++++++++++++++---
>> >  kernel/seccomp.c              |    8 ++++++
>> >  kernel/trace/trace_syscalls.c |   27 +++++++++++++++++-----
>> >  5 files changed, 82 insertions(+), 16 deletions(-)
>>
>> I strongly oppose to the perf core being mixed with any sekurity voodoo
>> (or any other active role for that matter).
>
> Amen. We have enough crap to cleanup in perf/ftrace already, so we
> really do not need security magic added to it.

Thanks for the quick responses!

I agree, but I'm left a little bit lost now w.r.t. the comments around
reusing the ABI.  If perf doesn't make sense (which certainly seems
wrong from a security interface perspective), then the preexisting
ABIs I know of for this case are as follows:
- /sys/kernel/debug/tracing/*
- prctl(PR_SET_SECCOMP* (or /proc/...)

Both would require expansion.  The latter was reused by the original
patch series.  The former doesn't expose much in the way of per-task
event filtering -- ftrace_pids doesn't translate well to
ftrace_syscall_enter-based enforcement.  I'd expect we'd need
ftrace_event_call->task_events (like ->perf_events), and either
explore them in ftrace_syscall_enter or add a new tracepoint handler,
ftrace_task_syscall_enter, via something like TRACE_REG_TASK_REGISTER.
 It could then do whatever it wanted with the successful or
unsuccessful matching against predicates, stacking or not, which could
be used for a seccomp-like mechanism.  However, bubbling that change
up to the non-existent interfaces in debug/tracing could be a
challenge too (Registration would require an alternate flow like perf
to call TRACE_REG_*? Do they become
tracing/events/subsystem/event/task/<tid>/<filter_string_N>? ...?).

This is all just a matter of programming... but at this point, I'm not
seeing the clear shared path forward.  Even with per-task ftrace
access in debug/tracing, that would introduce a reasonably large
change to the system and add a new ABI, albeit in debug/tracing.  If
the above (or whatever the right approach is) comes into existence,
then any prctl(PR_SET_SECCOMP) ABI could have the backend
implementation to modify the same data.  I'm not putting it like this
to say that I'm designing to be obsolete, but to show that the defined
interface wouldn't conflict if ftrace does overlap more in the future.
 Given the importance of a clearly defined interface for security
functionality, I'd be surprised to see all the pieces come together in
the near future in such a way that a transition would be immediately
possible -- I'm not even sure what the ftrace roadmap really is!

Would it be more desirable to put a system call filtering interface on
a miscdev (like /dev/syscall_filter) instead of in /proc or prctl (and
not reuse seccomp at all)?  I'm not clear what the onus is to justify
a change in the different ABI areas, but I see system call filtering
as an important piece of system security and would like to determine
if there is a viable path forward, or if this will need to be
revisited in another 2 years.

thanks again!
will

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-24 19:00                                                                           ` Will Drewry
  0 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-05-24 19:00 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, May 24, 2011 at 11:25 AM, Thomas Gleixner <tglx@linutronix.de> wrote:
> On Tue, 24 May 2011, Peter Zijlstra wrote:
>
>> On Tue, 2011-05-24 at 10:59 -0500, Will Drewry wrote:
>> >  include/linux/ftrace_event.h  |    4 +-
>> >  include/linux/perf_event.h    |   10 +++++---
>> >  kernel/perf_event.c           |   49 +++++++++++++++++++++++++++++++++++++---
>> >  kernel/seccomp.c              |    8 ++++++
>> >  kernel/trace/trace_syscalls.c |   27 +++++++++++++++++-----
>> >  5 files changed, 82 insertions(+), 16 deletions(-)
>>
>> I strongly oppose to the perf core being mixed with any sekurity voodoo
>> (or any other active role for that matter).
>
> Amen. We have enough crap to cleanup in perf/ftrace already, so we
> really do not need security magic added to it.

Thanks for the quick responses!

I agree, but I'm left a little bit lost now w.r.t. the comments around
reusing the ABI.  If perf doesn't make sense (which certainly seems
wrong from a security interface perspective), then the preexisting
ABIs I know of for this case are as follows:
- /sys/kernel/debug/tracing/*
- prctl(PR_SET_SECCOMP* (or /proc/...)

Both would require expansion.  The latter was reused by the original
patch series.  The former doesn't expose much in the way of per-task
event filtering -- ftrace_pids doesn't translate well to
ftrace_syscall_enter-based enforcement.  I'd expect we'd need
ftrace_event_call->task_events (like ->perf_events), and either
explore them in ftrace_syscall_enter or add a new tracepoint handler,
ftrace_task_syscall_enter, via something like TRACE_REG_TASK_REGISTER.
 It could then do whatever it wanted with the successful or
unsuccessful matching against predicates, stacking or not, which could
be used for a seccomp-like mechanism.  However, bubbling that change
up to the non-existent interfaces in debug/tracing could be a
challenge too (Registration would require an alternate flow like perf
to call TRACE_REG_*? Do they become
tracing/events/subsystem/event/task/<tid>/<filter_string_N>? ...?).

This is all just a matter of programming... but at this point, I'm not
seeing the clear shared path forward.  Even with per-task ftrace
access in debug/tracing, that would introduce a reasonably large
change to the system and add a new ABI, albeit in debug/tracing.  If
the above (or whatever the right approach is) comes into existence,
then any prctl(PR_SET_SECCOMP) ABI could have the backend
implementation to modify the same data.  I'm not putting it like this
to say that I'm designing to be obsolete, but to show that the defined
interface wouldn't conflict if ftrace does overlap more in the future.
 Given the importance of a clearly defined interface for security
functionality, I'd be surprised to see all the pieces come together in
the near future in such a way that a transition would be immediately
possible -- I'm not even sure what the ftrace roadmap really is!

Would it be more desirable to put a system call filtering interface on
a miscdev (like /dev/syscall_filter) instead of in /proc or prctl (and
not reuse seccomp at all)?  I'm not clear what the onus is to justify
a change in the different ABI areas, but I see system call filtering
as an important piece of system security and would like to determine
if there is a viable path forward, or if this will need to be
revisited in another 2 years.

thanks again!
will

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-24 16:20                                                                       ` Peter Zijlstra
  (?)
@ 2011-05-24 19:54                                                                         ` Ingo Molnar
  -1 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-24 19:54 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Will Drewry, Steven Rostedt, Frederic Weisbecker, James Morris,
	linux-kernel, Eric Paris, kees.cook, agl, Serge E. Hallyn,
	Ingo Molnar, Andrew Morton, Tejun Heo, Michal Marek,
	Oleg Nesterov, Jiri Slaby, David Howells, Russell King,
	Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, Thomas Gleixner, H. Peter Anvin,
	x86, linux-arm-kernel, linux-mips, linuxppc-dev, linux-s390,
	linux-sh, sparclinux, Linus Torvalds


* Peter Zijlstra <peterz@infradead.org> wrote:

> On Tue, 2011-05-24 at 10:59 -0500, Will Drewry wrote:
> >  include/linux/ftrace_event.h  |    4 +-
> >  include/linux/perf_event.h    |   10 +++++---
> >  kernel/perf_event.c           |   49 +++++++++++++++++++++++++++++++++++++---
> >  kernel/seccomp.c              |    8 ++++++
> >  kernel/trace/trace_syscalls.c |   27 +++++++++++++++++-----
> >  5 files changed, 82 insertions(+), 16 deletions(-) 
> 
> I strongly oppose to the perf core being mixed with any sekurity voodoo
> (or any other active role for that matter).

I'd object to invisible side-effects as well, and vehemently so. But note how 
intelligently it's used here: it's explicit in the code, it's used explicitly 
in kernel/seccomp.c and the event generation place in 
kernel/trace/trace_syscalls.c.

So this is a really flexible solution IMO and does not extend events with some 
invisible 'active' role. It extends the *call site* with an open-coded active 
role - which active role btw. already pre-existed.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-24 19:54                                                                         ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-24 19:54 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-mips, linux-sh, Frederic Weisbecker, Heiko Carstens,
	Oleg Nesterov, David Howells, Paul Mackerras, Eric Paris,
	H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390, Russell King,
	x86, James Morris, Linus Torvalds, Ingo Molnar, kees.cook,
	Serge E. Hallyn, Steven Rostedt, Tejun Heo, Thomas Gleixner,
	linux-arm-kernel, Michal Marek, Michal Simek, Will Drewry,
	linuxppc-dev, linux-kernel, Ralf Baechle, Paul Mundt,
	Martin Schwidefsky, linux390, Andrew Morton, agl,
	David S. Miller


* Peter Zijlstra <peterz@infradead.org> wrote:

> On Tue, 2011-05-24 at 10:59 -0500, Will Drewry wrote:
> >  include/linux/ftrace_event.h  |    4 +-
> >  include/linux/perf_event.h    |   10 +++++---
> >  kernel/perf_event.c           |   49 +++++++++++++++++++++++++++++++++++++---
> >  kernel/seccomp.c              |    8 ++++++
> >  kernel/trace/trace_syscalls.c |   27 +++++++++++++++++-----
> >  5 files changed, 82 insertions(+), 16 deletions(-) 
> 
> I strongly oppose to the perf core being mixed with any sekurity voodoo
> (or any other active role for that matter).

I'd object to invisible side-effects as well, and vehemently so. But note how 
intelligently it's used here: it's explicit in the code, it's used explicitly 
in kernel/seccomp.c and the event generation place in 
kernel/trace/trace_syscalls.c.

So this is a really flexible solution IMO and does not extend events with some 
invisible 'active' role. It extends the *call site* with an open-coded active 
role - which active role btw. already pre-existed.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-24 19:54                                                                         ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-24 19:54 UTC (permalink / raw)
  To: linux-arm-kernel


* Peter Zijlstra <peterz@infradead.org> wrote:

> On Tue, 2011-05-24 at 10:59 -0500, Will Drewry wrote:
> >  include/linux/ftrace_event.h  |    4 +-
> >  include/linux/perf_event.h    |   10 +++++---
> >  kernel/perf_event.c           |   49 +++++++++++++++++++++++++++++++++++++---
> >  kernel/seccomp.c              |    8 ++++++
> >  kernel/trace/trace_syscalls.c |   27 +++++++++++++++++-----
> >  5 files changed, 82 insertions(+), 16 deletions(-) 
> 
> I strongly oppose to the perf core being mixed with any sekurity voodoo
> (or any other active role for that matter).

I'd object to invisible side-effects as well, and vehemently so. But note how 
intelligently it's used here: it's explicit in the code, it's used explicitly 
in kernel/seccomp.c and the event generation place in 
kernel/trace/trace_syscalls.c.

So this is a really flexible solution IMO and does not extend events with some 
invisible 'active' role. It extends the *call site* with an open-coded active 
role - which active role btw. already pre-existed.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-24 15:59                                                                     ` Will Drewry
  (?)
@ 2011-05-24 20:08                                                                       ` Ingo Molnar
  -1 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-24 20:08 UTC (permalink / raw)
  To: Will Drewry
  Cc: Steven Rostedt, Peter Zijlstra, Frederic Weisbecker,
	James Morris, linux-kernel, Eric Paris, kees.cook, agl,
	Serge E. Hallyn, Ingo Molnar, Andrew Morton, Tejun Heo,
	Michal Marek, Oleg Nesterov, Jiri Slaby, David Howells,
	Russell King, Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, Thomas Gleixner, H. Peter Anvin,
	x86, linux-arm-kernel, linux-mips, linuxppc-dev, linux-s390,
	linux-sh, sparclinux, Linus Torvalds


* Will Drewry <wad@chromium.org> wrote:

> The change avoids defining a new trace call type or a huge number of internal 
> changes and hides seccomp.mode=2 from ABI-exposure in prctl, but the attack 
> surface is non-trivial to verify, and I'm not sure if this ABI change makes 
> sense. It amounts to:
> 
>  include/linux/ftrace_event.h  |    4 +-
>  include/linux/perf_event.h    |   10 +++++---
>  kernel/perf_event.c           |   49 +++++++++++++++++++++++++++++++++++++---
>  kernel/seccomp.c              |    8 ++++++
>  kernel/trace/trace_syscalls.c |   27 +++++++++++++++++-----
>  5 files changed, 82 insertions(+), 16 deletions(-)
> 
> And can be found here: http://static.dataspill.org/perf_secure/v1/

Wow, i'm very impressed how few changes you needed to do to support this!

So, firstly, i don't think we should change perf_tp_event() at all - the 
'observer' codepaths should be unaffected.

But there could be a perf_tp_event_ret() or perf_tp_event_check() entry that 
code like seccomp which wants to use event results can use.

Also, i'm not sure about the seccomp details and assumptions that were moved 
into the perf core. How about passing in a helper function to 
perf_tp_event_check(), where seccomp would define its seccomp specific helper 
function?

That looks sufficiently flexible. That helper function could be an 'extra 
filter' kind of thing, right?

Also, regarding the ABI and the attr.err_on_discard and attr.require_secure 
bits, they look a bit too specific as well.

attr.err_on_discard: with the filter helper function passed in this is probably 
not needed anymore, right?

attr.require_secure: this is basically used to *force* the creation of 
security-controlling filters, right? It seems to me that this could be done via 
a seccomp ABI extension as well, without adding this to the perf ABI. That 
seccomp call could check whether the right events are created and move the task 
to mode 2 only if that prereq is met - or something like that.

> If there is any interest at all, I can post it properly to this giant
> CC list. [...]

I'd suggest to trim the Cc: list aggressively - anyone interested in the 
discussion can pick it up on lkml - and i strongly suspect that most of the Cc: 
participants would want to be off the Cc: :-)

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-24 20:08                                                                       ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-24 20:08 UTC (permalink / raw)
  To: Will Drewry
  Cc: linux-mips, linux-sh, Peter Zijlstra, Frederic Weisbecker,
	Heiko Carstens, Oleg Nesterov, David Howells, Paul Mackerras,
	Eric Paris, H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390,
	Russell King, x86, James Morris, Linus Torvalds, Ingo Molnar,
	kees.cook, Serge E. Hallyn, Steven Rostedt, Tejun Heo,
	Thomas Gleixner, linux-arm-kernel, Michal Marek, Michal Simek,
	linuxppc-dev, linux-kernel, Ralf Baechle, Paul Mundt,
	Martin Schwidefsky, linux390, Andrew Morton, agl,
	David S. Miller


* Will Drewry <wad@chromium.org> wrote:

> The change avoids defining a new trace call type or a huge number of internal 
> changes and hides seccomp.mode=2 from ABI-exposure in prctl, but the attack 
> surface is non-trivial to verify, and I'm not sure if this ABI change makes 
> sense. It amounts to:
> 
>  include/linux/ftrace_event.h  |    4 +-
>  include/linux/perf_event.h    |   10 +++++---
>  kernel/perf_event.c           |   49 +++++++++++++++++++++++++++++++++++++---
>  kernel/seccomp.c              |    8 ++++++
>  kernel/trace/trace_syscalls.c |   27 +++++++++++++++++-----
>  5 files changed, 82 insertions(+), 16 deletions(-)
> 
> And can be found here: http://static.dataspill.org/perf_secure/v1/

Wow, i'm very impressed how few changes you needed to do to support this!

So, firstly, i don't think we should change perf_tp_event() at all - the 
'observer' codepaths should be unaffected.

But there could be a perf_tp_event_ret() or perf_tp_event_check() entry that 
code like seccomp which wants to use event results can use.

Also, i'm not sure about the seccomp details and assumptions that were moved 
into the perf core. How about passing in a helper function to 
perf_tp_event_check(), where seccomp would define its seccomp specific helper 
function?

That looks sufficiently flexible. That helper function could be an 'extra 
filter' kind of thing, right?

Also, regarding the ABI and the attr.err_on_discard and attr.require_secure 
bits, they look a bit too specific as well.

attr.err_on_discard: with the filter helper function passed in this is probably 
not needed anymore, right?

attr.require_secure: this is basically used to *force* the creation of 
security-controlling filters, right? It seems to me that this could be done via 
a seccomp ABI extension as well, without adding this to the perf ABI. That 
seccomp call could check whether the right events are created and move the task 
to mode 2 only if that prereq is met - or something like that.

> If there is any interest at all, I can post it properly to this giant
> CC list. [...]

I'd suggest to trim the Cc: list aggressively - anyone interested in the 
discussion can pick it up on lkml - and i strongly suspect that most of the Cc: 
participants would want to be off the Cc: :-)

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-24 20:08                                                                       ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-24 20:08 UTC (permalink / raw)
  To: linux-arm-kernel


* Will Drewry <wad@chromium.org> wrote:

> The change avoids defining a new trace call type or a huge number of internal 
> changes and hides seccomp.mode=2 from ABI-exposure in prctl, but the attack 
> surface is non-trivial to verify, and I'm not sure if this ABI change makes 
> sense. It amounts to:
> 
>  include/linux/ftrace_event.h  |    4 +-
>  include/linux/perf_event.h    |   10 +++++---
>  kernel/perf_event.c           |   49 +++++++++++++++++++++++++++++++++++++---
>  kernel/seccomp.c              |    8 ++++++
>  kernel/trace/trace_syscalls.c |   27 +++++++++++++++++-----
>  5 files changed, 82 insertions(+), 16 deletions(-)
> 
> And can be found here: http://static.dataspill.org/perf_secure/v1/

Wow, i'm very impressed how few changes you needed to do to support this!

So, firstly, i don't think we should change perf_tp_event() at all - the 
'observer' codepaths should be unaffected.

But there could be a perf_tp_event_ret() or perf_tp_event_check() entry that 
code like seccomp which wants to use event results can use.

Also, i'm not sure about the seccomp details and assumptions that were moved 
into the perf core. How about passing in a helper function to 
perf_tp_event_check(), where seccomp would define its seccomp specific helper 
function?

That looks sufficiently flexible. That helper function could be an 'extra 
filter' kind of thing, right?

Also, regarding the ABI and the attr.err_on_discard and attr.require_secure 
bits, they look a bit too specific as well.

attr.err_on_discard: with the filter helper function passed in this is probably 
not needed anymore, right?

attr.require_secure: this is basically used to *force* the creation of 
security-controlling filters, right? It seems to me that this could be done via 
a seccomp ABI extension as well, without adding this to the perf ABI. That 
seccomp call could check whether the right events are created and move the task 
to mode 2 only if that prereq is met - or something like that.

> If there is any interest at all, I can post it properly to this giant
> CC list. [...]

I'd suggest to trim the Cc: list aggressively - anyone interested in the 
discussion can pick it up on lkml - and i strongly suspect that most of the Cc: 
participants would want to be off the Cc: :-)

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-24 19:54                                                                         ` Ingo Molnar
  (?)
@ 2011-05-24 20:10                                                                           ` Ingo Molnar
  -1 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-24 20:10 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Will Drewry, Steven Rostedt, Frederic Weisbecker, James Morris,
	linux-kernel, Eric Paris, kees.cook, agl, Serge E. Hallyn,
	Ingo Molnar, Andrew Morton, Tejun Heo, Michal Marek,
	Oleg Nesterov, Jiri Slaby, David Howells, Russell King,
	Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, Thomas Gleixner, H. Peter Anvin,
	x86, linux-arm-kernel, linux-mips, linuxppc-dev, linux-s390,
	linux-sh, sparclinux, Linus Torvalds


* Ingo Molnar <mingo@elte.hu> wrote:

> 
> * Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > On Tue, 2011-05-24 at 10:59 -0500, Will Drewry wrote:
> > >  include/linux/ftrace_event.h  |    4 +-
> > >  include/linux/perf_event.h    |   10 +++++---
> > >  kernel/perf_event.c           |   49 +++++++++++++++++++++++++++++++++++++---
> > >  kernel/seccomp.c              |    8 ++++++
> > >  kernel/trace/trace_syscalls.c |   27 +++++++++++++++++-----
> > >  5 files changed, 82 insertions(+), 16 deletions(-) 
> > 
> > I strongly oppose to the perf core being mixed with any sekurity voodoo
> > (or any other active role for that matter).
> 
> I'd object to invisible side-effects as well, and vehemently so. But note how 
> intelligently it's used here: it's explicit in the code, it's used explicitly 
> in kernel/seccomp.c and the event generation place in 
> kernel/trace/trace_syscalls.c.
> 
> So this is a really flexible solution IMO and does not extend events with 
> some invisible 'active' role. It extends the *call site* with an open-coded 
> active role - which active role btw. already pre-existed.

Also see my other mail - i think this seccomp code is too tied in to the perf 
core and ABI - but this is fixable IMO.

The fundamental notion that a generator subsystem of events can use filter 
results as well (such as kernel/trace/trace_syscalls.c.) for its own purposes 
is pretty robust though.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-24 20:10                                                                           ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-24 20:10 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-mips, linux-sh, Frederic Weisbecker, Heiko Carstens,
	Oleg Nesterov, David Howells, Paul Mackerras, Eric Paris,
	H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390, Russell King,
	x86, James Morris, Linus Torvalds, Ingo Molnar, kees.cook,
	Serge E. Hallyn, Steven Rostedt, Tejun Heo, Thomas Gleixner,
	linux-arm-kernel, Michal Marek, Michal Simek, Will Drewry,
	linuxppc-dev, linux-kernel, Ralf Baechle, Paul Mundt,
	Martin Schwidefsky, linux390, Andrew Morton, agl,
	David S. Miller


* Ingo Molnar <mingo@elte.hu> wrote:

> 
> * Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > On Tue, 2011-05-24 at 10:59 -0500, Will Drewry wrote:
> > >  include/linux/ftrace_event.h  |    4 +-
> > >  include/linux/perf_event.h    |   10 +++++---
> > >  kernel/perf_event.c           |   49 +++++++++++++++++++++++++++++++++++++---
> > >  kernel/seccomp.c              |    8 ++++++
> > >  kernel/trace/trace_syscalls.c |   27 +++++++++++++++++-----
> > >  5 files changed, 82 insertions(+), 16 deletions(-) 
> > 
> > I strongly oppose to the perf core being mixed with any sekurity voodoo
> > (or any other active role for that matter).
> 
> I'd object to invisible side-effects as well, and vehemently so. But note how 
> intelligently it's used here: it's explicit in the code, it's used explicitly 
> in kernel/seccomp.c and the event generation place in 
> kernel/trace/trace_syscalls.c.
> 
> So this is a really flexible solution IMO and does not extend events with 
> some invisible 'active' role. It extends the *call site* with an open-coded 
> active role - which active role btw. already pre-existed.

Also see my other mail - i think this seccomp code is too tied in to the perf 
core and ABI - but this is fixable IMO.

The fundamental notion that a generator subsystem of events can use filter 
results as well (such as kernel/trace/trace_syscalls.c.) for its own purposes 
is pretty robust though.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-24 20:10                                                                           ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-24 20:10 UTC (permalink / raw)
  To: linux-arm-kernel


* Ingo Molnar <mingo@elte.hu> wrote:

> 
> * Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > On Tue, 2011-05-24 at 10:59 -0500, Will Drewry wrote:
> > >  include/linux/ftrace_event.h  |    4 +-
> > >  include/linux/perf_event.h    |   10 +++++---
> > >  kernel/perf_event.c           |   49 +++++++++++++++++++++++++++++++++++++---
> > >  kernel/seccomp.c              |    8 ++++++
> > >  kernel/trace/trace_syscalls.c |   27 +++++++++++++++++-----
> > >  5 files changed, 82 insertions(+), 16 deletions(-) 
> > 
> > I strongly oppose to the perf core being mixed with any sekurity voodoo
> > (or any other active role for that matter).
> 
> I'd object to invisible side-effects as well, and vehemently so. But note how 
> intelligently it's used here: it's explicit in the code, it's used explicitly 
> in kernel/seccomp.c and the event generation place in 
> kernel/trace/trace_syscalls.c.
> 
> So this is a really flexible solution IMO and does not extend events with 
> some invisible 'active' role. It extends the *call site* with an open-coded 
> active role - which active role btw. already pre-existed.

Also see my other mail - i think this seccomp code is too tied in to the perf 
core and ABI - but this is fixable IMO.

The fundamental notion that a generator subsystem of events can use filter 
results as well (such as kernel/trace/trace_syscalls.c.) for its own purposes 
is pretty robust though.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-24 20:08                                                                       ` Ingo Molnar
  (?)
@ 2011-05-24 20:14                                                                         ` Steven Rostedt
  -1 siblings, 0 replies; 406+ messages in thread
From: Steven Rostedt @ 2011-05-24 20:14 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Will Drewry, Peter Zijlstra, Frederic Weisbecker, James Morris,
	linux-kernel, Eric Paris, kees.cook, agl, Serge E. Hallyn,
	Ingo Molnar, Andrew Morton, Tejun Heo, Michal Marek,
	Oleg Nesterov, Jiri Slaby, David Howells, Russell King,
	Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, Thomas Gleixner, H. Peter Anvin,
	x86, linux-arm-kernel, linux-mips, linuxppc-dev, linux-s390,
	linux-sh, sparclinux, Linus Torvalds

On Tue, 2011-05-24 at 22:08 +0200, Ingo Molnar wrote:
> * Will Drewry <wad@chromium.org> wrote:

> But there could be a perf_tp_event_ret() or perf_tp_event_check() entry that 
> code like seccomp which wants to use event results can use.

We should name it something else. The "perf_tp.." is a misnomer as it
has nothing to do with performance monitoring. "dynamic_event_.." maybe,
as it is dynamic to the effect that we can use jump labels to enable or
disable it.

-- Steve

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-24 20:14                                                                         ` Steven Rostedt
  0 siblings, 0 replies; 406+ messages in thread
From: Steven Rostedt @ 2011-05-24 20:14 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: linux-mips, linux-sh, Peter Zijlstra, Frederic Weisbecker,
	Heiko Carstens, Oleg Nesterov, David Howells, Paul Mackerras,
	Ralf Baechle, H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390,
	Russell King, x86, James Morris, Linus Torvalds, Ingo Molnar,
	kees.cook, Serge E. Hallyn, Tejun Heo, Thomas Gleixner,
	linux-arm-kernel, Michal Marek, Michal Simek, Will Drewry,
	linuxppc-dev, linux-kernel, Eric Paris, Paul Mundt,
	Martin Schwidefsky, linux390, Andrew Morton, agl,
	David S. Miller

On Tue, 2011-05-24 at 22:08 +0200, Ingo Molnar wrote:
> * Will Drewry <wad@chromium.org> wrote:

> But there could be a perf_tp_event_ret() or perf_tp_event_check() entry that 
> code like seccomp which wants to use event results can use.

We should name it something else. The "perf_tp.." is a misnomer as it
has nothing to do with performance monitoring. "dynamic_event_.." maybe,
as it is dynamic to the effect that we can use jump labels to enable or
disable it.

-- Steve

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-24 20:14                                                                         ` Steven Rostedt
  0 siblings, 0 replies; 406+ messages in thread
From: Steven Rostedt @ 2011-05-24 20:14 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, 2011-05-24 at 22:08 +0200, Ingo Molnar wrote:
> * Will Drewry <wad@chromium.org> wrote:

> But there could be a perf_tp_event_ret() or perf_tp_event_check() entry that 
> code like seccomp which wants to use event results can use.

We should name it something else. The "perf_tp.." is a misnomer as it
has nothing to do with performance monitoring. "dynamic_event_.." maybe,
as it is dynamic to the effect that we can use jump labels to enable or
disable it.

-- Steve

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-24 20:08                                                                       ` Ingo Molnar
                                                                                         ` (2 preceding siblings ...)
  (?)
@ 2011-05-24 20:25                                                                       ` Kees Cook
  2011-05-25 19:09                                                                         ` Ingo Molnar
  -1 siblings, 1 reply; 406+ messages in thread
From: Kees Cook @ 2011-05-24 20:25 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Will Drewry, linux-kernel

[CC trimmed, as recommended]

Hi,

On Tue, May 24, 2011 at 10:08:15PM +0200, Ingo Molnar wrote:
> * Will Drewry <wad@chromium.org> wrote:
> 
> > The change avoids defining a new trace call type or a huge number of internal 
> > changes and hides seccomp.mode=2 from ABI-exposure in prctl, but the attack 
> > surface is non-trivial to verify, and I'm not sure if this ABI change makes 
> > sense. It amounts to:
> > 
> >  include/linux/ftrace_event.h  |    4 +-
> >  include/linux/perf_event.h    |   10 +++++---
> >  kernel/perf_event.c           |   49 +++++++++++++++++++++++++++++++++++++---
> >  kernel/seccomp.c              |    8 ++++++
> >  kernel/trace/trace_syscalls.c |   27 +++++++++++++++++-----
> >  5 files changed, 82 insertions(+), 16 deletions(-)
> > 
> > And can be found here: http://static.dataspill.org/perf_secure/v1/
> 
> Wow, i'm very impressed how few changes you needed to do to support this!
> [...]
> attr.require_secure: this is basically used to *force* the creation of 
> security-controlling filters, right? It seems to me that this could be done via 
> a seccomp ABI extension as well, without adding this to the perf ABI. That 
> seccomp call could check whether the right events are created and move the task 
> to mode 2 only if that prereq is met - or something like that.

I understood the prctl() API that was outlined earlier, but it seems
this is not going to happen now. What would the programming API actually
look like for an application developer using this perf-style method?

-Kees

-- 
Kees Cook
Ubuntu Security Team

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-24 19:54                                                                         ` Ingo Molnar
  (?)
@ 2011-05-25 10:35                                                                           ` Thomas Gleixner
  -1 siblings, 0 replies; 406+ messages in thread
From: Thomas Gleixner @ 2011-05-25 10:35 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Peter Zijlstra, Will Drewry, Steven Rostedt, Frederic Weisbecker,
	James Morris, linux-kernel, Eric Paris, kees.cook, agl,
	Serge E. Hallyn, Ingo Molnar, Andrew Morton, Tejun Heo,
	Michal Marek, Oleg Nesterov, Jiri Slaby, David Howells,
	Russell King, Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, H. Peter Anvin, x86,
	linux-arm-kernel, linux-mips, linuxppc-dev, linux-s390, linux-sh,
	sparclinux, Linus Torvalds

On Tue, 24 May 2011, Ingo Molnar wrote:
> * Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > On Tue, 2011-05-24 at 10:59 -0500, Will Drewry wrote:
> > >  include/linux/ftrace_event.h  |    4 +-
> > >  include/linux/perf_event.h    |   10 +++++---
> > >  kernel/perf_event.c           |   49 +++++++++++++++++++++++++++++++++++++---
> > >  kernel/seccomp.c              |    8 ++++++
> > >  kernel/trace/trace_syscalls.c |   27 +++++++++++++++++-----
> > >  5 files changed, 82 insertions(+), 16 deletions(-) 
> > 
> > I strongly oppose to the perf core being mixed with any sekurity voodoo
> > (or any other active role for that matter).
> 
> I'd object to invisible side-effects as well, and vehemently so. But note how 
> intelligently it's used here: it's explicit in the code, it's used explicitly 
> in kernel/seccomp.c and the event generation place in 
> kernel/trace/trace_syscalls.c.
> 
> So this is a really flexible solution IMO and does not extend events with some 
> invisible 'active' role. It extends the *call site* with an open-coded active 
> role - which active role btw. already pre-existed.

We do _NOT_ make any decision based on the trace point so what's the
"pre-existing" active role in the syscall entry code?

I'm all for code reuse and reuse of interfaces, but this is completely
wrong. Instrumentation and security decisions are two fundamentally
different things and we want them kept separate. Instrumentation is
not meant to make decisions. Just because we can does not mean that it
is a good idea.

So what the current approach does is:

 - abuse the existing ftrace syscall hook by adding a return value to
   the tracepoint.

   So we need to propagate that for every tracepoint just because we
   have a single user.

 - abuse the perf per task mechanism

   Just because we have per task context in perf does not mean that we
   pull everything and the world which requires per task context into
   perf. The security folks have per task context already so security
   related stuff wants to go there.

 - abuse the perf/ftrace interfaces

   One of the arguments was that perf and ftrace have permission which
   are not available from the existing security interfaces. That's not
   at all a good reason to abuse these interfaces. Let the security
   folks sort out the problem on their end and do not impose any
   expectations on perf/ftrace which we have to carry around forever.

Yes, it can be made working with a relatively small patch, but it has
a very nasty side effect: 

  You add another user space visible ABI to the existing perf/ftrace
  mess which needs to be supported forever.

Brilliant, we have already two ABIs (perf/ftrace) to support and at
the same time we urgently need to solve the problem of better
integration of those two. So adding a third completely unrelated
component with a guaranteed ABI is just making this even more complex.

We can factor out the filtering code and let the security dudes reuse
it for their own purposes. That makes them to have their own
interfaces and does not impose any restrictions upon the tracing/perf
ones. And really security stuff wants to be integrated into the
existing security frameworks and not duct taped into perf/trace just
because it's a convenient hack around limitations of the existing
security stuff.

You really should stop to see everything as a nail just because the
only tool you have handy is the perf hammer. perf is about
instrumentation and we don't want to violate the oldest principle of
unix to have simple tools which do one thing and do it good.

Even swiss army knifes have the restriction that you can use only one
tool at a time unless you want to stick the corkscrew through your
palm when you try to cut bread.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-25 10:35                                                                           ` Thomas Gleixner
  0 siblings, 0 replies; 406+ messages in thread
From: Thomas Gleixner @ 2011-05-25 10:35 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: linux-mips, linux-sh, Peter Zijlstra, Frederic Weisbecker,
	Heiko Carstens, Oleg Nesterov, David Howells, Paul Mackerras,
	Eric Paris, H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390,
	Russell King, x86, James Morris, Linus Torvalds, Ingo Molnar,
	kees.cook, Serge E. Hallyn, Steven Rostedt, Martin Schwidefsky,
	linux-arm-kernel, Michal Marek, Michal Simek, Will Drewry,
	linuxppc-dev, linux-kernel, Ralf Baechle, Paul Mundt, Tejun Heo,
	linux390, Andrew Morton, agl, David S. Miller

On Tue, 24 May 2011, Ingo Molnar wrote:
> * Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > On Tue, 2011-05-24 at 10:59 -0500, Will Drewry wrote:
> > >  include/linux/ftrace_event.h  |    4 +-
> > >  include/linux/perf_event.h    |   10 +++++---
> > >  kernel/perf_event.c           |   49 +++++++++++++++++++++++++++++++++++++---
> > >  kernel/seccomp.c              |    8 ++++++
> > >  kernel/trace/trace_syscalls.c |   27 +++++++++++++++++-----
> > >  5 files changed, 82 insertions(+), 16 deletions(-) 
> > 
> > I strongly oppose to the perf core being mixed with any sekurity voodoo
> > (or any other active role for that matter).
> 
> I'd object to invisible side-effects as well, and vehemently so. But note how 
> intelligently it's used here: it's explicit in the code, it's used explicitly 
> in kernel/seccomp.c and the event generation place in 
> kernel/trace/trace_syscalls.c.
> 
> So this is a really flexible solution IMO and does not extend events with some 
> invisible 'active' role. It extends the *call site* with an open-coded active 
> role - which active role btw. already pre-existed.

We do _NOT_ make any decision based on the trace point so what's the
"pre-existing" active role in the syscall entry code?

I'm all for code reuse and reuse of interfaces, but this is completely
wrong. Instrumentation and security decisions are two fundamentally
different things and we want them kept separate. Instrumentation is
not meant to make decisions. Just because we can does not mean that it
is a good idea.

So what the current approach does is:

 - abuse the existing ftrace syscall hook by adding a return value to
   the tracepoint.

   So we need to propagate that for every tracepoint just because we
   have a single user.

 - abuse the perf per task mechanism

   Just because we have per task context in perf does not mean that we
   pull everything and the world which requires per task context into
   perf. The security folks have per task context already so security
   related stuff wants to go there.

 - abuse the perf/ftrace interfaces

   One of the arguments was that perf and ftrace have permission which
   are not available from the existing security interfaces. That's not
   at all a good reason to abuse these interfaces. Let the security
   folks sort out the problem on their end and do not impose any
   expectations on perf/ftrace which we have to carry around forever.

Yes, it can be made working with a relatively small patch, but it has
a very nasty side effect: 

  You add another user space visible ABI to the existing perf/ftrace
  mess which needs to be supported forever.

Brilliant, we have already two ABIs (perf/ftrace) to support and at
the same time we urgently need to solve the problem of better
integration of those two. So adding a third completely unrelated
component with a guaranteed ABI is just making this even more complex.

We can factor out the filtering code and let the security dudes reuse
it for their own purposes. That makes them to have their own
interfaces and does not impose any restrictions upon the tracing/perf
ones. And really security stuff wants to be integrated into the
existing security frameworks and not duct taped into perf/trace just
because it's a convenient hack around limitations of the existing
security stuff.

You really should stop to see everything as a nail just because the
only tool you have handy is the perf hammer. perf is about
instrumentation and we don't want to violate the oldest principle of
unix to have simple tools which do one thing and do it good.

Even swiss army knifes have the restriction that you can use only one
tool at a time unless you want to stick the corkscrew through your
palm when you try to cut bread.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-25 10:35                                                                           ` Thomas Gleixner
  0 siblings, 0 replies; 406+ messages in thread
From: Thomas Gleixner @ 2011-05-25 10:35 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, 24 May 2011, Ingo Molnar wrote:
> * Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > On Tue, 2011-05-24 at 10:59 -0500, Will Drewry wrote:
> > >  include/linux/ftrace_event.h  |    4 +-
> > >  include/linux/perf_event.h    |   10 +++++---
> > >  kernel/perf_event.c           |   49 +++++++++++++++++++++++++++++++++++++---
> > >  kernel/seccomp.c              |    8 ++++++
> > >  kernel/trace/trace_syscalls.c |   27 +++++++++++++++++-----
> > >  5 files changed, 82 insertions(+), 16 deletions(-) 
> > 
> > I strongly oppose to the perf core being mixed with any sekurity voodoo
> > (or any other active role for that matter).
> 
> I'd object to invisible side-effects as well, and vehemently so. But note how 
> intelligently it's used here: it's explicit in the code, it's used explicitly 
> in kernel/seccomp.c and the event generation place in 
> kernel/trace/trace_syscalls.c.
> 
> So this is a really flexible solution IMO and does not extend events with some 
> invisible 'active' role. It extends the *call site* with an open-coded active 
> role - which active role btw. already pre-existed.

We do _NOT_ make any decision based on the trace point so what's the
"pre-existing" active role in the syscall entry code?

I'm all for code reuse and reuse of interfaces, but this is completely
wrong. Instrumentation and security decisions are two fundamentally
different things and we want them kept separate. Instrumentation is
not meant to make decisions. Just because we can does not mean that it
is a good idea.

So what the current approach does is:

 - abuse the existing ftrace syscall hook by adding a return value to
   the tracepoint.

   So we need to propagate that for every tracepoint just because we
   have a single user.

 - abuse the perf per task mechanism

   Just because we have per task context in perf does not mean that we
   pull everything and the world which requires per task context into
   perf. The security folks have per task context already so security
   related stuff wants to go there.

 - abuse the perf/ftrace interfaces

   One of the arguments was that perf and ftrace have permission which
   are not available from the existing security interfaces. That's not
   at all a good reason to abuse these interfaces. Let the security
   folks sort out the problem on their end and do not impose any
   expectations on perf/ftrace which we have to carry around forever.

Yes, it can be made working with a relatively small patch, but it has
a very nasty side effect: 

  You add another user space visible ABI to the existing perf/ftrace
  mess which needs to be supported forever.

Brilliant, we have already two ABIs (perf/ftrace) to support and at
the same time we urgently need to solve the problem of better
integration of those two. So adding a third completely unrelated
component with a guaranteed ABI is just making this even more complex.

We can factor out the filtering code and let the security dudes reuse
it for their own purposes. That makes them to have their own
interfaces and does not impose any restrictions upon the tracing/perf
ones. And really security stuff wants to be integrated into the
existing security frameworks and not duct taped into perf/trace just
because it's a convenient hack around limitations of the existing
security stuff.

You really should stop to see everything as a nail just because the
only tool you have handy is the perf hammer. perf is about
instrumentation and we don't want to violate the oldest principle of
unix to have simple tools which do one thing and do it good.

Even swiss army knifes have the restriction that you can use only one
tool at a time unless you want to stick the corkscrew through your
palm when you try to cut bread.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-25 10:35                                                                           ` Thomas Gleixner
  (?)
@ 2011-05-25 15:01                                                                             ` Ingo Molnar
  -1 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-25 15:01 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Peter Zijlstra, Will Drewry, Steven Rostedt, Frederic Weisbecker,
	James Morris, linux-kernel, Eric Paris, kees.cook, agl,
	Serge E. Hallyn, Ingo Molnar, Andrew Morton, Tejun Heo,
	Michal Marek, Oleg Nesterov, Jiri Slaby, David Howells,
	Russell King, Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, H. Peter Anvin, x86,
	linux-arm-kernel, linux-mips, linuxppc-dev, linux-s390, linux-sh,
	sparclinux, Linus Torvalds


* Thomas Gleixner <tglx@linutronix.de> wrote:

> On Tue, 24 May 2011, Ingo Molnar wrote:
> > * Peter Zijlstra <peterz@infradead.org> wrote:
> > 
> > > On Tue, 2011-05-24 at 10:59 -0500, Will Drewry wrote:
> > > >  include/linux/ftrace_event.h  |    4 +-
> > > >  include/linux/perf_event.h    |   10 +++++---
> > > >  kernel/perf_event.c           |   49 +++++++++++++++++++++++++++++++++++++---
> > > >  kernel/seccomp.c              |    8 ++++++
> > > >  kernel/trace/trace_syscalls.c |   27 +++++++++++++++++-----
> > > >  5 files changed, 82 insertions(+), 16 deletions(-) 
> > > 
> > > I strongly oppose to the perf core being mixed with any sekurity voodoo
> > > (or any other active role for that matter).
> > 
> > I'd object to invisible side-effects as well, and vehemently so. But note how 
> > intelligently it's used here: it's explicit in the code, it's used explicitly 
> > in kernel/seccomp.c and the event generation place in 
> > kernel/trace/trace_syscalls.c.
> > 
> > So this is a really flexible solution IMO and does not extend events with some 
> > invisible 'active' role. It extends the *call site* with an open-coded active 
> > role - which active role btw. already pre-existed.
> 
> We do _NOT_ make any decision based on the trace point so what's the
> "pre-existing" active role in the syscall entry code?

The seccomp code we are discussing in this thread.

> I'm all for code reuse and reuse of interfaces, but this is completely
> wrong. Instrumentation and security decisions are two fundamentally
> different things and we want them kept separate. Instrumentation is
> not meant to make decisions. Just because we can does not mean that it
> is a good idea.

Instrumentation does not 'make decisions': the calling site, which is 
already emitting the event, wants to make decisions based on the 
same data that generates the event.

Those decisions *will be made* and you cannot prevent that, the only 
open question is can it reuse code intelligently, which code it is 
btw. already calling for observation reasons?

( Note that pure observers won't be affected and note that pure 
  observation call sites are not affected either. )

> So what the current approach does is:
> 
>  - abuse the existing ftrace syscall hook by adding a return value to
>    the tracepoint.
> 
>    So we need to propagate that for every tracepoint just because we
>    have a single user.

This is a technical criticism i share with you and i think it can be 
fixed - i outlined it to Will yesterday.

And no, if done cleanly it's not 'abuse' to reuse code. Could we wait 
for the first clean iteration of this patch instead of rushing 
judgement prematurely?

>  - abuse the perf per task mechanism
> 
>    Just because we have per task context in perf does not mean that we
>    pull everything and the world which requires per task context into
>    perf. The security folks have per task context already so security
>    related stuff wants to go there.

We do not pull 'everything and the world' in, but code that wants to 
process events in places that already emit events surely sounds 
related to me :-)

>  - abuse the perf/ftrace interfaces
> 
>    One of the arguments was that perf and ftrace have permission which
>    are not available from the existing security interfaces. That's not
>    at all a good reason to abuse these interfaces. Let the security
>    folks sort out the problem on their end and do not impose any
>    expectations on perf/ftrace which we have to carry around forever.
> 
> Yes, it can be made working with a relatively small patch, but it has
> a very nasty side effect: 
> 
>   You add another user space visible ABI to the existing perf/ftrace
>   mess which needs to be supported forever.

What mess? I'm not aware of a mess - other than the ftrace API which 
is not used by this patch.

> Brilliant, we have already two ABIs (perf/ftrace) to support and at
> the same time we urgently need to solve the problem of better
> integration of those two. So adding a third completely unrelated
> component with a guaranteed ABI is just making this even more complex.

So your solution is to add yet another ABI for seccomp and to keep 
seccomp a limited hack forever, just because you are not interested 
in security?

I think we want fewer ABIs and more flexible/reusable facilities.

> We can factor out the filtering code and let the security dudes 
> reuse it for their own purposes. That makes them to have their own 
> interfaces and does not impose any restrictions upon the 
> tracing/perf ones. And really security stuff wants to be integrated 
> into the existing security frameworks and not duct taped into 
> perf/trace just because it's a conveniant hack around limitiations 
> of the existing security stuff.

You are missing what i tried to point out in earlier discussions: 
from a security design POV this isn't just about the system call 
boundary. If this seccomp variant is based on events then it could 
grow proper security checks in other places as well, in places where 
we have some sort of object observation event anyway.

So this opens up possibilities to reuse and unify code on a very 
broad basis.

> You really should stop to see everything as a nail just because the 
> only tool you have handy is the perf hammer. perf is about 
> instrumentation and we don't want to violate the oldest principle 
> of unix to have simple tools which do one thing and do it good.

That is one of the most misunderstood principles of Unix.

The original Unix tool landscape was highly *integrated* and 
*unified*, into a very tight codebase that was maintained together. 
Yes, there were different, atomic, simple commands but it was all 
done together and it was a coherent whole and pleasant to use!

People misunderstood this as a license to fragment the heck out of 
functionality and build 'simple and stupid' tools that fit nowhere 
really and use different, incompatible principles not synced with 
each other. That is wrong and harmful.

So yes, over-integration is obviously wrong - but so is needless 
fragmentation.

Anyway, i still reserve judgement on this seccomp bit but the patches 
so far looked very promising with a very surprisingly small diffstat. 

If anything then that should tell you something that events and 
seccomp are not just casually related ...

> Even swiss army knifes have the restriction that you can use only 
> one tool at a time unless you want to stick the corkscrew through 
> your palm when you try to cut bread.

I'm not sure what you are arguing here - do you pine for the DOS days 
where you could only use one command at a time?

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-25 15:01                                                                             ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-25 15:01 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: linux-mips, linux-sh, Peter Zijlstra, Frederic Weisbecker,
	Heiko Carstens, Oleg Nesterov, David Howells, Paul Mackerras,
	Eric Paris, H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390,
	Russell King, x86, James Morris, Linus Torvalds, Ingo Molnar,
	kees.cook, Serge E. Hallyn, Steven Rostedt, Martin Schwidefsky,
	linux-arm-kernel, Michal Marek, Michal Simek, Will Drewry,
	linuxppc-dev, linux-kernel, Ralf Baechle, Paul Mundt, Tejun Heo,
	linux390, Andrew Morton, agl, David S. Miller


* Thomas Gleixner <tglx@linutronix.de> wrote:

> On Tue, 24 May 2011, Ingo Molnar wrote:
> > * Peter Zijlstra <peterz@infradead.org> wrote:
> > 
> > > On Tue, 2011-05-24 at 10:59 -0500, Will Drewry wrote:
> > > >  include/linux/ftrace_event.h  |    4 +-
> > > >  include/linux/perf_event.h    |   10 +++++---
> > > >  kernel/perf_event.c           |   49 +++++++++++++++++++++++++++++++++++++---
> > > >  kernel/seccomp.c              |    8 ++++++
> > > >  kernel/trace/trace_syscalls.c |   27 +++++++++++++++++-----
> > > >  5 files changed, 82 insertions(+), 16 deletions(-) 
> > > 
> > > I strongly oppose to the perf core being mixed with any sekurity voodoo
> > > (or any other active role for that matter).
> > 
> > I'd object to invisible side-effects as well, and vehemently so. But note how 
> > intelligently it's used here: it's explicit in the code, it's used explicitly 
> > in kernel/seccomp.c and the event generation place in 
> > kernel/trace/trace_syscalls.c.
> > 
> > So this is a really flexible solution IMO and does not extend events with some 
> > invisible 'active' role. It extends the *call site* with an open-coded active 
> > role - which active role btw. already pre-existed.
> 
> We do _NOT_ make any decision based on the trace point so what's the
> "pre-existing" active role in the syscall entry code?

The seccomp code we are discussing in this thread.

> I'm all for code reuse and reuse of interfaces, but this is completely
> wrong. Instrumentation and security decisions are two fundamentally
> different things and we want them kept separate. Instrumentation is
> not meant to make decisions. Just because we can does not mean that it
> is a good idea.

Instrumentation does not 'make decisions': the calling site, which is 
already emitting both the event and wants to do decisions based on 
the data that also generates the event wants to do decisions.

Those decisions *will be made* and you cannot prevent that, the only 
open question is can it reuse code intelligently, which code it is 
btw. already calling for observation reasons?

( Note that pure observers wont be affected and note that pure 
  observation call sites are not affected either. )

> So what the current approach does is:
> 
>  - abuse the existing ftrace syscall hook by adding a return value to
>    the tracepoint.
> 
>    So we need to propagate that for every tracepoint just because we
>    have a single user.

This is a technical criticism i share with you and i think it can be 
fixed - i outlined it to Will yesterday.

And no, if done cleanly it's not 'abuse' to reuse code. Could we wait 
for the first clean iteration of this patch instead of rushing 
judgement prematurely?

>  - abuse the perf per task mechanism
> 
>    Just because we have per task context in perf does not mean that we
>    pull everything and the world which requires per task context into
>    perf. The security folks have per task context already so security
>    related stuff wants to go there.

We do not pull 'everything and the world' in, but code that wants to 
process events in places that already emit events surely sounds 
related to me :-)

>  - abuse the perf/ftrace interfaces
> 
>    One of the arguments was that perf and ftrace have permission which
>    are not available from the existing security interfaces. That's not
>    at all a good reason to abuse these interfaces. Let the security
>    folks sort out the problem on their end and do not impose any
>    expectations on perf/ftrace which we have to carry around forever.
> 
> Yes, it can be made working with a relatively small patch, but it has
> a very nasty side effect: 
> 
>   You add another user space visible ABI to the existing perf/ftrace
>   mess which needs to be supported forever.

What mess? I'm not aware of a mess - other than the ftrace API which 
is not used by this patch.

> Brilliant, we have already two ABIs (perf/ftrace) to support and at
> the same time we urgently need to solve the problem of better
> integration of those two. So adding a third completely unrelated
> component with a guaranteed ABI is just making this even more complex.

So your solution is to add yet another ABI for seccomp and to keep 
seccomp a limited hack forever, just because you are not interested 
in security?

I think we want fewer ABIs and more flexible/reusable facilities.

> We can factor out the filtering code and let the security dudes 
> reuse it for their own purposes. That makes them to have their own 
> interfaces and does not impose any restrictions upon the 
> tracing/perf ones. And really security stuff wants to be integrated 
> into the existing security frameworks and not duct taped into 
> perf/trace just because it's a convenient hack around limitations 
> of the existing security stuff.

You are missing what i tried to point out in earlier discussions: 
from a security design POV this isnt just about the system call 
boundary. If this seccomp variant is based on events then it could 
grow proper security checks in other places as well, in places where 
we have some sort of object observation event anyway.

So this opens up possibilities to reuse and unify code on a very 
broad basis.

> You really should stop to see everything as a nail just because the 
> only tool you have handy is the perf hammer. perf is about 
> instrumentation and we don't want to violate the oldest principle 
> of unix to have simple tools which do one thing and do it good.

That is one of the most misunderstood principles of Unix.

The original Unix tool landscape was highly *integrated* and 
*unified*, into a very tight codebase that was maintained together. 
Yes, there were different, atomic, simple commands but it was all 
done together and it was a coherent whole and pleasant to use!

People misunderstood this as a license to fragment the heck out of 
functionality and build 'simple and stupid' tools that fit nowhere 
really and use different, incompatible principles not synced with 
each other. That is wrong and harmful.

So yes, over-integration is obviously wrong - but so is needless 
fragmentation.

Anyway, i still reserve judgement on this seccomp bit but the patches 
so far looked very promising with a very surprisingly small diffstat. 

If anything then that should tell you something that events and 
seccomp are not just casually related ...

> Even swiss army knifes have the restriction that you can use only 
> one tool at a time unless you want to stick the corkscrew through 
> your palm when you try to cut bread.

I'm not sure what you are arguing here - do you pine for the DOS days 
where you could only use one command at a time?

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-25 15:01                                                                             ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-25 15:01 UTC (permalink / raw)
  To: linux-arm-kernel


* Thomas Gleixner <tglx@linutronix.de> wrote:

> On Tue, 24 May 2011, Ingo Molnar wrote:
> > * Peter Zijlstra <peterz@infradead.org> wrote:
> > 
> > > On Tue, 2011-05-24 at 10:59 -0500, Will Drewry wrote:
> > > >  include/linux/ftrace_event.h  |    4 +-
> > > >  include/linux/perf_event.h    |   10 +++++---
> > > >  kernel/perf_event.c           |   49 +++++++++++++++++++++++++++++++++++++---
> > > >  kernel/seccomp.c              |    8 ++++++
> > > >  kernel/trace/trace_syscalls.c |   27 +++++++++++++++++-----
> > > >  5 files changed, 82 insertions(+), 16 deletions(-) 
> > > 
> > > I strongly oppose to the perf core being mixed with any sekurity voodoo
> > > (or any other active role for that matter).
> > 
> > I'd object to invisible side-effects as well, and vehemently so. But note how 
> > intelligently it's used here: it's explicit in the code, it's used explicitly 
> > in kernel/seccomp.c and the event generation place in 
> > kernel/trace/trace_syscalls.c.
> > 
> > So this is a really flexible solution IMO and does not extend events with some 
> > invisible 'active' role. It extends the *call site* with an open-coded active 
> > role - which active role btw. already pre-existed.
> 
> We do _NOT_ make any decision based on the trace point so what's the
> "pre-existing" active role in the syscall entry code?

The seccomp code we are discussing in this thread.

> I'm all for code reuse and reuse of interfaces, but this is completely
> wrong. Instrumentation and security decisions are two fundamentally
> different things and we want them kept separate. Instrumentation is
> not meant to make decisions. Just because we can does not mean that it
> is a good idea.

Instrumentation does not 'make decisions': the calling site, which is 
already emitting both the event and wants to do decisions based on 
the data that also generates the event wants to do decisions.

Those decisions *will be made* and you cannot prevent that, the only 
open question is can it reuse code intelligently, which code it is 
btw. already calling for observation reasons?

( Note that pure observers wont be affected and note that pure 
  observation call sites are not affected either. )

> So what the current approach does is:
> 
>  - abuse the existing ftrace syscall hook by adding a return value to
>    the tracepoint.
> 
>    So we need to propagate that for every tracepoint just because we
>    have a single user.

This is a technical criticism i share with you and i think it can be 
fixed - i outlined it to Will yesterday.

And no, if done cleanly it's not 'abuse' to reuse code. Could we wait 
for the first clean iteration of this patch instead of rushing 
judgement prematurely?

>  - abuse the perf per task mechanism
> 
>    Just because we have per task context in perf does not mean that we
>    pull everything and the world which requires per task context into
>    perf. The security folks have per task context already so security
>    related stuff wants to go there.

We do not pull 'everything and the world' in, but code that wants to 
process events in places that already emit events surely sounds 
related to me :-)

>  - abuse the perf/ftrace interfaces
> 
>    One of the arguments was that perf and ftrace have permission which
>    are not available from the existing security interfaces. That's not
>    at all a good reason to abuse these interfaces. Let the security
>    folks sort out the problem on their end and do not impose any
>    expectations on perf/ftrace which we have to carry around forever.
> 
> Yes, it can be made working with a relatively small patch, but it has
> a very nasty side effect: 
> 
>   You add another user space visible ABI to the existing perf/ftrace
>   mess which needs to be supported forever.

What mess? I'm not aware of a mess - other than the ftrace API which 
is not used by this patch.

> Brilliant, we have already two ABIs (perf/ftrace) to support and at
> the same time we urgently need to solve the problem of better
> integration of those two. So adding a third completely unrelated
> component with a guaranteed ABI is just making this even more complex.

So your solution is to add yet another ABI for seccomp and to keep 
seccomp a limited hack forever, just because you are not interested 
in security?

I think we want fewer ABIs and more flexible/reusable facilities.

> We can factor out the filtering code and let the security dudes 
> reuse it for their own purposes. That makes them to have their own 
> interfaces and does not impose any restrictions upon the 
> tracing/perf ones. And really security stuff wants to be integrated 
> into the existing security frameworks and not duct taped into 
> perf/trace just because it's a convenient hack around limitations 
> of the existing security stuff.

You are missing what i tried to point out in earlier discussions: 
from a security design POV this isnt just about the system call 
boundary. If this seccomp variant is based on events then it could 
grow proper security checks in other places as well, in places where 
we have some sort of object observation event anyway.

So this opens up possibilities to reuse and unify code on a very 
broad basis.

> You really should stop to see everything as a nail just because the 
> only tool you have handy is the perf hammer. perf is about 
> instrumentation and we don't want to violate the oldest principle 
> of unix to have simple tools which do one thing and do it good.

That is one of the most misunderstood principles of Unix.

The original Unix tool landscape was highly *integrated* and 
*unified*, into a very tight codebase that was maintained together. 
Yes, there were different, atomic, simple commands but it was all 
done together and it was a coherent whole and pleasant to use!

People misunderstood this as a license to fragment the heck out of 
functionality and build 'simple and stupid' tools that fit nowhere 
really and use different, incompatible principles not synced with 
each other. That is wrong and harmful.

So yes, over-integration is obviously wrong - but so is needless 
fragmentation.

Anyway, i still reserve judgement on this seccomp bit but the patches 
so far looked very promising with a very surprisingly small diffstat. 

If anything then that should tell you something that events and 
seccomp are not just casually related ...

> Even swiss army knifes have the restriction that you can use only 
> one tool at a time unless you want to stick the corkscrew through 
> your palm when you try to cut bread.

I'm not sure what you are arguing here - do you pine for the DOS days 
where you could only use one command at a time?

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-24 20:08                                                                       ` Ingo Molnar
                                                                                         ` (3 preceding siblings ...)
  (?)
@ 2011-05-25 16:40                                                                       ` Will Drewry
  -1 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-05-25 16:40 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Steven Rostedt, Peter Zijlstra, Frederic Weisbecker,
	James Morris, linux-kernel, Eric Paris, kees.cook,
	Serge E. Hallyn, Ingo Molnar, Thomas Gleixner

[trimmed the cc list to those who've expressed interest or strong opinions]

On Tue, May 24, 2011 at 3:08 PM, Ingo Molnar <mingo@elte.hu> wrote:
>
> * Will Drewry <wad@chromium.org> wrote:
>
>> The change avoids defining a new trace call type or a huge number of internal
>> changes and hides seccomp.mode=2 from ABI-exposure in prctl, but the attack
>> surface is non-trivial to verify, and I'm not sure if this ABI change makes
>> sense. It amounts to:
>>
>>  include/linux/ftrace_event.h  |    4 +-
>>  include/linux/perf_event.h    |   10 +++++---
>>  kernel/perf_event.c           |   49 +++++++++++++++++++++++++++++++++++++---
>>  kernel/seccomp.c              |    8 ++++++
>>  kernel/trace/trace_syscalls.c |   27 +++++++++++++++++-----
>>  5 files changed, 82 insertions(+), 16 deletions(-)
>>
>> And can be found here: http://static.dataspill.org/perf_secure/v1/
>
> Wow, i'm very impressed how few changes you needed to do to support this!

There's definitely a large overlap in the needed infrastructure (event
inheritance, event enabling, event registration) which makes changes
using it pretty small!

> So, firstly, i don't think we should change perf_tp_event() at all - the
> 'observer' codepaths should be unaffected.

Easy enough to do.

> But there could be a perf_tp_event_ret() or perf_tp_event_check() entry that
> code like seccomp which wants to use event results can use.
>
> Also, i'm not sure about the seccomp details and assumptions that were moved
> into the perf core. How about passing in a helper function to
> perf_tp_event_check(), where seccomp would define its seccomp specific helper
> function?

I'm curious how we'd register the seccomp callback and/or know when to
call it.  Would it just be a task-wide callback for task_context
events?  If so, will that mean only one callback_event user will be
able to function at a time (per task)?  [That's fine for seccomp
purposes, of course, but may incur further internal api pain later
unless it uses the list iterator function model that ftrace uses.]

> That looks sufficiently flexible. That helper function could be an 'extra
> filter' kind of thing, right?
>
> Also, regarding the ABI and the attr.err_on_discard and attr.require_secure
> bits, they look a bit too specific as well.
>
> attr.err_on_discard: with the filter helper function passed in this is probably
> not needed anymore, right?
>
> attr.require_secure: this is basically used to *force* the creation of
> security-controlling filters, right? It seems to me that this could be done via
> a seccomp ABI extension as well, without adding this to the perf ABI. That
> seccomp call could check whether the right events are created and move the task
> to mode 2 only if that prereq is met - or something like that.

Yeah - this can all be achieved with 0 changes to the perf path
entirely.  The part that causes me concern is the idea of reusing perf
events for a task that may or may not have been installed by the
caller of prctl(PR_SET_SECCOMP, 2) without any clear demarcation to
the API/ABI consumer.  It seems that to use this interface an
application will need to:
1. via libperf, drop all task-centric traces
2. via libperf, install some perf_events for the "allowed" events
3. call prctl(PR_SET_SECCOMP, 2).

Even then, it isn't necessarily clear to the user which events will
have a security impact and which won't.  It creates an implicit ABI
(if that makes sense) that will be hard to ensure strong security
guarantees for as perf changes.  It'd be possible to still do a
(PR_SET_SECCOMP_FILTER, eventid) approach to indicate which events are
expected, but at that point I'd be tempted to leave libperf out of it
again :)  Regardless, I'll pull together the patch as I understand the
proposal since code can be a bit easier to digest than the discussion.
 It seems to me that it's going to be hard to stay away from explicit
ABI changes, but perhaps I'm misunderstanding the ultimate direction
still.

If I'm way off base, please let me know :)  It'll save some extra patch-churn.

Thanks!
will

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-25 15:01                                                                             ` Ingo Molnar
  (?)
@ 2011-05-25 17:43                                                                               ` Peter Zijlstra
  -1 siblings, 0 replies; 406+ messages in thread
From: Peter Zijlstra @ 2011-05-25 17:43 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Thomas Gleixner, Will Drewry, Steven Rostedt,
	Frederic Weisbecker, James Morris, linux-kernel, Eric Paris,
	kees.cook, agl, Serge E. Hallyn, Ingo Molnar, Andrew Morton,
	Tejun Heo, Michal Marek, Oleg Nesterov, Jiri Slaby,
	David Howells, Russell King, Michal Simek, Ralf Baechle,
	Benjamin Herrenschmidt, Paul Mackerras, Martin Schwidefsky,
	Heiko Carstens, linux390, Paul Mundt, David S. Miller,
	H. Peter Anvin, x86, linux-arm-kernel, linux-mips, linuxppc-dev,
	linux-s390, linux-sh, sparclinux, Linus Torvalds

On Wed, 2011-05-25 at 17:01 +0200, Ingo Molnar wrote:
> > We do _NOT_ make any decision based on the trace point so what's the
> > "pre-existing" active role in the syscall entry code?
> 
> The seccomp code we are discussing in this thread. 

That isn't pre-existing, that's proposed.

But face it, you can argue until you're blue in the face, but both tglx
and I will NAK any and all patches that extend perf/ftrace beyond the
passive observing role.

Your arguments appear to be as non-persuasive to us as ours are to you,
so please drop this endeavor and let the security folks sort it on their
own and let's get back to doing useful work. 

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-25 17:43                                                                               ` Peter Zijlstra
  0 siblings, 0 replies; 406+ messages in thread
From: Peter Zijlstra @ 2011-05-25 17:43 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: linux-mips, linux-sh, Frederic Weisbecker, Heiko Carstens,
	Oleg Nesterov, David Howells, Paul Mackerras, Eric Paris,
	H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390, Russell King,
	x86, James Morris, Linus Torvalds, Ingo Molnar, kees.cook,
	Serge E. Hallyn, Steven Rostedt, Martin Schwidefsky,
	Thomas Gleixner, linux-arm-kernel, Michal Marek, Michal Simek,
	Will Drewry, linuxppc-dev, linux-kernel, Ralf Baechle,
	Paul Mundt, Tejun Heo, linux390, Andrew Morton, agl,
	David S. Miller

On Wed, 2011-05-25 at 17:01 +0200, Ingo Molnar wrote:
> > We do _NOT_ make any decision based on the trace point so what's the
> > "pre-existing" active role in the syscall entry code?
> 
> The seccomp code we are discussing in this thread. 

That isn't pre-existing, that's proposed.

But face it, you can argue until you're blue in the face, but both tglx
and I will NAK any and all patches that extend perf/ftrace beyond the
passive observing role.

Your arguments appear to be as non-persuasive to us as ours are to you,
so please drop this endeavor and let the security folks sort it on their
own and let's get back to doing useful work. 

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-25 17:43                                                                               ` Peter Zijlstra
  0 siblings, 0 replies; 406+ messages in thread
From: Peter Zijlstra @ 2011-05-25 17:43 UTC (permalink / raw)
  To: linux-arm-kernel

On Wed, 2011-05-25 at 17:01 +0200, Ingo Molnar wrote:
> > We do _NOT_ make any decision based on the trace point so what's the
> > "pre-existing" active role in the syscall entry code?
> 
> The seccomp code we are discussing in this thread. 

That isn't pre-existing, that's proposed.

But face it, you can argue until you're blue in the face, but both tglx
and I will NAK any and all patches that extend perf/ftrace beyond the
passive observing role.

Your arguments appear to be as non-persuasive to us as ours are to you,
so please drop this endeavor and let the security folks sort it on their
own and let's get back to doing useful work. 

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-25 15:01                                                                             ` Ingo Molnar
  (?)
@ 2011-05-25 17:48                                                                               ` Thomas Gleixner
  -1 siblings, 0 replies; 406+ messages in thread
From: Thomas Gleixner @ 2011-05-25 17:48 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Peter Zijlstra, Will Drewry, Steven Rostedt, Frederic Weisbecker,
	James Morris, linux-kernel, Eric Paris, kees.cook, agl,
	Serge E. Hallyn, Ingo Molnar, Andrew Morton, Tejun Heo,
	Michal Marek, Oleg Nesterov, Jiri Slaby, David Howells,
	Russell King, Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, H. Peter Anvin, x86,
	linux-arm-kernel, linux-mips, linuxppc-dev, linux-s390, linux-sh,
	sparclinux, Linus Torvalds

On Wed, 25 May 2011, Ingo Molnar wrote:
> * Thomas Gleixner <tglx@linutronix.de> wrote:
> > On Tue, 24 May 2011, Ingo Molnar wrote:
> > > * Peter Zijlstra <peterz@infradead.org> wrote:
> > > 
> > > > On Tue, 2011-05-24 at 10:59 -0500, Will Drewry wrote:
> > > > >  include/linux/ftrace_event.h  |    4 +-
> > > > >  include/linux/perf_event.h    |   10 +++++---
> > > > >  kernel/perf_event.c           |   49 +++++++++++++++++++++++++++++++++++++---
> > > > >  kernel/seccomp.c              |    8 ++++++
> > > > >  kernel/trace/trace_syscalls.c |   27 +++++++++++++++++-----
> > > > >  5 files changed, 82 insertions(+), 16 deletions(-) 
> > > > 
> > > > I strongly oppose to the perf core being mixed with any sekurity voodoo
> > > > (or any other active role for that matter).
> > > 
> > > I'd object to invisible side-effects as well, and vehemently so. But note how 
> > > intelligently it's used here: it's explicit in the code, it's used explicitly 
> > > in kernel/seccomp.c and the event generation place in 
> > > kernel/trace/trace_syscalls.c.
> > > 
> > > So this is a really flexible solution IMO and does not extend events with some 
> > > invisible 'active' role. It extends the *call site* with an open-coded active 
> > > role - which active role btw. already pre-existed.
> > 
> > We do _NOT_ make any decision based on the trace point so what's the
> > "pre-existing" active role in the syscall entry code?
> 
> The seccomp code we are discussing in this thread.

That's proposed code and has absolutely nothing to do with the
existing trace point semantics.
 
> > I'm all for code reuse and reuse of interfaces, but this is completely
> > wrong. Instrumentation and security decisions are two fundamentally
> > different things and we want them kept separate. Instrumentation is
> > not meant to make decisions. Just because we can does not mean that it
> > is a good idea.
> 
> Instrumentation does not 'make decisions': the calling site, which is 
> already emitting both the event and wants to do decisions based on 
> the data that also generates the event wants to do decisions.

You can repeat that as often as you want, it does not make it more
true. Fact is that the decision is made in the middle of the perf code.

> +     /* Transition the task if required. */
> +     if (ctx->type == task_context && event->attr.require_secure) {
> +#ifdef CONFIG_SECCOMP
> +		/* Don't allow perf events to escape mode = 1. */
> +		   if (!current->seccomp.mode)
> +			current->seccomp.mode = 2;
> +#endif
> +	}
 
and further down

> +   	    if (event->attr.err_on_discard)
> +		ok = -EACCES;
 
> Those decisions *will be made* and you cannot prevent that, the only 
> open question is can it reuse code intelligently, which code it is 
> btw. already calling for observation reasons?

The tracepoint is called for observation reasons and now you make it a
decision function. That's what I call abuse.

> ( Note that pure observers wont be affected and note that pure 
>   observation call sites are not affected either. )

Hahaha, they still have to run through the additional code when
seccomp is enabled and we still have to propagate the return value
down to the point where the tracepoint itself is. You call that not
affected?
 
> > So what the current approach does is:
> > 
> >  - abuse the existing ftrace syscall hook by adding a return value to
> >    the tracepoint.
> > 
> >    So we need to propagate that for every tracepoint just because we
> >    have a single user.
> 
> This is a technical criticism i share with you and i think it can be 
> fixed - i outlined it to Will yesterday.
> 
> And no, if done cleanly it's not 'abuse' to reuse code. Could we wait 
> for the first clean iteration of this patch instead of rushing 
> judgement prematurely?

There is no way to do it cleanly. It always comes for the price that
you add additional code into the tracing code path. And there are
other people who try hard to remove stuff to reduce the overhead which
is caused by instrumentation.

> >  - abuse the perf per task mechanism
> > 
> >    Just because we have per task context in perf does not mean that we
> >    pull everything and the world which requires per task context into
> >    perf. The security folks have per task context already so security
> >    related stuff wants to go there.
> 
> We do not pull 'everything and the world' in, but code that wants to 
> process events in places that already emit events surely sounds 
> related to me :-)

We have enough places where different independent parts of the kernel
want to hook into for obvious reasons.

We have notifiers for those where performance does not matter much and
we have separate calls into the independent functions where it matters
or where we need to evaluate the results in specific ways.

So now you turn instrumentation into a security mechanism, which
"works" nicely for a particular purpose, i.e. decision on a particular
syscall number. Now, how do you make that work when a decision has to
be made on more than a simple match, e.g. syscall number + arguments ?

Not at all, unless you add more complexity and arbitrary callbacks
into the instrumentation code.

> > Brilliant, we have already two ABIs (perf/ftrace) to support and at
> > the same time we urgently need to solve the problem of better
> > integration of those two. So adding a third completely unrelated
> > component with a guaranteed ABI is just making this even more complex.
> 
> So your solution is to add yet another ABI for seccomp and to keep 
> seccomp a limited hack forever, just because you are not interested 
> in security?

Well, I'm interested in security, but I'm neither interested in
security decisions inside instrumentation code nor in security models
which are limited hacks by definition unless you want to add callback
complexities to the instrumentation code.

It all looks nice and charming with this minimalistic use case, but
add real features to it and it gets messy in a split second and you
can't hold that off once you started to allow A. And you clearly
stated that you want to have more trace point based security features
than the simple syscall number filtering.

> I think we want fewer ABIs and more flexible/reusable facilities.

I'm all for that, but security and instrumentation are different
beasts.
 
> > We can factor out the filtering code and let the security dudes 
> > reuse it for their own purposes. That makes them to have their own 
> > interfaces and does not impose any restrictions upon the 
> > tracing/perf ones. And really security stuff wants to be integrated 
> > into the existing security frameworks and not duct taped into 
> > perf/trace just because it's a convenient hack around limitations 
> > of the existing security stuff.
> 
> You are missing what i tried to point out in earlier discussions: 
> from a security design POV this isnt just about the system call 
> boundary. If this seccomp variant is based on events then it could 
> grow proper security checks in other places as well, in places where 
> we have some sort of object observation event anyway.

Right, that requires callbacks and more stuff to do object based
observation and ties a trace point into a place where it might not
make sense after a while, but the security decision wants to stay at
that place. The syscall example is so tempting because it's simplistic
and easy to solve, but every extension to that model is going to
create a nightmare.

You are duct taping stuff together which has totally different
semantics and requirements. And your only argument is reuse of
existing code. That's a good argument in principle, but there is a
fundamental difference between intelligent reuse and enforcing it just
for reuse sake.

> So this opens up possibilities to reuse and unify code on a very 
> broad basis.

By making a total mess out of it? 

> So yes, over-integration is obviously wrong - but so is needless 
> fragmentation.

Right. And this falls into the category of over-integration. We have
people working on security and they are working on stacked security
models, so where is the justification to start another security
framework inside the instrumentation code which is completely non
interoperable with the existing ones?

> If anything then that should tell you something that events and 
> seccomp are not just casually related ...

They happen to have the hook at the same point in the source and for
pure coincidence it works because the problem to solve is extremely
simplistic. And that's why the diffstat is minimalistic, but that does
not prove anything.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-25 17:48                                                                               ` Thomas Gleixner
  0 siblings, 0 replies; 406+ messages in thread
From: Thomas Gleixner @ 2011-05-25 17:48 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: linux-mips, linux-sh, Peter Zijlstra, Frederic Weisbecker,
	Heiko Carstens, Oleg Nesterov, David Howells, Paul Mackerras,
	Eric Paris, H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390,
	Russell King, x86, James Morris, Linus Torvalds, Ingo Molnar,
	kees.cook, Serge E. Hallyn, Steven Rostedt, Martin Schwidefsky,
	linux-arm-kernel, Michal Marek, Michal Simek, Will Drewry,
	linuxppc-dev, linux-kernel, Ralf Baechle, Paul Mundt, Tejun Heo,
	linux390, Andrew Morton, agl, David S. Miller

On Wed, 25 May 2011, Ingo Molnar wrote:
> * Thomas Gleixner <tglx@linutronix.de> wrote:
> > On Tue, 24 May 2011, Ingo Molnar wrote:
> > > * Peter Zijlstra <peterz@infradead.org> wrote:
> > > 
> > > > On Tue, 2011-05-24 at 10:59 -0500, Will Drewry wrote:
> > > > >  include/linux/ftrace_event.h  |    4 +-
> > > > >  include/linux/perf_event.h    |   10 +++++---
> > > > >  kernel/perf_event.c           |   49 +++++++++++++++++++++++++++++++++++++---
> > > > >  kernel/seccomp.c              |    8 ++++++
> > > > >  kernel/trace/trace_syscalls.c |   27 +++++++++++++++++-----
> > > > >  5 files changed, 82 insertions(+), 16 deletions(-) 
> > > > 
> > > > I strongly oppose to the perf core being mixed with any sekurity voodoo
> > > > (or any other active role for that matter).
> > > 
> > > I'd object to invisible side-effects as well, and vehemently so. But note how 
> > > intelligently it's used here: it's explicit in the code, it's used explicitly 
> > > in kernel/seccomp.c and the event generation place in 
> > > kernel/trace/trace_syscalls.c.
> > > 
> > > So this is a really flexible solution IMO and does not extend events with some 
> > > invisible 'active' role. It extends the *call site* with an open-coded active 
> > > role - which active role btw. already pre-existed.
> > 
> > We do _NOT_ make any decision based on the trace point so what's the
> > "pre-existing" active role in the syscall entry code?
> 
> The seccomp code we are discussing in this thread.

That's proposed code and has absolutely nothing to do with the
existing trace point semantics.
 
> > I'm all for code reuse and reuse of interfaces, but this is completely
> > wrong. Instrumentation and security decisions are two fundamentally
> > different things and we want them kept separate. Instrumentation is
> > not meant to make decisions. Just because we can does not mean that it
> > is a good idea.
> 
> Instrumentation does not 'make decisions': the calling site, which is 
> already emitting both the event and wants to do decisions based on 
> the data that also generates the event wants to do decisions.

You can repeat that as often as you want, it does not make it more
true. Fact is that the decision is made in the middle of the perf code.

> +     /* Transition the task if required. */
> +     if (ctx->type == task_context && event->attr.require_secure) {
> +#ifdef CONFIG_SECCOMP
> +		/* Don't allow perf events to escape mode = 1. */
> +		   if (!current->seccomp.mode)
> +			current->seccomp.mode = 2;
> +#endif
> +	}
 
and further down

> +   	    if (event->attr.err_on_discard)
> +		ok = -EACCES;
 
> Those decisions *will be made* and you cannot prevent that, the only 
> open question is can it reuse code intelligently, which code it is 
> btw. already calling for observation reasons?

The tracepoint is called for observation reasons and now you make it a
decision function. That's what I call abuse.

> ( Note that pure observers wont be affected and note that pure 
>   observation call sites are not affected either. )

Hahaha, they still have to run through the additional code when
seccomp is enabled and we still have to propagate the return value
down to the point where the tracepoint itself is. You call that not
affected?
 
> > So what the current approach does is:
> > 
> >  - abuse the existing ftrace syscall hook by adding a return value to
> >    the tracepoint.
> > 
> >    So we need to propagate that for every tracepoint just because we
> >    have a single user.
> 
> This is a technical criticism i share with you and i think it can be 
> fixed - i outlined it to Will yesterday.
> 
> And no, if done cleanly it's not 'abuse' to reuse code. Could we wait 
> for the first clean iteration of this patch instead of rushing 
> judgement prematurely?

There is no way to do it cleanly. It always comes for the price that
you add additional code into the tracing code path. And there are
other people who try hard to remove stuff to reduce the overhead which
is caused by instrumentation.

> >  - abuse the perf per task mechanism
> > 
> >    Just because we have per task context in perf does not mean that we
> >    pull everything and the world which requires per task context into
> >    perf. The security folks have per task context already so security
> >    related stuff wants to go there.
> 
> We do not pull 'everything and the world' in, but code that wants to 
> process events in places that already emit events surely sounds 
> related to me :-)

We have enough places where different independent parts of the kernel
want to hook into for obvious reasons.

We have notifiers for those where performance does not matter much and
we have separate calls into the independent functions where it matters
or where we need to evaluate the results in specific ways.

So now you turn instrumentation into a security mechanism, which
"works" nicely for a particular purpose, i.e. decision on a particular
syscall number. Now, how do you make that work when a decision has to
be made on more than a simple match, e.g. syscall number + arguments ?

Not at all, unless you add more complexity and arbitrary callbacks
into the instrumentation code.

> > Brilliant, we have already two ABIs (perf/ftrace) to support and at
> > the same time we urgently need to solve the problem of better
> > integration of those two. So adding a third completely unrelated
> > component with a guaranteed ABI is just making this even more complex.
> 
> So your solution is to add yet another ABI for seccomp and to keep 
> seccomp a limited hack forever, just because you are not interested 
> in security?

Well, I'm interested in security, but I'm neither interested in
security decisions inside instrumentation code nor in security models
which are limited hacks by definition unless you want to add callback
complexities to the instrumentation code.

It all looks nice and charming with this minimalistic use case, but
add real features to it and it gets messy in a split second and you
can't hold that off once you started to allow A. And you clearly
stated that you want to have more trace point based security features
than the simple syscall number filtering.

> I think we want fewer ABIs and more flexible/reusable facilities.

I'm all for that, but security and instrumentation are different
beasts.
 
> > We can factor out the filtering code and let the security dudes 
> > reuse it for their own purposes. That makes them to have their own 
> > interfaces and does not impose any restrictions upon the 
> > tracing/perf ones. And really security stuff wants to be integrated 
> > into the existing security frameworks and not duct taped into 
> > perf/trace just because it's a convenient hack around limitations 
> > of the existing security stuff.
> 
> You are missing what i tried to point out in earlier discussions: 
> from a security design POV this isnt just about the system call 
> boundary. If this seccomp variant is based on events then it could 
> grow proper security checks in other places as well, in places where 
> we have some sort of object observation event anyway.

Right, that requires callbacks and more stuff to do object based
observation and ties a trace point into a place where it might not
make sense after a while, but the security decision wants to stay at
that place. The syscall example is so tempting because it's simplistic
and easy to solve, but every extension to that model is going to
create a nightmare.

You are duct taping stuff together which has totally different
semantics and requirements. And your only argument is reuse of
existing code. That's a good argument in principle, but there is a
fundamental difference between intelligent reuse and enforcing it just
for reuse sake.

> So this opens up possibilities to reuse and unify code on a very 
> broad basis.

By making a total mess out of it? 

> So yes, over-integration is obviously wrong - but so is needless 
> fragmentation.

Right. And this falls into the category of over-integration. We have
people working on security and they are working on stacked security
models, so where is the justification to start another security
framework inside the instrumentation code which is completely non
interoperable with the existing ones?

> If anything then that should tell you something that events and 
> seccomp are not just casually related ...

They happen to have the hook at the same point in the source and for
pure coincidence it works because the problem to solve is extremely
simplistic. And that's why the diffstat is minimalistic, but that does
not prove anything.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-25 17:48                                                                               ` Thomas Gleixner
  0 siblings, 0 replies; 406+ messages in thread
From: Thomas Gleixner @ 2011-05-25 17:48 UTC (permalink / raw)
  To: linux-arm-kernel

On Wed, 25 May 2011, Ingo Molnar wrote:
> * Thomas Gleixner <tglx@linutronix.de> wrote:
> > On Tue, 24 May 2011, Ingo Molnar wrote:
> > > * Peter Zijlstra <peterz@infradead.org> wrote:
> > > 
> > > > On Tue, 2011-05-24 at 10:59 -0500, Will Drewry wrote:
> > > > >  include/linux/ftrace_event.h  |    4 +-
> > > > >  include/linux/perf_event.h    |   10 +++++---
> > > > >  kernel/perf_event.c           |   49 +++++++++++++++++++++++++++++++++++++---
> > > > >  kernel/seccomp.c              |    8 ++++++
> > > > >  kernel/trace/trace_syscalls.c |   27 +++++++++++++++++-----
> > > > >  5 files changed, 82 insertions(+), 16 deletions(-) 
> > > > 
> > > > I strongly oppose to the perf core being mixed with any sekurity voodoo
> > > > (or any other active role for that matter).
> > > 
> > > I'd object to invisible side-effects as well, and vehemently so. But note how 
> > > intelligently it's used here: it's explicit in the code, it's used explicitly 
> > > in kernel/seccomp.c and the event generation place in 
> > > kernel/trace/trace_syscalls.c.
> > > 
> > > So this is a really flexible solution IMO and does not extend events with some 
> > > invisible 'active' role. It extends the *call site* with an open-coded active 
> > > role - which active role btw. already pre-existed.
> > 
> > We do _NOT_ make any decision based on the trace point so what's the
> > "pre-existing" active role in the syscall entry code?
> 
> The seccomp code we are discussing in this thread.

That's proposed code and has absolutely nothing to do with the
existing trace point semantics.
 
> > I'm all for code reuse and reuse of interfaces, but this is completely
> > wrong. Instrumentation and security decisions are two fundamentally
> > different things and we want them kept separate. Instrumentation is
> > not meant to make decisions. Just because we can does not mean that it
> > is a good idea.
> 
> Instrumentation does not 'make decisions': the calling site, which is 
> already emitting both the event and wants to do decisions based on 
> the data that also generates the event wants to do decisions.

You can repeat that as often as you want, it does not make it more
true. Fact is that the decision is made in the middle of the perf code.

> +     /* Transition the task if required. */
> +     if (ctx->type == task_context && event->attr.require_secure) {
> +#ifdef CONFIG_SECCOMP
> +		/* Don't allow perf events to escape mode = 1. */
> +		   if (!current->seccomp.mode)
> +			current->seccomp.mode = 2;
> +#endif
> +	}
 
and further down

> +   	    if (event->attr.err_on_discard)
> +		ok = -EACCES;
 
> Those decisions *will be made* and you cannot prevent that, the only 
> open question is can it reuse code intelligently, which code it is 
> btw. already calling for observation reasons?

The tracepoint is called for observation reasons and now you make it a
decision function. That's what I call abuse.

> ( Note that pure observers wont be affected and note that pure 
>   observation call sites are not affected either. )

Hahaha, they still have to run through the additional code when
seccomp is enabled and we still have to propagate the return value
down to the point where the tracepoint itself is. You call that not
affected?
 
> > So what the current approach does is:
> > 
> >  - abuse the existing ftrace syscall hook by adding a return value to
> >    the tracepoint.
> > 
> >    So we need to propagate that for every tracepoint just because we
> >    have a single user.
> 
> This is a technical criticism i share with you and i think it can be 
> fixed - i outlined it to Will yesterday.
> 
> And no, if done cleanly it's not 'abuse' to reuse code. Could we wait 
> for the first clean iteration of this patch instead of rushing 
> judgement prematurely?

There is no way to do it cleanly. It always comes for the price that
you add additional code into the tracing code path. And there are
other people who try hard to remove stuff to reduce the overhead which
is caused by instrumentation.

> >  - abuse the perf per task mechanism
> > 
> >    Just because we have per task context in perf does not mean that we
> >    pull everything and the world which requires per task context into
> >    perf. The security folks have per task context already so security
> >    related stuff wants to go there.
> 
> We do not pull 'everything and the world' in, but code that wants to 
> process events in places that already emit events surely sounds 
> related to me :-)

We have enough places where different independent parts of the kernel
want to hook into for obvious reasons.

We have notifiers for those where performance does not matter much and
we have separate calls into the independent functions where it matters
or where we need to evaluate the results in specific ways.

So now you turn instrumentation into a security mechanism, which
"works" nicely for a particular purpose, i.e. decision on a particular
syscall number. Now, how do you make that work when a decision has to
be made on more than a simple match, e.g. syscall number + arguments ?

Not at all, unless you add more complexity and arbitrary callbacks
into the instrumentation code.

> > Brilliant, we have already two ABIs (perf/ftrace) to support and at
> > the same time we urgently need to solve the problem of better
> > integration of those two. So adding a third completely unrelated
> > component with a guaranteed ABI is just making this even more complex.
> 
> So your solution is to add yet another ABI for seccomp and to keep 
> seccomp a limited hack forever, just because you are not interested 
> in security?

Well, I'm interested in security, but I'm neither interested in
security decisions inside instrumentation code nor in security models
which are limited hacks by definition unless you want to add callback
complexities to the instrumentation code.

It all looks nice and charming with this minimalistic use case, but
add real features to it and it gets messy in a split second and you
can't hold that off once you started to allow A. And you clearly
stated that you want to have more trace point based security features
than the simple syscall number filtering.

> I think we want fewer ABIs and more flexible/reusable facilities.

I'm all for that, but security and instrumentation are different
beasts.
 
> > We can factor out the filtering code and let the security dudes 
> > reuse it for their own purposes. That makes them to have their own 
> > interfaces and does not impose any restrictions upon the 
> > tracing/perf ones. And really security stuff wants to be integrated 
> > into the existing security frameworks and not duct taped into 
> > perf/trace just because it's a convenient hack around limitations 
> > of the existing security stuff.
> 
> You are missing what i tried to point out in earlier discussions: 
> from a security design POV this isnt just about the system call 
> boundary. If this seccomp variant is based on events then it could 
> grow proper security checks in other places as well, in places where 
> we have some sort of object observation event anyway.

Right, that requires callbacks and more stuff to do object based
observation and ties a trace point into a place where it might not
make sense after a while, but the security decision wants to stay at
that place. The syscall example is so tempting because it's simplistic
and easy to solve, but every extension to that model is going to
create a nightmare.

You are duct taping stuff together which has totally different
semantics and requirements. And your only argument is reuse of
existing code. That's a good argument in principle, but there is a
fundamental difference between intelligent reuse and enforcing it just
for reuse sake.

> So this opens up possibilities to reuse and unify code on a very 
> broad basis.

By making a total mess out of it? 

> So yes, over-integration is obviously wrong - but so is needless 
> fragmentation.

Right. And this falls into the category of over-integration. We have
people working on security and they are working on stacked security
models, so where is the justification to start another security
framework inside the instrumentation code which is completely non
interoperable with the existing ones?

> If anything then that should tell you something that events and 
> seccomp are not just casually related ...

They happen to have the hook at the same point in the source and for
pure coincidence it works because the problem to solve is extremely
simplistic. And that's why the diffstat is minimalistic, but that does
not prove anything.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-25 17:48                                                                               ` Thomas Gleixner
  (?)
  (?)
@ 2011-05-25 18:01                                                                               ` Kees Cook
  2011-05-25 18:42                                                                                 ` Linus Torvalds
  -1 siblings, 1 reply; 406+ messages in thread
From: Kees Cook @ 2011-05-25 18:01 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Ingo Molnar, Peter Zijlstra, Will Drewry, Steven Rostedt, linux-kernel

Hi,

On Wed, May 25, 2011 at 07:48:51PM +0200, Thomas Gleixner wrote:
> On Wed, 25 May 2011, Ingo Molnar wrote:
> > * Thomas Gleixner <tglx@linutronix.de> wrote:
> > > On Tue, 24 May 2011, Ingo Molnar wrote:
> > > > * Peter Zijlstra <peterz@infradead.org> wrote:
> > > > 
> > > > > On Tue, 2011-05-24 at 10:59 -0500, Will Drewry wrote:
> > > > > >  include/linux/ftrace_event.h  |    4 +-
> > > > > >  include/linux/perf_event.h    |   10 +++++---
> > > > > >  kernel/perf_event.c           |   49 +++++++++++++++++++++++++++++++++++++---
> > > > > >  kernel/seccomp.c              |    8 ++++++
> > > > > >  kernel/trace/trace_syscalls.c |   27 +++++++++++++++++-----
> > > > > >  5 files changed, 82 insertions(+), 16 deletions(-) 
> > > > > 
> > > > > I strongly oppose to the perf core being mixed with any sekurity voodoo
> > > > > (or any other active role for that matter).
> > > > 
> > > > I'd object to invisible side-effects as well, and vehemently so. But note how 
> > > > intelligently it's used here: it's explicit in the code, it's used explicitly 
> > > > in kernel/seccomp.c and the event generation place in 
> > > > kernel/trace/trace_syscalls.c.
> > > > 
> > > > So this is a really flexible solution IMO and does not extend events with some 
> > > > invisible 'active' role. It extends the *call site* with an open-coded active 
> > > > role - which active role btw. already pre-existed.
> > > 
> > > We do _NOT_ make any decision based on the trace point so what's the
> > > "pre-existing" active role in the syscall entry code?
> > 
> > The seccomp code we are discussing in this thread.
> 
> That's proposed code and has absolutely nothing to do with the
> existing trace point semantics.
>  
> > > I'm all for code reuse and reuse of interfaces, but this is completely
> > > wrong. Instrumentation and security decisions are two fundamentally
> > > different things and we want them kept separate. Instrumentation is
> > > not meant to make decisions. Just because we can does not mean that it
> > > is a good idea.
> > 
> > Instrumentation does not 'make decisions': the calling site, which is 
> > already emitting both the event and wants to do decisions based on 
> > the data that also generates the event wants to do decisions.
> 
> You can repeat that as often as you want, it does not make it more
> true. Fact is that the decision is made in the middle of the perf code.

Can we just go back to the original spec? A lot of people were excited
about the prctl() API as done in Will's earlier patchset, we don't lose the
extremely useful "enable_on_exec" feature, and we can get away from all
this disagreement.

-Kees

-- 
Kees Cook
Ubuntu Security Team

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-25 18:01                                                                               ` Kees Cook
@ 2011-05-25 18:42                                                                                 ` Linus Torvalds
  2011-05-25 19:06                                                                                   ` Ingo Molnar
                                                                                                     ` (3 more replies)
  0 siblings, 4 replies; 406+ messages in thread
From: Linus Torvalds @ 2011-05-25 18:42 UTC (permalink / raw)
  To: Kees Cook
  Cc: Thomas Gleixner, Ingo Molnar, Peter Zijlstra, Will Drewry,
	Steven Rostedt, linux-kernel

On Wed, May 25, 2011 at 11:01 AM, Kees Cook <kees.cook@canonical.com> wrote:
>
> Can we just go back to the original spec? A lot of people were excited
> about the prctl() API as done in Will's earlier patchset, we don't lose the
> extremely useful "enable_on_exec" feature, and we can get away from all
> this disagreement.

.. and quite frankly, I'm not even convinced about the original simpler spec.

Security is a morass. People come up with cool ideas every day, and
nobody actually uses them - or if they use them, they are just a
maintenance nightmare.

Quite frankly, limiting pathname access by some prefix is "cool", but
it's basically useless.

That's not where security problems are.

Security problems are in the odd corners - ioctl's, /proc files,
random small interfaces that aren't just about file access.

And who would *use* this thing in real life? Nobody. In order to sell
me on a new security interface, give me a real actual use case that is
security-conscious and relevant to real users.

For things like web servers that actually want to limit filename
lookup, we'd be *much* better off with a few new flags to
pathname lookup that say "don't follow symlinks" and "don't follow
'..'". Things like that can actually be beneficial to
security-conscious programming, with very little overhead. Some of
those things currently look up pathnames one component at a time,
because they can't afford to not do so. That's a *much* better model
for the whole "only limit to this subtree" case that was quoted
sometime early in this thread.

And per-system-call permissions are very dubious. What system calls
don't you want to succeed? That ioctl? You just made it impossible to
do a modern graphical application. Yet the kind of thing where we
would _want_ to help users is in making it easier to sandbox something
like the adobe flash player. But without accelerated direct rendering,
that's not going to fly, is it?

So I'm sorry for throwing cold water on you guys, but the whole "let's
come up with a new security gadget" thing just makes me go "oh no, not
again".

                    Linus

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-25 18:42                                                                                 ` Linus Torvalds
@ 2011-05-25 19:06                                                                                   ` Ingo Molnar
  2011-05-25 19:54                                                                                     ` Will Drewry
  2011-05-25 19:11                                                                                   ` Kees Cook
                                                                                                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 406+ messages in thread
From: Ingo Molnar @ 2011-05-25 19:06 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Kees Cook, Thomas Gleixner, Peter Zijlstra, Will Drewry,
	Steven Rostedt, linux-kernel


* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> And per-system-call permissions are very dubious. What system calls 
> don't you want to succeed? That ioctl? You just made it impossible 
> to do a modern graphical application. Yet the kind of thing where 
> we would _want_ to help users is in making it easier to sandbox 
> something like the adobe flash player. But without accelerated 
> direct rendering, that's not going to fly, is it?

I was under the impression that Will had a very specific application 
in mind which actually works today and uses the inferior version of 
seccomp.

Will, mind filling us in on that?

I'd agree that adding any of this without a real serious app making 
real use of it would be pointless. I discussed this under the 
impression that the app existed :-)

I also got the very distinct impression from the various iterations 
that a real usecase existed behind it - all the fixes and 
considerations looked very realistic, not designed up for security's 
sake.

> So I'm sorry for throwing cold water on you guys, but the whole 
> "let's come up with a new security gadget" thing just makes me go 
> "oh no, not again".

Fair enough :-)

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-24 20:25                                                                       ` Kees Cook
@ 2011-05-25 19:09                                                                         ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-25 19:09 UTC (permalink / raw)
  To: Kees Cook; +Cc: Will Drewry, linux-kernel


* Kees Cook <kees.cook@canonical.com> wrote:

> [CC trimmed, as recommended]
> 
> Hi,
> 
> On Tue, May 24, 2011 at 10:08:15PM +0200, Ingo Molnar wrote:
> > * Will Drewry <wad@chromium.org> wrote:
> > 
> > > The change avoids defining a new trace call type or a huge number of internal 
> > > changes and hides seccomp.mode=2 from ABI-exposure in prctl, but the attack 
> > > surface is non-trivial to verify, and I'm not sure if this ABI change makes 
> > > sense. It amounts to:
> > > 
> > >  include/linux/ftrace_event.h  |    4 +-
> > >  include/linux/perf_event.h    |   10 +++++---
> > >  kernel/perf_event.c           |   49 +++++++++++++++++++++++++++++++++++++---
> > >  kernel/seccomp.c              |    8 ++++++
> > >  kernel/trace/trace_syscalls.c |   27 +++++++++++++++++-----
> > >  5 files changed, 82 insertions(+), 16 deletions(-)
> > > 
> > > And can be found here: http://static.dataspill.org/perf_secure/v1/
> > 
> > Wow, i'm very impressed how few changes you needed to do to support this!
> > [...]
> > attr.require_secure: this is basically used to *force* the creation of 
> > security-controlling filters, right? It seems to me that this could be done via 
> > a seccomp ABI extension as well, without adding this to the perf ABI. That 
> > seccomp call could check whether the right events are created and move the task 
> > to mode 2 only if that prereq is met - or something like that.
> 
> I understood the prctl() API that was outlined earlier, but it 
> seems this is not going to happen now. What would the programming 
> API actually look like for an application developer using this 
> perf-style method?

Well, this API is probably not going to happen either ;-)

The way is to create a perf event and install a filter by passing the 
filters ASCII string as a pointer to the kernel, using 
PERF_EVENT_IOC_SET_FILTER on the event fd. If this ever gets used 
seriously then it should probably move into its own system call - but 
that is a detail.

Installing a filter can be safely done by unprivileged user-space 
(the kernel checks it), and they get inherited across fork(), are 
properly per task, etc.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-25 18:42                                                                                 ` Linus Torvalds
  2011-05-25 19:06                                                                                   ` Ingo Molnar
@ 2011-05-25 19:11                                                                                   ` Kees Cook
  2011-05-25 20:01                                                                                     ` Linus Torvalds
  2011-05-26  1:19                                                                                   ` James Morris
  2011-05-29 16:51                                                                                   ` Aneesh Kumar K.V
  3 siblings, 1 reply; 406+ messages in thread
From: Kees Cook @ 2011-05-25 19:11 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Thomas Gleixner, Ingo Molnar, Peter Zijlstra, Will Drewry,
	Steven Rostedt, linux-kernel

Hi Linus,

On Wed, May 25, 2011 at 11:42:44AM -0700, Linus Torvalds wrote:
> And who would *use* this thing in real life? Nobody. In order to sell
> me on a new security interface, give me a real actual use case that is
> security-conscious and relevant to real users.
> [...]
> And per-system-call permissions are very dubious. What system calls
> don't you want to succeed? That ioctl? You just made it impossible to
> do a modern graphical application. Yet the kind of thing where we
> would _want_ to help users is in making it easier to sandbox something
> like the adobe flash player. But without accelerated direct rendering,
> that's not going to fly, is it?

Uhm, what? Chrome would use it. And LXC would. Those were stated very
early on as projects extremely interested in syscall filtering. And that's
just the start, I can easily imagine Apache modules enforcing a very narrow
band of syscalls, or just about anything else that could be in a position
of running potentially malicious code. This could be very far-reaching, IMO.

-Kees

-- 
Kees Cook
Ubuntu Security Team

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-25 19:06                                                                                   ` Ingo Molnar
@ 2011-05-25 19:54                                                                                     ` Will Drewry
  0 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-05-25 19:54 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Linus Torvalds, Kees Cook, Thomas Gleixner, Peter Zijlstra,
	Steven Rostedt, linux-kernel

On Wed, May 25, 2011 at 2:06 PM, Ingo Molnar <mingo@elte.hu> wrote:
>
> * Linus Torvalds <torvalds@linux-foundation.org> wrote:
>
>> And per-system-call permissions are very dubious. What system calls
>> don't you want to succeed? That ioctl? You just made it impossible
>> to do a modern graphical application. Yet the kind of thing where
>> we would _want_ to help users is in making it easier to sandbox
>> something like the adobe flash player. But without accelerated
>> direct rendering, that's not going to fly, is it?
>
> I was under the impression that Will had a very specific application
> in mind which actually works today and uses the inferior version of
> seccomp.
>
> Will, mind filling us in on that?

With pleasure!  I'll be a bit overly verbose to ensure I'm covering my
bases, I hope it's not too tedious.

Support for using system call filtering will be added to the Chromium
browser if it is accepted here.  At present, Chromium separates the
processing of untrusted input (html, javascript, images) into
standalone renderer processes.  In an effort to reduce the risks
associated with processing the data we put those renderers in a chroot
with a private VFS and PID namespace. This limits the ability for a
compromised renderer to signal() another process outside of the
"sandbox" or access files it shouldn't.

Ideally, the only exposed surface to the renderer would be the IPC
mechanism, memory allocation, etc.  That isn't possible today though
[*].  The renderer gets the whole syscall ABI.  In many cases, adding
support for (all of the) LSMs to the sandboxing methodology would help
mitigate the exposure.  There would be the code paths that handle the
user input prior to calling the LSM hooks, but after that point, the
renderer could be denied, shutdown, etc.  Unfortunately, there's no
one-to-one mapping from system calls to LSM hooks (nor do all stock
kernels from distros come with a pre-chosen and configured LSM).

To supply some concreteness, the perf_counter_open() system call comes
to mind.  It suffered from a stack-based buffer overflow when
processing the user-supplied arguments, and there was no effective
mechanism, LSM or otherwise, to prevent its access.  In my usecase, if
only a whitelist of required system calls was made available to the
Chromium renderer processes, then the addition of a bug like
perf_counter_open()'s to the kernel would not have provided a direct
means to escape the user-level sandboxing and execute arbitrary code
in the kernel.


As I mentioned, if it is possible to expand seccomp to provide a
system call access mechanism (bitmask, whatever),  I will expand the
Chromium sandbox to make use of it on every linux distro that ships
with it enabled.  In addition, my immediate work focus is on Chromium
OS.  I would like to apply system call filtering to every daemon in
the distribution alongside additional security defenses.  Also, I am
aware of many server-side uses but can't promise immediate deployment
in the same fashion.



[It's also worth noting that as more browser plugins, like Adobe
Flash, migrate to the Pepper API (chrome,mozilla), they will no longer
need direct hardware access (ioctl()s, fs, etc).  All system access
will be brokered via the browser which lets them be sandboxed entirely
-- including system call filtering if supported by the host platform.]

[*] it is possible to do crazy, on-the-fly syscall rewriting with
seccomp(1) and a trusted thread, but the performance cost is huge, the
portability is nil (pure asm), and the risk of a security bug is high.

> I'd agree that adding any of this without a real serious app making
> real use of it would be pointless. I discussed this under the
> impression that the app existed :-)
>
> I also got the very distinct impression from the various iterations
> that a real usecase existed behind it - all the fixes and
> considerations looked very realistic, not designed up for security's
> sake.
>
>> So I'm sorry for throwing cold water on you guys, but the whole
>> "let's come up with a new security gadget" thing just makes me go
>> "oh no, not again".
>
> Fair enough :-)

I don't want to boil the ocean and certainly am not interested in
reliving the LSM-wars. I want the missing piece of the puzzle when it
comes to reducing exposed kernel code.  seccomp.mode=1 is so close,
but its overly restrictive nature has made it implausible for nearly
all real-world uses.  A slight expansion to allow a system call
bitmask or simple filters would be sufficient for Chromium OS,
Chromium, qemu, and lxc use, among others.


Thanks for reading and replying!
will

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-25 19:11                                                                                   ` Kees Cook
@ 2011-05-25 20:01                                                                                     ` Linus Torvalds
  2011-05-25 20:19                                                                                       ` Ingo Molnar
  2011-05-26 14:37                                                                                       ` Colin Walters
  0 siblings, 2 replies; 406+ messages in thread
From: Linus Torvalds @ 2011-05-25 20:01 UTC (permalink / raw)
  To: Kees Cook
  Cc: Thomas Gleixner, Ingo Molnar, Peter Zijlstra, Will Drewry,
	Steven Rostedt, linux-kernel

On Wed, May 25, 2011 at 12:11 PM, Kees Cook <kees.cook@canonical.com> wrote:
>
> Uhm, what? Chrome would use it. And LXC would. Those were stated very
> early on as projects extremely interested in syscall filtering.

.. and I seriously doubt it is workable.

Or at least it needs some actual working proof-of-concept thing.
Exactly because of issues like direct rendering etc, that require some
of the nastier system calls to work at all.

As to your example of apache modules - last I saw, most of those were
written in high-level scripting languages that almost invariably end
up using quite a bit of the system call interfaces. And more
importantly, almost nobody does unportable code.

So hey, I'm willing to be convinced. But I'll need more than people
_saying_ that they'd be interested. Because judging by past
performance, nobody ever uses esoteric cool new features.

                           Linus

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-25 20:01                                                                                     ` Linus Torvalds
@ 2011-05-25 20:19                                                                                       ` Ingo Molnar
  2011-06-09  9:00                                                                                         ` Sven Anders
  2011-05-26 14:37                                                                                       ` Colin Walters
  1 sibling, 1 reply; 406+ messages in thread
From: Ingo Molnar @ 2011-05-25 20:19 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Kees Cook, Thomas Gleixner, Peter Zijlstra, Will Drewry,
	Steven Rostedt, linux-kernel


* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> On Wed, May 25, 2011 at 12:11 PM, Kees Cook <kees.cook@canonical.com> wrote:
> >
> > Uhm, what? Chrome would use it. And LXC would. Those were stated very
> > early on as projects extremely interested in syscall filtering.
> 
> .. and I seriously doubt it is workable.
> 
> Or at least it needs some actual working proof-of-concept thing.
> Exactly because of issues like direct rendering etc, that require some
> of the nastier system calls to work at all.

Btw., Will's patch in this thread (which i think he tested with real 
code) implements an approach which detaches the concept from a rigid 
notion of 'syscall filtering' and opens it up for things like 
reliable pathname checks, memory object checks, etc. - without having 
to change the ABI.

If we go for syscall filtering as per bitmask, then we've pretty much 
condemned this to be limited to the syscall boundary alone.

So this sandboxing concept looked flexible enough to me to work 
itself up the security concept food chain *embedded in apps*.

<flame>

IMHO the key design mistake of LSM is that it detaches security 
policy from applications: you need to be admin to load policies, you 
need to be root to use/configure an LSM. Dammit, you need to be root 
to add labels to files!

This not only makes the LSM policies distro specific (and needlessly 
forked and detached from real security), but also gives the message 
that:

 'to ensure your security you need to be privileged'

which is the anti-concept of good security IMO.

So why not give unprivileged security policy facilities and let 
*Apps* shape their own security models. Yes, they will mess up 
initially and will reinvent the wheel. But socially IMO it will work 
a *lot* better in the long run: it's not imposed on them 
*externally*, it's something they can use and grow gradually. They 
will experience the security holes first hand and they will be *able 
to do something strategic about them* if we give them the right 
facilities.

At least the Chrome browser project appears to be intent on following 
such an approach. I consider a more bazaar-like approach more 
healthy, and it very much needs kernel help as LSMs are isolated from 
apps right now.

The thing is, we cannot possibly make the LSM situation much worse 
than it is today: i see *ALL* of the LSMs focused on all the wrong 
things!

But yes, i can understand that you are deeply sceptical.

</flame>

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-25 18:42                                                                                 ` Linus Torvalds
  2011-05-25 19:06                                                                                   ` Ingo Molnar
  2011-05-25 19:11                                                                                   ` Kees Cook
@ 2011-05-26  1:19                                                                                   ` James Morris
  2011-05-26  6:08                                                                                     ` Avi Kivity
  2011-05-26  8:24                                                                                     ` Ingo Molnar
  2011-05-29 16:51                                                                                   ` Aneesh Kumar K.V
  3 siblings, 2 replies; 406+ messages in thread
From: James Morris @ 2011-05-26  1:19 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Kees Cook, Thomas Gleixner, Ingo Molnar, Peter Zijlstra,
	Will Drewry, Steven Rostedt, linux-kernel, Avi Kivity, gnatapov,
	Chris Wright

On Wed, 25 May 2011, Linus Torvalds wrote:

> And per-system-call permissions are very dubious. What system calls
> don't you want to succeed? That ioctl? You just made it impossible to
> do a modern graphical application. Yet the kind of thing where we
> would _want_ to help users is in making it easier to sandbox something
> like the adobe flash player. But without accelerated direct rendering,
> that's not going to fly, is it?

Going back to the initial idea proposed by Will, where seccomp is simply 
extended to filter all syscalls, there is potential benefit in being able 
to limit the attack surface of the syscall API.

This is not security mediation in terms of interaction between things 
(e.g. "allow A to read B").  It's a _hardening_ feature which prevents a 
process from being able to invoke potentially hundreds of syscalls it has 
no need for.  It would allow us to usefully restrict some well-established 
attack modes, e.g. triggering bugs in kernel code via unneeded syscalls.

This is orthogonal to access control schemes (such as SELinux), which are 
about mediating security-relevant interactions between objects.

One area of possible use is KVM/Qemu, where processes now contain entire 
operating systems, and the attack surface between them is now much broader 
e.g. a local unprivileged vulnerability is now effectively a 'remote' full 
system compromise.

There has been some discussion of this within the KVM project.  Using the 
existing seccomp facility is problematic in that it requires significant 
reworking of Qemu to a privsep model, which would also then incur a likely 
unacceptable context switching overhead.  The generalized seccomp filter 
as proposed by Will would provide a significant reduction in exposed 
syscalls and thus guest->host attack surface.

I've cc'd some KVM folk for more input on how this may or may not meet 
their requirements -- Avi/Gleb, there's a background writeup here: 
http://lwn.net/Articles/442569/ .  We may need a proof of concept and/or 
commitment to use this feature for it to be accepted upstream.


- James
-- 
James Morris
<jmorris@namei.org>

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-26  1:19                                                                                   ` James Morris
@ 2011-05-26  6:08                                                                                     ` Avi Kivity
  2011-05-26  8:24                                                                                     ` Ingo Molnar
  1 sibling, 0 replies; 406+ messages in thread
From: Avi Kivity @ 2011-05-26  6:08 UTC (permalink / raw)
  To: James Morris
  Cc: Linus Torvalds, Kees Cook, Thomas Gleixner, Ingo Molnar,
	Peter Zijlstra, Will Drewry, Steven Rostedt, linux-kernel,
	gnatapov, Chris Wright, Eric Paris

On 05/26/2011 04:19 AM, James Morris wrote:
> On Wed, 25 May 2011, Linus Torvalds wrote:
>
> >  And per-system-call permissions are very dubious. What system calls
> >  don't you want to succeed? That ioctl? You just made it impossible to
> >  do a modern graphical application. Yet the kind of thing where we
> >  would _want_ to help users is in making it easier to sandbox something
> >  like the adobe flash player. But without accelerated direct rendering,
> >  that's not going to fly, is it?
>
> Going back to the initial idea proposed by Will, where seccomp is simply
> extended to filter all syscalls, there is potential benefit in being able
> to limit the attack surface of the syscall API.
>
> This is not security mediation in terms of interaction between things
> (e.g. "allow A to read B").  It's a _hardening_ feature which prevents a
> process from being able to invoke potentially hundreds of syscalls it has
> no need for.  It would allow us to usefully restrict some well-established
> attack modes, e.g. triggering bugs in kernel code via unneeded syscalls.
>
> This is orthogonal to access control schemes (such as SELinux), which are
> about mediating security-relevant interactions between objects.
>
> One area of possible use is KVM/Qemu, where processes now contain entire
> operating systems, and the attack surface between them is now much broader
> e.g. a local unprivileged vulnerability is now effectively a 'remote' full
> system compromise.
>
> There has been some discussion of this within the KVM project.  Using the
> existing seccomp facility is problematic in that it requires significant
> reworking of Qemu to a privsep model, which would also then incur a likely
> unacceptable context switching overhead.  The generalized seccomp filter
> as proposed by Will would provide a significant reduction in exposed
> syscalls and thus guest->host attack surface.
>
> I've cc'd some KVM folk for more input on how this may or may not meet
> their requirements -- Avi/Gleb, there's a background writeup here:
> http://lwn.net/Articles/442569/ .  We may need a proof of concept and/or
> commitment to use this feature for it to be accepted upstream.

Indeed, we were looking at sandboxing as a means to mitigate the "guest 
exploits qemu, proceeds to exploit host syscall interface" scenario, and 
evolved seccomp looks like the best tradeoff in terms of security gains 
vs effort needed.

Eric Paris (copied) prototyped this with his own version of enhanced 
seccomp and achieved pretty good results, so a proof of concept will be 
quite easy to provide.

Regarding dynamic filtering, the biggest question here is how this will 
interact with hotplug, which requires new files to be opened in the 
sandboxed process (or SCM_RIGHTed in).  Any fd-based filtering will 
defeat that, so we'll need some way for a privileged monitor to adjust 
filters.

-- 
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.


^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-16  0:36                                 ` James Morris
  (?)
@ 2011-05-26  6:27                                   ` Pavel Machek
  -1 siblings, 0 replies; 406+ messages in thread
From: Pavel Machek @ 2011-05-26  6:27 UTC (permalink / raw)
  To: James Morris
  Cc: Ingo Molnar, linux-mips, linux-sh, Peter Zijlstra,
	Frederic Weisbecker, Heiko Carstens, Oleg Nesterov,
	David Howells, Paul Mackerras, Eric Paris, H. Peter Anvin,
	sparclinux, Jiri Slaby, linux-s390, Russell King, x86,
	Linus Torvalds, Ingo Molnar, Benjamin Herrenschmidt, kees.cook,
	Serge E. Hallyn, Peter Zijlstra, Steven Rostedt, Tejun Heo,
	Thomas Gleixner, linux-arm-kernel, Michal Marek, Michal Simek,
	Will Drewry, linuxppc-dev, linux-kernel, Ralf Baechle,
	Paul Mundt, Martin Schwidefsky, linux390, Andrew Morton, agl,
	David S. Miller

  On Mon 2011-05-16 10:36:05, James Morris wrote:
> On Fri, 13 May 2011, Ingo Molnar wrote:
> How do you reason about the behavior of the system as a whole?
> 
> 
> > I argue that this is the LSM and audit subsystems designed right: in the long 
> > run it could allow everything that LSM does at the moment - and so much more 
> > ...
> 
> Now you're proposing a redesign of the security subsystem.  That's a 
> significant undertaking.
> 
> In the meantime, we have a simple, well-defined enhancement to seccomp 
> which will be very useful to current users in reducing their kernel attack 
> surface.

Well, you can do the same with subterfugue, even without kernel
changes. But that's ptrace -- slow. (And it already shows that syscall
based filters are extremely tricky to configure).

If you want speed, seccomp+server for non-permitted operations seems like a reasonable way.

-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-26  6:27                                   ` Pavel Machek
  0 siblings, 0 replies; 406+ messages in thread
From: Pavel Machek @ 2011-05-26  6:27 UTC (permalink / raw)
  To: James Morris
  Cc: linux-mips, linux-sh, Peter Zijlstra, Frederic Weisbecker,
	Heiko Carstens, linux-kernel, David Howells, Paul Mackerras,
	Ralf Baechle, H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390,
	Russell King, x86, agl, Ingo Molnar, Ingo Molnar,
	Serge E. Hallyn, Peter Zijlstra, Steven Rostedt,
	Martin Schwidefsky, Thomas Gleixner, kees.cook, linux-arm-kernel,
	Michal Marek, Michal Simek, Will Drewry, linuxppc-dev,
	Oleg Nesterov, Eric Paris, Paul Mundt, Tejun Heo, linux390,
	Andrew Morton, Linus Torvalds, David S. Miller

  On Mon 2011-05-16 10:36:05, James Morris wrote:
> On Fri, 13 May 2011, Ingo Molnar wrote:
> How do you reason about the behavior of the system as a whole?
> 
> 
> > I argue that this is the LSM and audit subsystems designed right: in the long 
> > run it could allow everything that LSM does at the moment - and so much more 
> > ...
> 
> Now you're proposing a redesign of the security subsystem.  That's a 
> significant undertaking.
> 
> In the meantime, we have a simple, well-defined enhancement to seccomp 
> which will be very useful to current users in reducing their kernel attack 
> surface.

Well, you can do the same with subterfugue, even without kernel
changes. But that's ptrace -- slow. (And it already shows that syscall
based filters are extremely tricky to configure).

If you want speed, seccomp+server for non-permitted operations seems like a reasonable way.

-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-26  6:27                                   ` Pavel Machek
  0 siblings, 0 replies; 406+ messages in thread
From: Pavel Machek @ 2011-05-26  6:27 UTC (permalink / raw)
  To: linux-arm-kernel

  On Mon 2011-05-16 10:36:05, James Morris wrote:
> On Fri, 13 May 2011, Ingo Molnar wrote:
> How do you reason about the behavior of the system as a whole?
> 
> 
> > I argue that this is the LSM and audit subsystems designed right: in the long 
> > run it could allow everything that LSM does at the moment - and so much more 
> > ...
> 
> Now you're proposing a redesign of the security subsystem.  That's a 
> significant undertaking.
> 
> In the meantime, we have a simple, well-defined enhancement to seccomp 
> which will be very useful to current users in reducing their kernel attack 
> surface.

Well, you can do the same with subterfugue, even without kernel
changes. But that's ptrace -- slow. (And it already shows that syscall
based filters are extremely tricky to configure).

If you want speed, seccomp+server for non-permitted operations seems like a reasonable way.

-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-26  1:19                                                                                   ` James Morris
  2011-05-26  6:08                                                                                     ` Avi Kivity
@ 2011-05-26  8:24                                                                                     ` Ingo Molnar
  2011-05-26  8:35                                                                                       ` Pekka Enberg
  2011-05-26  8:49                                                                                       ` Avi Kivity
  1 sibling, 2 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-26  8:24 UTC (permalink / raw)
  To: James Morris
  Cc: Linus Torvalds, Kees Cook, Thomas Gleixner, Peter Zijlstra,
	Will Drewry, Steven Rostedt, linux-kernel, Avi Kivity, gnatapov,
	Chris Wright, Pekka Enberg


* James Morris <jmorris@namei.org> wrote:

> On Wed, 25 May 2011, Linus Torvalds wrote:
> 
> > And per-system-call permissions are very dubious. What system 
> > calls don't you want to succeed? That ioctl? You just made it 
> > impossible to do a modern graphical application. Yet the kind of 
> > thing where we would _want_ to help users is in making it easier 
> > to sandbox something like the adobe flash player. But without 
> > accelerated direct rendering, that's not going to fly, is it?
> 
> Going back to the initial idea proposed by Will, where seccomp is 
> simply extended to filter all syscalls, there is potential benefit 
> in being able to limit the attack surface of the syscall API.

If controlling the system call boundary is found to be useful then 
the next logical step is to realize that limiting it to 
*only* the syscall boundary is shortsighted.

Also, here's a short reminder of the complexity evolution of this 
patch-set, which i've followed since it's been first posted in 2009:

       bitmask (2009):  6 files changed,  194 insertions(+), 22 deletions(-)
 filter engine (2010): 18 files changed, 1100 insertions(+), 21 deletions(-)
 event filters (2011):  5 files changed,   82 insertions(+), 16 deletions(-)

Interestingly, the events version is *by far* the most flexible one 
in both the short and the long run, and it is also the smallest patch 
...

It's a perfect fit and that's not really surprising: system call 
boundary hardening is about filtering various key parameters - while 
event tracing is about filtering various key parameters as well.

But it goes further than that: SELinux security policies are in 
essence primitive event filters as well, on an abstract level - see 
below for more details.

And yes, the primitive, coarse, per syscall allow/disallow bitmask v1 
version would not be too painful to the core kernel in terms of code 
impact and interaction with other code (it does not interact at all) 
- but it would still be sadly shortsighted to not explore the event 
filters angle, now that we have actual working code.

It would not improve the LSM situation one tiny bit either - the 
bitmask design would guarantee that the seccomp approach can never 
seriously replace the sucky LSM concepts we have in the kernel today.

> This is not security mediation in terms of interaction between 
> things (e.g. "allow A to read B").  It's a _hardening_ feature 
> which prevents a process from being able to invoke potentially 
> hundreds of syscalls it has no need for.  It would allow us to 
> usefully restrict some well-established attack modes, e.g. 
> triggering bugs in kernel code via unneeded syscalls.

If you think about it then you'll see that this artificial 
distinction between 'mediation' and 'hardening' is nonsense!

If we add the appropriate file label field to VFS tracing events 
(which would be useful for many instrumentation reasons as well) then 
the event filtering variant of Will's patch:

   _will be able to do object level security mediation too_

What is at the core of every access control concept, be that DAC, 
MAC, RBAC or ACL? Flexible task specific set of access vectors to 
file and other labeled objects, which cannot be circumvented by that 
task.

How can we implement a user-space file object manager via Will's 
event filters approach? It's actually pretty easy:

 - a simple object manager wants to know 'who' does something, 'what'
   it is trying to access, and then wants to generate an allow/deny
   action as a function of the (who,what) parameters:

     - The 'who' is a given as the event filters are per
       task, so different tasks with different roles can have
       different event filters. This is the equivalent of the current
       task's security context. [ Event filters installed by the
       parent cannot be removed by child tasks (they cannot even read
       them - it's transparent). ]

     - The most finegrained representation of 'what' are inode
       numbers. Because we do not want to generate rules for every
       single object we want to group objects and want to define
       access rules on different groups. This can be done by defining
       an event that emits file labels.

So a simple object manager would simply use file label event 
attributes and would define simple rules like:

	"(label & tmp_t) || (label & user_home_t)"

to allow access to /tmp and /home files. Filters allow us to define 
arbitrary access vectors to objects in essence. The above filters get 
passed to the kernel as an ASCII string by the object manager task, 
where the filter engine parses it safely and creates atomic 
predicates out of it, which can then be executed at the source of any 
event.

[ We could even implement a transparent AVC-cache equivalent for 
  filters, should the complexity and popularity of them increase: 
  ASCII filters lend themselves very well to hash based caches. ]

Similarly, support for other types of object tagging, network labels, 
etc. can be added as well with little pain: they can be added without 
any change to the basic ABI! Using events filters here makes it a 
very extensible security concept.

It is capable of implementing the functional equivalent of most MAC, 
DAC, RBAC and other access control concepts, purely in user-space - 
in addition to 'hardening' (which btw. is really access control too, 
in disguise).

Obviously it is all layered: it is only allowed to control access to 
objects all the other security concepts allow for it to access - i.e. 
this is an unprivileged LSM, a per application security layer if you 
will, that can further refine security policies.

In terms of security models this event filters approach is 
unconditional goodness in my opinion.

> This is orthogonal to access control schemes (such as SELinux), 
> which are about mediating security-relevant interactions between 
> objects.

It's only 'orthogonal' because IMO you make two fundamental mistakes:

 1) You arbitrarily limit SELinux to object based security measures 
    alone.

    Which is not even true btw: SELinux certainly has some hooks it
    uses for pragmatic non-object hardening: for example all the
    places where we fall back to capabilities are places where
    there's a method based restriction not object based restriction.

    The KDSKBENT ioctl check for example in 
    security/selinux/hooks.c::selinux_file_ioctl(), or 
    selinux_vm_enough_memory(), or the CAP_MAC_ADMIN exception in
    selinux_inode_getsecurity() all violate 'pure' DAC concepts but
    obviously for pragmatic reasons SELinux is doing these ...

    mmap_min_addr is a borderline method restriction feature as well:
    it does not really control access to the underlying object (RAM),
    but controls one (of many) access methods to it by controlling
    virtual memory ...

    So SELinux, in a rather hypocritical fashion is already involved 
    in hardening and in filtering, because obviously any practical 
    and pragmatic security system *has to*.

 2) You arbitrarily limit Will's patch to *not* be able to
    implement object based security mechanisms. Why?

Syscall hardening and object based access rules are *deeply* 
connected, conceptually they are subsets of one and the same thing: a 
good, organic security model controlling different hierarchies of 
physical and derived (virtual) resources, which allows flexible 
control of both objects *and* methods.

The 'methods' (the syscalls and other functionality) are *also* a 
derived resource so it's entirely legitimate to control access to 
them. Yes, because they are methods you can also try to use them to 
restrict access to underlying objects - this is what AppArmor is 
about mostly, and yes i agree that in the general case it's not a 
particularly robust method.

And yes, i fully submit that object access control has theoretical 
advantages and it should often be the primary measure that gives a 
robust, often provable backbone to a secure system.

But you'd be out of your mind to not recognize:

 - The utility of controlling access methods (as resources) as well, 
   both to reduce the attack surface in the implementation of those 
   methods, and to allow the easy summary control of objects where 
   there's only a low number of (and often only a single!) access 
   method.

 - The utility of unprivileged security frameworks.

 - The utility of stackable security features. (defense in depth, 
   anyone?)

Will's astonishingly small patch:

 event filters (2011):  5 files changed,   82 insertions(+), 16 deletions(-)

Gives us *all three* of those, while also allowing user-space 
implemented MAC, DAC, RBAC as well.

> One area of possible use is KVM/Qemu, where processes now contain 
> entire operating systems, and the attack surface between them is 
> now much broader e.g. a local unprivileged vulnerability is now 
> effectively a 'remote' full system compromise.

Note that the main reason why Qemu needs access method hardening is 
because it has a dominantly state machine based design which does not 
lend itself very well to an object manager security design.

Note that tools/kvm/ would probably like to implement its own object 
manager model as well in addition to access method restrictions: by 
being virtual hardware it deals with many resources and object 
hierarchies that are simply not known to the host OS's LSM.

Unlike Qemu tools/kvm/ has a design that is very fit for MAC 
concepts: it uses separate helper threads for separate resources 
(this could in many cases even be changed to be separate processes 
which only share access to the guest RAM image) - while Qemu is in 
most parts a state machine, so in tools/kvm/ we can realistically 
have a good object manager and keep an exploit in a networking 
interface driver from being able to access disk driver state.

(I've Cc:-ed Pekka for tools/kvm/.)

> There has been some discussion of this within the KVM project.  
> Using the existing seccomp facility is problematic in that it 
> requires significant reworking of Qemu to a privsep model, which 
> would also then incur a likely unacceptable context switching 
> overhead.  The generalized seccomp filter as proposed by Will would 
> provide a significant reduction in exposed syscalls and thus 
> guest->host attack surface.

... and the event filter based method would *also* allow MAC to be 
defined over physical resources, such as virtual network interfaces, 
virtual disk devices, etc.

You are seriously limiting the capabilities of this feature for no 
good reason i can recognize.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-26  8:24                                                                                     ` Ingo Molnar
@ 2011-05-26  8:35                                                                                       ` Pekka Enberg
  2011-05-26  8:49                                                                                       ` Avi Kivity
  1 sibling, 0 replies; 406+ messages in thread
From: Pekka Enberg @ 2011-05-26  8:35 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: James Morris, Linus Torvalds, Kees Cook, Thomas Gleixner,
	Peter Zijlstra, Will Drewry, Steven Rostedt, linux-kernel,
	Avi Kivity, gnatapov, Chris Wright, Pekka Enberg, Sasha Levin

Hi Ingo,

On Thu, May 26, 2011 at 11:24 AM, Ingo Molnar <mingo@elte.hu> wrote:
> Unlike Qemu tools/kvm/ has a design that is very fit for MAC
> concepts: it uses separate helper threads for separate resources
> (this could in many cases even be changed to be separate processes
> which only share access to the guest RAM image) - while Qemu is in
> most parts a state machine, so in tools/kvm/ we can realistically
> have a good object manager and keep an exploit in a networking
> interface driver from being able to access disk driver state.

I haven't really followed this particular discussion nor do I know if
Qemu is good or bad fit but sure, for tools/kvm Chrome-style
sandboxing makes tons of sense and would be a pretty good fit for how
our device model works now.

                        Pekka

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-26  6:27                                   ` Pavel Machek
  (?)
@ 2011-05-26  8:35                                     ` Ingo Molnar
  -1 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-26  8:35 UTC (permalink / raw)
  To: Pavel Machek
  Cc: James Morris, linux-mips, linux-sh, Peter Zijlstra,
	Frederic Weisbecker, Heiko Carstens, Oleg Nesterov,
	David Howells, Paul Mackerras, Eric Paris, H. Peter Anvin,
	sparclinux, Jiri Slaby, linux-s390, Russell King, x86,
	Linus Torvalds, Ingo Molnar, Benjamin Herrenschmidt, kees.cook,
	Serge E. Hallyn, Peter Zijlstra, Steven Rostedt, Tejun Heo,
	Thomas Gleixner, linux-arm-kernel, Michal Marek, Michal Simek,
	Will Drewry, linuxppc-dev, linux-kernel, Ralf Baechle,
	Paul Mundt, Martin Schwidefsky, linux390, Andrew Morton, agl,
	David S. Miller


* Pavel Machek <pavel@ucw.cz> wrote:

>   On Mon 2011-05-16 10:36:05, James Morris wrote:
> > On Fri, 13 May 2011, Ingo Molnar wrote:
> > How do you reason about the behavior of the system as a whole?
> > 
> > 
> > > I argue that this is the LSM and audit subsystems designed right: in the long 
> > > run it could allow everything that LSM does at the moment - and so much more 
> > > ...
> > 
> > Now you're proposing a redesign of the security subsystem.  That's a 
> > significant undertaking.
> > 
> > In the meantime, we have a simple, well-defined enhancement to seccomp 
> > which will be very useful to current users in reducing their kernel attack 
> > surface.
> 
> Well, you can do the same with subterfugue, even without kernel 
> changes. But that's ptrace -- slow. (And it already shows that 
> syscall based filters are extremely tricky to configure).

Yes, if you use syscall based filters to implement access to 
underlying objects where the access methods do not capture essential 
lifetime events properly (such as files) then you'll quickly run into 
trouble achieving a secure solution.

But you can robustly use syscall filters to control the underlying 
primary *resource*: various pieces of kernel code with *negative* 
utility to the current app - which have no use to the app but pose 
risks in terms of potential exploits in them.

But you can use event filters to implement arbitrary security 
policies robustly.

For example file objects: if you generate the right events for a 
class of objects then you can control access to them very robustly.

It's not a surprise that this is what SELinux does primarily: it has 
lifetime event hooks at the inode object (and socket, packet, etc.) 
level and captures those access attempts and validates them against 
the permissions of that object, in light of the accessing task's 
credentials.

Exactly that can be done with Will's patch as well, if its potential 
scope of event-checking points is not stupidly limited to the syscall 
boundary alone ...

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-26  8:35                                     ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-26  8:35 UTC (permalink / raw)
  To: Pavel Machek
  Cc: linux-mips, linux-sh, Peter Zijlstra, Frederic Weisbecker,
	Heiko Carstens, linux-kernel, David Howells, Paul Mackerras,
	Ralf Baechle, H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390,
	Russell King, x86, James Morris, Ingo Molnar, kees.cook,
	Serge E. Hallyn, Peter Zijlstra, Steven Rostedt,
	Martin Schwidefsky, Thomas Gleixner, agl, linux-arm-kernel,
	Michal Marek, Michal Simek, Will Drewry, linuxppc-dev,
	Oleg Nesterov, Eric Paris, Paul Mundt, Tejun Heo, linux390,
	Andrew Morton, Linus Torvalds, David S. Miller


* Pavel Machek <pavel@ucw.cz> wrote:

>   On Mon 2011-05-16 10:36:05, James Morris wrote:
> > On Fri, 13 May 2011, Ingo Molnar wrote:
> > How do you reason about the behavior of the system as a whole?
> > 
> > 
> > > I argue that this is the LSM and audit subsystems designed right: in the long 
> > > run it could allow everything that LSM does at the moment - and so much more 
> > > ...
> > 
> > Now you're proposing a redesign of the security subsystem.  That's a 
> > significant undertaking.
> > 
> > In the meantime, we have a simple, well-defined enhancement to seccomp 
> > which will be very useful to current users in reducing their kernel attack 
> > surface.
> 
> Well, you can do the same with subterfugue, even without kernel 
> changes. But that's ptrace -- slow. (And it already shows that 
> syscall based filters are extremely tricky to configure).

Yes, if you use syscall based filters to implement access to 
underlying objects where the access methods do not capture essential 
lifetime events properly (such as files) then you'll quickly run into 
trouble achieving a secure solution.

But you can robustly use syscall filters to control the underlying 
primary *resource*: various pieces of kernel code with *negative* 
utility to the current app - which have no use to the app but pose 
risks in terms of potential exploits in them.

But you can use event filters to implement arbitrary security 
policies robustly.

For example file objects: if you generate the right events for a 
class of objects then you can control access to them very robustly.

It's not a surprise that this is what SELinux does primarily: it has 
lifetime event hooks at the inode object (and socket, packet, etc.) 
level and captures those access attempts and validates them against 
the permissions of that object, in light of the accessing task's 
credentials.

Exactly that can be done with Will's patch as well, if its potential 
scope of event-checking points is not stupidly limited to the syscall 
boundary alone ...

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-26  8:35                                     ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-26  8:35 UTC (permalink / raw)
  To: linux-arm-kernel


* Pavel Machek <pavel@ucw.cz> wrote:

>   On Mon 2011-05-16 10:36:05, James Morris wrote:
> > On Fri, 13 May 2011, Ingo Molnar wrote:
> > How do you reason about the behavior of the system as a whole?
> > 
> > 
> > > I argue that this is the LSM and audit subsystems designed right: in the long 
> > > run it could allow everything that LSM does at the moment - and so much more 
> > > ...
> > 
> > Now you're proposing a redesign of the security subsystem.  That's a 
> > significant undertaking.
> > 
> > In the meantime, we have a simple, well-defined enhancement to seccomp 
> > which will be very useful to current users in reducing their kernel attack 
> > surface.
> 
> Well, you can do the same with subterfugue, even without kernel 
> changes. But that's ptrace -- slow. (And it already shows that 
> syscall based filters are extremely tricky to configure).

Yes, if you use syscall based filters to implement access to 
underlying objects where the access methods do not capture essential 
lifetime events properly (such as files) then you'll quickly run into 
trouble achieving a secure solution.

But you can robustly use syscall filters to control the underlying 
primary *resource*: various pieces of kernel code with *negative* 
utility to the current app - which have no use to the app but pose 
risks in terms of potential exploits in them.

But you can use event filters to implement arbitrary security 
policies robustly.

For example file objects: if you generate the right events for a 
class of objects then you can control access to them very robustly.

It's not a surprise that this is what SELinux does primarily: it has 
lifetime event hooks at the inode object (and socket, packet, etc.) 
level and captures those access attempts and validates them against 
the permissions of that object, in light of the accessing task's 
credentials.

Exactly that can be done with Will's patch as well, if its potential 
scope of event-checking points is not stupidly limited to the syscall 
boundary alone ...

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-25 17:48                                                                               ` Thomas Gleixner
  (?)
@ 2011-05-26  8:43                                                                                 ` Ingo Molnar
  -1 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-26  8:43 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Peter Zijlstra, Will Drewry, Steven Rostedt, Frederic Weisbecker,
	James Morris, linux-kernel, Eric Paris, kees.cook, agl,
	Serge E. Hallyn, Ingo Molnar, Andrew Morton, Tejun Heo,
	Michal Marek, Oleg Nesterov, Jiri Slaby, David Howells,
	Russell King, Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, H. Peter Anvin, x86,
	linux-arm-kernel, linux-mips, linuxppc-dev, linux-s390, linux-sh,
	sparclinux, Linus Torvalds


* Thomas Gleixner <tglx@linutronix.de> wrote:

> > > We do _NOT_ make any decision based on the trace point so 
> > > what's the "pre-existing" active role in the syscall entry 
> > > code?
> > 
> > The seccomp code we are discussing in this thread.
> 
> That's proposed code and has absolutely nothing to do with the 
> existing trace point semantics.

So because it's proposed code it does not exist?

If the feature is accepted (and given Linus's opinion it's not clear 
at all it's accepted in any form) then it's obviously a very 
legitimate technical concern whether we do:

	ret = seccomp_check_syscall_event(p1, p2, p3, p4, p5);
	if (ret)
		return -EACCES;

	... random code ...

	trace_syscall_event(p1, p2, p3, p4, p5);

Where seccomp_check_syscall_event() duplicates much of the machinery 
that is behind trace_syscall_event().

Or we do the more intelligent:

	ret = check_syscall_event(p1, p2, p3, p4, p5);
	if (ret)
		return -EACCES;

Where we have the happy side effects of:

  - less code at the call site

  - (a lot of!) shared infrastructure between the proposed seccomp 
    code and event filters.

  - we'd also be able to trace at security check boundaries - which
    has obvious bug analysis advantages.

In fact i do not see *any* advantages in keeping this needlessly 
bloaty and needlessly inconsistently sampled form of instrumentation:

	ret = seccomp_check_syscall_event(p1, p2, p3, p4, p5);
	if (ret)
		return -EACCES;

	... random code ...

	trace_syscall_event(p1, p2, p3, p4, p5);

Do you?

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-26  8:43                                                                                 ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-26  8:43 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: linux-mips, linux-sh, Peter Zijlstra, Frederic Weisbecker,
	Heiko Carstens, Oleg Nesterov, David Howells, Paul Mackerras,
	Eric Paris, H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390,
	Russell King, x86, James Morris, Linus Torvalds, Ingo Molnar,
	kees.cook, Serge E. Hallyn, Steven Rostedt, Martin Schwidefsky,
	linux-arm-kernel, Michal Marek, Michal Simek, Will Drewry,
	linuxppc-dev, linux-kernel, Ralf Baechle, Paul Mundt, Tejun Heo,
	linux390, Andrew Morton, agl, David S. Miller


* Thomas Gleixner <tglx@linutronix.de> wrote:

> > > We do _NOT_ make any decision based on the trace point so 
> > > what's the "pre-existing" active role in the syscall entry 
> > > code?
> > 
> > The seccomp code we are discussing in this thread.
> 
> That's proposed code and has absolutely nothing to do with the 
> existing trace point semantics.

So because it's proposed code it does not exist?

If the feature is accepted (and given Linus's opinion it's not clear 
at all it's accepted in any form) then it's obviously a very 
legitimate technical concern whether we do:

	ret = seccomp_check_syscall_event(p1, p2, p3, p4, p5);
	if (ret)
		return -EACCES;

	... random code ...

	trace_syscall_event(p1, p2, p3, p4, p5);

Where seccomp_check_syscall_event() duplicates much of the machinery 
that is behind trace_syscall_event().

Or we do the more intelligent:

	ret = check_syscall_event(p1, p2, p3, p4, p5);
	if (ret)
		return -EACCES;

Where we have the happy side effects of:

  - less code at the call site

  - (a lot of!) shared infrastructure between the proposed seccomp 
    code and event filters.

  - we'd also be able to trace at security check boundaries - which
    has obvious bug analysis advantages.

In fact i do not see *any* advantages in keeping this needlessly 
bloaty and needlessly inconsistently sampled form of instrumentation:

	ret = seccomp_check_syscall_event(p1, p2, p3, p4, p5);
	if (ret)
		return -EACCES;

	... random code ...

	trace_syscall_event(p1, p2, p3, p4, p5);

Do you?

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-26  8:43                                                                                 ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-26  8:43 UTC (permalink / raw)
  To: linux-arm-kernel


* Thomas Gleixner <tglx@linutronix.de> wrote:

> > > We do _NOT_ make any decision based on the trace point so 
> > > what's the "pre-existing" active role in the syscall entry 
> > > code?
> > 
> > The seccomp code we are discussing in this thread.
> 
> That's proposed code and has absolutely nothing to do with the 
> existing trace point semantics.

So because it's proposed code it does not exist?

If the feature is accepted (and given Linus's opinion it's not clear 
at all it's accepted in any form) then it's obviously a very 
legitimate technical concern whether we do:

	ret = seccomp_check_syscall_event(p1, p2, p3, p4, p5);
	if (ret)
		return -EACCES;

	... random code ...

	trace_syscall_event(p1, p2, p3, p4, p5);

Where seccomp_check_syscall_event() duplicates much of the machinery 
that is behind trace_syscall_event().

Or we do the more intelligent:

	ret = check_syscall_event(p1, p2, p3, p4, p5);
	if (ret)
		return -EACCES;

Where we have the happy side effects of:

  - less code at the call site

  - (a lot of!) shared infrastructure between the proposed seccomp 
    code and event filters.

  - we'd also be able to trace at security check boundaries - which
    has obvious bug analysis advantages.

In fact i do not see *any* advantages in keeping this needlessly 
bloaty and needlessly inconsistently sampled form of instrumentation:

	ret = seccomp_check_syscall_event(p1, p2, p3, p4, p5);
	if (ret)
		return -EACCES;

	... random code ...

	trace_syscall_event(p1, p2, p3, p4, p5);

Do you?

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-26  8:24                                                                                     ` Ingo Molnar
  2011-05-26  8:35                                                                                       ` Pekka Enberg
@ 2011-05-26  8:49                                                                                       ` Avi Kivity
  2011-05-26  8:57                                                                                         ` Pekka Enberg
  2011-05-26  9:30                                                                                         ` Ingo Molnar
  1 sibling, 2 replies; 406+ messages in thread
From: Avi Kivity @ 2011-05-26  8:49 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: James Morris, Linus Torvalds, Kees Cook, Thomas Gleixner,
	Peter Zijlstra, Will Drewry, Steven Rostedt, linux-kernel,
	gnatapov, Chris Wright, Pekka Enberg

On 05/26/2011 11:24 AM, Ingo Molnar wrote:
> So a simple object manager would simply use file label event
> attributes and would define simple rules like:
>
> 	"(label&  tmp_t) || (label&  user_home_t)"

Filtering by label vs. filtering by descriptor would solve qemu's 
hotplug issue neatly.

> Note that tools/kvm/ would probably like to implement its own object
> manager model as well in addition to access method restrictions: by
> being virtual hardware it deals with many resources and object
> hierarchies that are simply not known to the host OS's LSM.
>
> Unlike Qemu tools/kvm/ has a design that is very fit for MAC
> concepts: it uses separate helper threads for separate resources
> (this could in many cases even be changed to be separate processes
> which only share access to the guest RAM image) - while Qemu is in
> most parts a state machine, so in tools/kvm/ we can realistically
> have a good object manager and keep an exploit in a networking
> interface driver from being able to access disk driver state.

You mean each thread will have a different security context?  I don't 
see the point.  All threads share all of memory so it would be trivial 
for one thread to exploit another and gain all of its privileges.

A multi process model works better but it has significant memory and 
performance overhead.

(well the memory overhead is much smaller when using transparent huge 
pages, but these only work for anonymous memory).

-- 
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.


^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-26  8:49                                                                                       ` Avi Kivity
@ 2011-05-26  8:57                                                                                         ` Pekka Enberg
       [not found]                                                                                           ` <20110526085939.GG29458@redhat.com>
  2011-05-26  9:30                                                                                         ` Ingo Molnar
  1 sibling, 1 reply; 406+ messages in thread
From: Pekka Enberg @ 2011-05-26  8:57 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Ingo Molnar, James Morris, Linus Torvalds, Kees Cook,
	Thomas Gleixner, Peter Zijlstra, Will Drewry, Steven Rostedt,
	linux-kernel, gnatapov, Chris Wright, Pekka Enberg

Hi Avi,

On Thu, May 26, 2011 at 11:49 AM, Avi Kivity <avi@redhat.com> wrote:
> You mean each thread will have a different security context?  I don't see
> the point.  All threads share all of memory so it would be trivial for one
> thread to exploit another and gain all of its privileges.

So how would that happen? I'm assuming that once the security context
has been set up for a thread, you're not able to change it after that.
You'd be able to exploit other threads through shared memory but how
would you gain privileges?

                        Pekka

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-25 17:48                                                                               ` Thomas Gleixner
  (?)
@ 2011-05-26  9:15                                                                                 ` Ingo Molnar
  -1 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-26  9:15 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Peter Zijlstra, Will Drewry, Steven Rostedt, Frederic Weisbecker,
	James Morris, linux-kernel, Eric Paris, kees.cook, agl,
	Serge E. Hallyn, Ingo Molnar, Andrew Morton, Tejun Heo,
	Michal Marek, Oleg Nesterov, Jiri Slaby, David Howells,
	Russell King, Michal Simek, Ralf Baechle, Benjamin Herrenschmidt,
	Paul Mackerras, Martin Schwidefsky, Heiko Carstens, linux390,
	Paul Mundt, David S. Miller, H. Peter Anvin, x86,
	linux-arm-kernel, linux-mips, linuxppc-dev, linux-s390, linux-sh,
	sparclinux, Linus Torvalds


* Thomas Gleixner <tglx@linutronix.de> wrote:

> > If anything then that should tell you something that events and 
> > seccomp are not just casually related ...
> 
> They happen to have the hook at the same point in the source and 
> for pure coincidence it works because the problem to solve is 
> extremely simplistic. And that's why the diffstat is minimalistic, 
> but that does not prove anything.

Here are the diffstats of the various versions of this proposed 
security feature:

       bitmask (2009):  6 files changed,  194 insertions(+), 22 deletions(-)
 filter engine (2010): 18 files changed, 1100 insertions(+), 21 deletions(-)
 event filters (2011):  5 files changed,   82 insertions(+), 16 deletions(-)

The third variant, 'event filters', is actually the most 
sophisticated one of all and it is not simplistic at all.

The main reason why the diffstat is small is because it reuses over 
ten thousand lines of pre-existing kernel code intelligently. Are you 
interpreting that as some sort of failure of the patch? I think it's 
a very good thing.

To demonstrate the non-simplicity of the feature:

 - These security rules/filters can be sophisticated like:

   sys_close() rule protecting against the closing of 
   stdin/stdout/stderr:

                  "fd == 0 || fd == 1 || fd == 2"

   sys_ioperm() rule allowing port 0x80 access but nothing else:

                  "from != 128 || num != 1"

   sys_listen() rule limiting the max accept() backlog to 16 entries:

                  "backlog > 16"

   sys_mprotect(), sys_mmap[2](), sys_unmap() and sys_mremap() rule
   protecting the first 1 MB NULL pointer guard range:

                  "addr < 0x00100000"

   sys_setscheduler() rule protecting against the switch to 
   non-SCHED_OTHER scheduler policies:

                  "policy != 0"

   Most of these examples are fine-grained access restrictions that 
   AFAIK are not possible with any of the LSM based security measures 
   that Linux offers today.

 - These security rules/filters can be safely used and installed by 
   unprivileged userspace, allowing arbitrary end user apps to define 
   their own, flexible security policies.

 - These security rules/filters get automatically inherited into child 
   tasks and child tasks cannot mess with them - they cannot even 
   query/observe that these filters *exist*.

 - These security rules/filters nest on each other in basically 
   arbitrary depth, giving us a working, implemented, stackable LSM
   concept.

 - These security rules/filters can be extended to arbitrary more 
   object lifetime events in the future, without changing the ABI.

 - These security rules/filters, unlike most LSM rules, can execute
   not just within hardirqs but also within deeply atomic contexts
   such as NMI contexts, putting far less restrictions on what can
   be security/access checked.

 - Access permission violations can be set up to generate events of
   the violations into a scalable ring-buffer, providing unprivileged
   security-auditing functionality to the managing task(s).

I'd call that anything but 'simplistic'.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-26  9:15                                                                                 ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-26  9:15 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: linux-mips, linux-sh, Peter Zijlstra, Frederic Weisbecker,
	Heiko Carstens, Oleg Nesterov, David Howells, Paul Mackerras,
	Eric Paris, H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390,
	Russell King, x86, James Morris, Linus Torvalds, Ingo Molnar,
	kees.cook, Serge E. Hallyn, Steven Rostedt, Martin Schwidefsky,
	linux-arm-kernel, Michal Marek, Michal Simek, Will Drewry,
	linuxppc-dev, linux-kernel, Ralf Baechle, Paul Mundt, Tejun Heo,
	linux390, Andrew Morton, agl, David S. Miller


* Thomas Gleixner <tglx@linutronix.de> wrote:

> > If anything then that should tell you something that events and 
> > seccomp are not just casually related ...
> 
> They happen to have the hook at the same point in the source and 
> for pure coincidence it works because the problem to solve is 
> extremely simplistic. And that's why the diffstat is minimalistic, 
> but that does not prove anything.

Here are the diffstats of the various versions of this proposed 
security feature:

       bitmask (2009):  6 files changed,  194 insertions(+), 22 deletions(-)
 filter engine (2010): 18 files changed, 1100 insertions(+), 21 deletions(-)
 event filters (2011):  5 files changed,   82 insertions(+), 16 deletions(-)

The third variant, 'event filters', is actually the most 
sophisticated one of all and it is not simplistic at all.

The main reason why the diffstat is small is because it reuses over 
ten thousand lines of pre-existing kernel code intelligently. Are you 
interpreting that as some sort of failure of the patch? I think it's 
a very good thing.

To demonstrate the non-simplicity of the feature:

 - These security rules/filters can be sophisticated like:

   sys_close() rule protecting against the closing of 
   stdin/stdout/stderr:

                  "fd == 0 || fd == 1 || fd == 2"

   sys_ioperm() rule allowing port 0x80 access but nothing else:

                  "from != 128 || num != 1"

   sys_listen() rule limiting the max accept() backlog to 16 entries:

                  "backlog > 16"

   sys_mprotect(), sys_mmap[2](), sys_unmap() and sys_mremap() rule
   protecting the first 1 MB NULL pointer guard range:

                  "addr < 0x00100000"

   sys_setscheduler() rule protecting against the switch to 
   non-SCHED_OTHER scheduler policies:

                  "policy != 0"

   Most of these examples are fine-grained access restrictions that 
   AFAIK are not possible with any of the LSM based security measures 
   that Linux offers today.

 - These security rules/filters can be safely used and installed by 
   unprivileged userspace, allowing arbitrary end user apps to define 
   their own, flexible security policies.

 - These security rules/filters get automatically inherited into child 
   tasks and child tasks cannot mess with them - they cannot even 
   query/observe that these filters *exist*.

 - These security rules/filters nest on each other in basically 
   arbitrary depth, giving us a working, implemented, stackable LSM
   concept.

 - These security rules/filters can be extended to arbitrary more 
   object lifetime events in the future, without changing the ABI.

 - These security rules/filters, unlike most LSM rules, can execute
   not just within hardirqs but also within deeply atomic contexts
   such as NMI contexts, putting far less restrictions on what can
   be security/access checked.

 - Access permission violations can be set up to generate events of
   the violations into a scalable ring-buffer, providing unprivileged
   security-auditing functionality to the managing task(s).

I'd call that anything but 'simplistic'.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-26  9:15                                                                                 ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-26  9:15 UTC (permalink / raw)
  To: linux-arm-kernel


* Thomas Gleixner <tglx@linutronix.de> wrote:

> > If anything then that should tell you something that events and 
> > seccomp are not just casually related ...
> 
> They happen to have the hook at the same point in the source and 
> for pure coincidence it works because the problem to solve is 
> extremely simplistic. And that's why the diffstat is minimalistic, 
> but that does not prove anything.

Here are the diffstats of the various versions of this proposed 
security feature:

       bitmask (2009):  6 files changed,  194 insertions(+), 22 deletions(-)
 filter engine (2010): 18 files changed, 1100 insertions(+), 21 deletions(-)
 event filters (2011):  5 files changed,   82 insertions(+), 16 deletions(-)

The third variant, 'event filters', is actually the most 
sophisticated one of all and it is not simplistic at all.

The main reason why the diffstat is small is because it reuses over 
ten thousand lines of pre-existing kernel code intelligently. Are you 
interpreting that as some sort of failure of the patch? I think it's 
a very good thing.

To demonstrate the non-simplicity of the feature:

 - These security rules/filters can be sophisticated like:

   sys_close() rule protecting against the closing of 
   stdin/stdout/stderr:

                  "fd == 0 || fd == 1 || fd == 2"

   sys_ioperm() rule allowing port 0x80 access but nothing else:

                  "from != 128 || num != 1"

   sys_listen() rule limiting the max accept() backlog to 16 entries:

                  "backlog > 16"

   sys_mprotect(), sys_mmap[2](), sys_unmap() and sys_mremap() rule
   protecting the first 1 MB NULL pointer guard range:

                  "addr < 0x00100000"

   sys_setscheduler() rule protecting against the switch to 
   non-SCHED_OTHER scheduler policies:

                  "policy != 0"

   Most of these examples are fine-grained access restrictions that 
   AFAIK are not possible with any of the LSM based security measures 
   that Linux offers today.

 - These security rules/filters can be safely used and installed by 
   unprivileged userspace, allowing arbitrary end user apps to define 
   their own, flexible security policies.

 - These security rules/filters get automatically inherited into child 
   tasks and child tasks cannot mess with them - they cannot even 
   query/observe that these filters *exist*.

 - These security rules/filters nest on each other in basically 
   arbitrary depth, giving us a working, implemented, stackable LSM
   concept.

 - These security rules/filters can be extended to arbitrary more 
   object lifetime events in the future, without changing the ABI.

 - These security rules/filters, unlike most LSM rules, can execute
   not just within hardirqs but also within deeply atomic contexts
   such as NMI contexts, putting far less restrictions on what can
   be security/access checked.

 - Access permission violations can be set up to generate events of
   the violations into a scalable ring-buffer, providing unprivileged
   security-auditing functionality to the managing task(s).

I'd call that anything but 'simplistic'.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-26  8:49                                                                                       ` Avi Kivity
  2011-05-26  8:57                                                                                         ` Pekka Enberg
@ 2011-05-26  9:30                                                                                         ` Ingo Molnar
  2011-05-26  9:48                                                                                           ` Ingo Molnar
  2011-05-26 10:56                                                                                           ` Avi Kivity
  1 sibling, 2 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-26  9:30 UTC (permalink / raw)
  To: Avi Kivity
  Cc: James Morris, Linus Torvalds, Kees Cook, Thomas Gleixner,
	Peter Zijlstra, Will Drewry, Steven Rostedt, linux-kernel,
	gnatapov, Chris Wright, Pekka Enberg


* Avi Kivity <avi@redhat.com> wrote:

> > Note that tools/kvm/ would probably like to implement its own 
> > object manager model as well in addition to access method 
> > restrictions: by being virtual hardware it deals with many 
> > resources and object hierarchies that are simply not known to the 
> > host OS's LSM.
> >
> > Unlike Qemu tools/kvm/ has a design that is very fit for MAC 
> > concepts: it uses separate helper threads for separate resources 
> > (this could in many cases even be changed to be separate 
> > processes which only share access to the guest RAM image) - while 
> > Qemu is in most parts a state machine, so in tools/kvm/ we can 
> > realistically have a good object manager and keep an exploit in a 
> > networking interface driver from being able to access disk driver 
> > state.
> 
> You mean each thread will have a different security context?  I 
> don't see the point.  All threads share all of memory so it would 
> be trivial for one thread to exploit another and gain all of its 
> privileges.

You are missing the geniality of the tools/kvm/ thread pool! :-)

It could be switched to a worker *process* model rather easily. Guest 
RAM and (a limited amount of) global resources would be shared via 
mmap(SHARED), but otherwise each worker process would have its own 
stack, its own subsystem-specific state, etc.

Exploiting other device domains via the shared guest RAM image is not 
possible, we treat guest RAM as untrusted data already.

Devices, like real hardware devices, are functionally pretty 
independent from each other, so this security model is rather natural 
and makes a lot of sense.

> A multi process model works better but it has significant memory 
> and performance overhead.

Not in Linux :-) We context-switch between processes almost as 
quickly as we do between threads. With modern tagged TLB hardware 
it's even faster.

> (well the memory overhead is much smaller when using transparent 
> huge pages, but these only work for anonymous memory).

The biggest amount of RAM is the guest RAM image - but if that is 
mmap(SHARED) and mapped using hugepages then the pte overhead from a 
process model is largely mitigated.

Once we have a process model then isolation and MAC between devices 
becomes a very real possibility: exploit via one network interface 
cannot break into a disk interface.

Maybe even the isolation and per device access control of 
*same-class* devices from each other is possible: with careful 
implementation of the subsystem shared data structures. (which isn't 
much really)

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-26  9:30                                                                                         ` Ingo Molnar
@ 2011-05-26  9:48                                                                                           ` Ingo Molnar
  2011-05-26 11:02                                                                                             ` Avi Kivity
  2011-05-26 10:56                                                                                           ` Avi Kivity
  1 sibling, 1 reply; 406+ messages in thread
From: Ingo Molnar @ 2011-05-26  9:48 UTC (permalink / raw)
  To: Avi Kivity
  Cc: James Morris, Linus Torvalds, Kees Cook, Thomas Gleixner,
	Peter Zijlstra, Will Drewry, Steven Rostedt, linux-kernel,
	gnatapov, Chris Wright, Pekka Enberg


* Ingo Molnar <mingo@elte.hu> wrote:

> You are missing the geniality of the tools/kvm/ thread pool! :-)
> 
> It could be switched to a worker *process* model rather easily. 
> Guest RAM and (a limited amount of) global resources would be 
> shared via mmap(SHARED), but otherwise each worker process would 
> have its own stack, its own subsystem-specific state, etc.

We get VM exit events in the vcpu threads which after minimal 
processing pass much of the work to the thread pool. Most of the 
virtio work (which could be a source of vulnerability - ringbuffers 
are hard) is done in the worker task context.

It would be possible to further increase isolation there by also 
passing the IO/MMIO decoding to the worker thread - but i'm not sure 
that's truly needed. Most of the risk is where most of the code is - 
and the code is in the worker task which interprets on-disk data, 
protocols, etc.

So we could not only isolate devices from each other, but we could 
also protect the highly capable vcpu fd from exploits in devices - 
worker threads generally do not need access to the vcpu fd IIRC.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
       [not found]                                                                                           ` <20110526085939.GG29458@redhat.com>
@ 2011-05-26 10:38                                                                                             ` Ingo Molnar
  2011-05-26 10:46                                                                                               ` Avi Kivity
  2011-05-26 10:46                                                                                               ` Gleb Natapov
  0 siblings, 2 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-26 10:38 UTC (permalink / raw)
  To: Gleb Natapov
  Cc: Pekka Enberg, Avi Kivity, James Morris, Linus Torvalds,
	Kees Cook, Thomas Gleixner, Peter Zijlstra, Will Drewry,
	Steven Rostedt, linux-kernel, Chris Wright, Pekka Enberg


* Gleb Natapov <gleb@redhat.com> wrote:

> On Thu, May 26, 2011 at 11:57:51AM +0300, Pekka Enberg wrote:
> > Hi Avi,
> > 
> > On Thu, May 26, 2011 at 11:49 AM, Avi Kivity <avi@redhat.com> wrote:
> >
> > > You mean each thread will have a different security context?  I 
> > > don't see the point.  All threads share all of memory so it 
> > > would be trivial for one thread to exploit another and gain all 
> > > of its privileges.
> > 
> > So how would that happen? I'm assuming that once the security 
> > context has been set up for a thread, you're not able to change 
> > it after that. You'd be able to exploit other threads through 
> > shared memory but how would you gain privileges?
> 
> By tricking other threads to execute code for you. Just replace 
> return address on the other's thread stack.

That kind of exploit is not possible if the worker pool consists of 
processes - which would be rather easy to achieve with tools/kvm/.

In that model each process has its own stack, not accessible to other 
worker processes. They'd only share the guest RAM image and some 
(minimal) global state.

This way the individual devices are (optionally) isolated from each 
other. In a way this is a microkernel done right ;-)

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-26 10:38                                                                                             ` Ingo Molnar
@ 2011-05-26 10:46                                                                                               ` Avi Kivity
  2011-05-26 10:46                                                                                               ` Gleb Natapov
  1 sibling, 0 replies; 406+ messages in thread
From: Avi Kivity @ 2011-05-26 10:46 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Gleb Natapov, Pekka Enberg, James Morris, Linus Torvalds,
	Kees Cook, Thomas Gleixner, Peter Zijlstra, Will Drewry,
	Steven Rostedt, linux-kernel, Chris Wright, Pekka Enberg

On 05/26/2011 01:38 PM, Ingo Molnar wrote:
> * Gleb Natapov<gleb@redhat.com>  wrote:
>
> >  On Thu, May 26, 2011 at 11:57:51AM +0300, Pekka Enberg wrote:
> >  >  Hi Avi,
> >  >
> >  >  On Thu, May 26, 2011 at 11:49 AM, Avi Kivity<avi@redhat.com>  wrote:
> >  >
> >  >  >  You mean each thread will have a different security context?  I
> >  >  >  don't see the point.  All threads share all of memory so it
> >  >  >  would be trivial for one thread to exploit another and gain all
> >  >  >  of its privileges.
> >  >
> >  >  So how would that happen? I'm assuming that once the security
> >  >  context has been set up for a thread, you're not able to change
> >  >  it after that. You'd be able to exploit other threads through
> >  >  shared memory but how would you gain privileges?
> >
> >  By tricking other threads to execute code for you. Just replace
> >  return address on the other's thread stack.
>
> That kind of exploit is not possible if the worker pool consists of
> processes - which would be rather easy to achieve with tools/kvm/.
>
> In that model each process has its own stack, not accessible to other
> worker processes. They'd only share the guest RAM image and some
> (minimal) global state.
>
> This way the individual devices are (optionally) isolated from each
> other. In a way this is a microkernel done right ;-)

It's really hard to achieve, since devices have global interactions.  
For example a PCI device can change the memory layout when a BAR is 
programmed.  So you would have a lot of message passing going on (not at 
runtime, so no huge impact on performance).  The programming model is 
very different.

Note that message passing is in fact quite a good way to model hardware, 
since what different devices actually do is pass messages to each 
other.  I expect if done this way, the device model would be better than 
what we have today.  But it's not an easy step away from threads.

-- 
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.


^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-26 10:38                                                                                             ` Ingo Molnar
  2011-05-26 10:46                                                                                               ` Avi Kivity
@ 2011-05-26 10:46                                                                                               ` Gleb Natapov
  2011-05-26 11:11                                                                                                 ` Ingo Molnar
  1 sibling, 1 reply; 406+ messages in thread
From: Gleb Natapov @ 2011-05-26 10:46 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Pekka Enberg, Avi Kivity, James Morris, Linus Torvalds,
	Kees Cook, Thomas Gleixner, Peter Zijlstra, Will Drewry,
	Steven Rostedt, linux-kernel, Chris Wright, Pekka Enberg

On Thu, May 26, 2011 at 12:38:36PM +0200, Ingo Molnar wrote:
> 
> * Gleb Natapov <gleb@redhat.com> wrote:
> 
> > On Thu, May 26, 2011 at 11:57:51AM +0300, Pekka Enberg wrote:
> > > Hi Avi,
> > > 
> > > On Thu, May 26, 2011 at 11:49 AM, Avi Kivity <avi@redhat.com> wrote:
> > >
> > > > You mean each thread will have a different security context?  I 
> > > > don't see the point.  All threads share all of memory so it 
> > > > would be trivial for one thread to exploit another and gain all 
> > > > of its privileges.
> > > 
> > > So how would that happen? I'm assuming that once the security 
> > > context has been set up for a thread, you're not able to change 
> > > it after that. You'd be able to exploit other threads through 
> > > shared memory but how would you gain privileges?
> > 
> > By tricking other threads to execute code for you. Just replace 
> > return address on the other's thread stack.
> 
> That kind of exploit is not possible if the worker pool consists of 
> processes - which would be rather easy to achieve with tools/kvm/.
> 
Well, of course. The original question was about threads.

> In that model each process has its own stack, not accessible to other 
> worker processes. They'd only share the guest RAM image and some 
> (minimal) global state.
> 
> This way the individual devices are (optionally) isolated from each 
> other. In a way this is a microkernel done right ;-)
> 
But doesn't this design suffer the same problem as microkernel? Namely
a lot of slow IPCs?

--
			Gleb.

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-26  9:30                                                                                         ` Ingo Molnar
  2011-05-26  9:48                                                                                           ` Ingo Molnar
@ 2011-05-26 10:56                                                                                           ` Avi Kivity
  2011-05-26 11:38                                                                                             ` Ingo Molnar
  1 sibling, 1 reply; 406+ messages in thread
From: Avi Kivity @ 2011-05-26 10:56 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: James Morris, Linus Torvalds, Kees Cook, Thomas Gleixner,
	Peter Zijlstra, Will Drewry, Steven Rostedt, linux-kernel,
	gnatapov, Chris Wright, Pekka Enberg

On 05/26/2011 12:30 PM, Ingo Molnar wrote:
> * Avi Kivity<avi@redhat.com>  wrote:
>
> >  >  Note that tools/kvm/ would probably like to implement its own
> >  >  object manager model as well in addition to access method
> >  >  restrictions: by being virtual hardware it deals with many
> >  >  resources and object hierarchies that are simply not known to the
> >  >  host OS's LSM.
> >  >
> >  >  Unlike Qemu tools/kvm/ has a design that is very fit for MAC
> >  >  concepts: it uses separate helper threads for separate resources
> >  >  (this could in many cases even be changed to be separate
> >  >  processes which only share access to the guest RAM image) - while
> >  >  Qemu is in most parts a state machine, so in tools/kvm/ we can
> >  >  realistically have a good object manager and keep an exploit in a
> >  >  networking interface driver from being able to access disk driver
> >  >  state.
> >
> >  You mean each thread will have a different security context?  I
> >  don't see the point.  All threads share all of memory so it would
> >  be trivial for one thread to exploit another and gain all of its
> >  privileges.
>
> You are missing the geniality of the tools/kvm/ thread pool! :-)

I'm sure the thread pool is very general, but the hardware we're 
modelling is not.

> It could be switched to a worker *process* model rather easily. Guest
> RAM and (a limited amount of) global resources would be shared via
> mmap(SHARED), but otherwise each worker process would have its own
> stack, its own subsystem-specific state, etc.

Suppose a guest reconfigures a device's MSI page, and suppose that's 
handled by the device's process.  Now it's not sufficient to update some 
global state, you have to go and tell the host kernel about it.  With 
good privilege separation the device process would not be permitted to 
do that; now it has to pass a message to a process that is.

Same thing applies for BARs, reset signals, live migration, etc.

> Exploiting other device domains via the shared guest RAM image is not
> possible, we treat guest RAM as untrusted data already.

Right.

> Devices, like real hardware devices, are functionally pretty
> independent from each other, so this security model is rather natural
> and makes a lot of sense.

When just pushing packets, you are right.  However setup/configuration 
is hardly clean.

Consider a CD-ROM eject, for example.  Now it can't be done by a simple 
callback.

> >  A multi process model works better but it has significant memory
> >  and performance overhead.
>
> Not in Linux :-) We context-switch between processes almost as
> quickly as we do between threads. With modern tagged TLB hardware
> it's even faster.

Once we get PCID in, yes.  There's still the message passing overhead, 
and unnecessary context switches.  In a threaded model you can choose 
whether to switch threads or not, in a process model you cannot.

> >  (well the memory overhead is much smaller when using transparent
> >  huge pages, but these only work for anonymous memory).
>
> The biggest amount of RAM is the guest RAM image - but if that is
> mmap(SHARED) and mapped using hugepages then the pte overhead from a
> process model is largely mitigated.

That doesn't work with memory hotplug.

> Once we have a process model then isolation and MAC between devices
> becomes a very real possibility: exploit via one network interface
> cannot break into a disk interface.

Yes, certainly.

> Maybe even the isolation and per device access control of
> *same-class* devices from each other is possible: with careful
> implementation of the subsystem shared data structures. (which isn't
> much really)

Right, hardly at all in fact.  The problem comes from the side-band 
issues like reset, interrupts, hotplug, and whatnot.

-- 
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.


^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-26  9:48                                                                                           ` Ingo Molnar
@ 2011-05-26 11:02                                                                                             ` Avi Kivity
  2011-05-26 11:16                                                                                               ` Ingo Molnar
  0 siblings, 1 reply; 406+ messages in thread
From: Avi Kivity @ 2011-05-26 11:02 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: James Morris, Linus Torvalds, Kees Cook, Thomas Gleixner,
	Peter Zijlstra, Will Drewry, Steven Rostedt, linux-kernel,
	gnatapov, Chris Wright, Pekka Enberg

On 05/26/2011 12:48 PM, Ingo Molnar wrote:
> * Ingo Molnar<mingo@elte.hu>  wrote:
>
> >  You are missing the geniality of the tools/kvm/ thread pool! :-)
> >
> >  It could be switched to a worker *process* model rather easily.
> >  Guest RAM and (a limited amount of) global resources would be
> >  shared via mmap(SHARED), but otherwise each worker process would
> >  have its own stack, its own subsystem-specific state, etc.
>
> We get VM exit events in the vcpu threads which after minimal
> processing pass much of the work to the thread pool. Most of the
> virtio work (which could be a source of vulnerability - ringbuffers
> are hard) is done in the worker task context.
>
> It would be possible to further increase isolation there by also
> passing the IO/MMIO decoding to the worker thread - but i'm not sure
> that's truly needed. Most of the risk is where most of the code is -
> and the code is in the worker task which interprets on-disk data,
> protocols, etc.

I've suggested in the past to add an "mmiofd" facility to kvm, similar 
to ioeventfd.  This is how it would work:

- userspace configures kvm with an mmio range and a pipe
- guest writes to that range write a packet to the pipe describing the write
- guest reads from that range write a packet to the pipe describing the 
read, then wait for a reply packet with the result

The advantages would be
- avoid heavyweight exit; kvm can simply wake up a thread on another 
core and resume processing
- writes can be pipelined, similar to how PCI writes are posted
- supports process separation

So far no one has posted an implementation but it should be pretty simple.

> So we could not only isolate devices from each other, but we could
> also protect the highly capable vcpu fd from exploits in devices -
> worker threads generally do not need access to the vcpu fd IIRC.

Yes.

-- 
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.


^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-26 10:46                                                                                               ` Gleb Natapov
@ 2011-05-26 11:11                                                                                                 ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-26 11:11 UTC (permalink / raw)
  To: Gleb Natapov
  Cc: Pekka Enberg, Avi Kivity, James Morris, Linus Torvalds,
	Kees Cook, Thomas Gleixner, Peter Zijlstra, Will Drewry,
	Steven Rostedt, linux-kernel, Chris Wright, Pekka Enberg


* Gleb Natapov <gleb@redhat.com> wrote:

> > In that model each process has its own stack, not accessible to 
> > other worker processes. They'd only share the guest RAM image and 
> > some (minimal) global state.
> > 
> > This way the individual devices are (optionally) isolated from 
> > each other. In a way this is a microkernel done right ;-)
> 
> But doesn't this design suffer the same problem as microkernel? 
> Namely a lot of slow IPCs?

Most of the IPCs we do already, to keep the devices separated from 
each other. So the most common type of IPC comes 'for free' in that 
model - and this is specific to virtualization so i'd not extend the 
claim to the host kernel.

virtio is an IPC mechanism to begin with.

It's certainly not entirely free though so if this is implemented in 
tools/kvm/ it should be configurable.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-26 11:02                                                                                             ` Avi Kivity
@ 2011-05-26 11:16                                                                                               ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-26 11:16 UTC (permalink / raw)
  To: Avi Kivity
  Cc: James Morris, Linus Torvalds, Kees Cook, Thomas Gleixner,
	Peter Zijlstra, Will Drewry, Steven Rostedt, linux-kernel,
	gnatapov, Chris Wright, Pekka Enberg


* Avi Kivity <avi@redhat.com> wrote:

> > It would be possible to further increase isolation there by also 
> > passing the IO/MMIO decoding to the worker thread - but i'm not 
> > sure that's truly needed. Most of the risk is where most of the 
> > code is - and the code is in the worker task which interprets 
> > on-disk data, protocols, etc.
> 
> I've suggested in the past to add an "mmiofd" facility to kvm, 
> similar to ioeventfd.  This is how it would work:
> 
> - userspace configures kvm with an mmio range and a pipe
> - guest writes to that range write a packet to the pipe describing the write
> - guest reads from that range write a packet to the pipe describing
> the read, then wait for a reply packet with the result
> 
> The advantages would be
> - avoid heavyweight exit; kvm can simply wake up a thread on another
> core and resume processing
> - writes can be pipelined, similar to how PCI writes are posted
> - supports process separation

Yes, that was my exact thought, a per transport channel fd.

> So far no one has posted an implementation but it should be pretty 
> simple.

tools/kvm/ could make quick use of it - and it's a performance 
optimization mainly IMO, not primarily a security feature.

If you whip up a quick untested prototype for the KVM side we could 
look into adding tooling support for it and could test it.

As long as it's provided as an opt-in ioctl() which, if it fails (on 
older kernels) we fall back to the vcpu-fd, it should be relatively 
straightforward to support on the tooling side as well AFAICS.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-26 10:56                                                                                           ` Avi Kivity
@ 2011-05-26 11:38                                                                                             ` Ingo Molnar
  2011-05-26 18:06                                                                                               ` Avi Kivity
  0 siblings, 1 reply; 406+ messages in thread
From: Ingo Molnar @ 2011-05-26 11:38 UTC (permalink / raw)
  To: Avi Kivity
  Cc: James Morris, Linus Torvalds, Kees Cook, Thomas Gleixner,
	Peter Zijlstra, Will Drewry, Steven Rostedt, linux-kernel,
	gnatapov, Chris Wright, Pekka Enberg


* Avi Kivity <avi@redhat.com> wrote:

> > The biggest amount of RAM is the guest RAM image - but if that is 
> > mmap(SHARED) and mapped using hugepages then the pte overhead 
> > from a process model is largely mitigated.
> 
> That doesn't work with memory hotplug.

Why not, if we do the sensible thing and restrict the size 
granularity and alignment of plugged/unplugged memory regions to 2MB?

We can fix guest Linux as well to not be stupid about the sizing of 
memory hotplug requests. It does hotplug based on the memory map we 
pass to it anyway.

Am i missing something obvious here?

> > Maybe even the isolation and per device access control of 
> > *same-class* devices from each other is possible: with careful 
> > implementation of the subsystem shared data structures. (which 
> > isn't much really)
> 
> Right, hardly at all in fact.  The problem comes from the side-band 
> issues like reset, interrupts, hotplug, and whatnot.

Yeah. There are two good aspects here i think:

 - The sideband IPC overhead does not matter much, it's a side band.

 - Spending the effort to isolate configuration details is worth it: 
   sideband code is a primary breeding ground for bugs and security 
   holes.

The main worry to me would be the maintainability difference: does it 
result in much more complex code? As always i'm cautiously optimistic 
about that: i think once we try it we can find a suitable model ... 
It might even turn out to be more readable and more flexible in the 
end.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-25 20:01                                                                                     ` Linus Torvalds
  2011-05-25 20:19                                                                                       ` Ingo Molnar
@ 2011-05-26 14:37                                                                                       ` Colin Walters
  2011-05-26 15:03                                                                                         ` Linus Torvalds
  1 sibling, 1 reply; 406+ messages in thread
From: Colin Walters @ 2011-05-26 14:37 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Kees Cook, Thomas Gleixner, Ingo Molnar, Peter Zijlstra,
	Will Drewry, Steven Rostedt, linux-kernel

On Wed, May 25, 2011 at 4:01 PM, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> As to your example of apache modules - last I saw, most of those were
> written in high-level scripting languages that almost invariably end
> up using quite a bit of the system call interfaces. And more
> importantly, almost nobody does unportable code.

Well, there's a difference between frameworks and applications.  At
least in GNOME e.g. we've been generally good at picking up and
transparently taking advantage of Linux-specific stuff where possible
like splice() in g_file_copy(), and in this cycle I'll probably end up
using signalfd() to fix a long standing race condition.

> So hey, I'm willing to be convinced. But I'll need more than people
> _saying_ that they'd be interested. Because judging by past
> performance, nobody ever uses esoteric cool new features.

I'm curious which features you feel are esoteric and cool but unused?

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-26 14:37                                                                                       ` Colin Walters
@ 2011-05-26 15:03                                                                                         ` Linus Torvalds
  2011-05-26 15:28                                                                                           ` Colin Walters
  2011-05-26 16:33                                                                                           ` Will Drewry
  0 siblings, 2 replies; 406+ messages in thread
From: Linus Torvalds @ 2011-05-26 15:03 UTC (permalink / raw)
  To: Colin Walters
  Cc: Kees Cook, Thomas Gleixner, Ingo Molnar, Peter Zijlstra,
	Will Drewry, Steven Rostedt, linux-kernel

On Thu, May 26, 2011 at 7:37 AM, Colin Walters <walters@verbum.org> wrote:
>
> I'm curious which features you feel are esoteric and cool but unused?

Just about anything linux-specific. Ranging from the totally new
concepts (epoll/clone/splice/signalfd) to just simple cleanups and
extensions of reasonably standard stuff (sync_file_range/sendpage).

Sure, there's almost always *somebody* who uses them, but they are
seldom actually worth it.

The one thing that works well is when you expose it as a standard
interface. So futexes are linux-specific, but they are exposed as the
standard pthreads condition variables etc to apps - very few actually
use them as futexes. But because glibc uses them for the pthreads
synchronization, I think they ended up being used inside glibc for
low-level stuff too, so I think futexes ended up being an unqualified
success - much better than the standard interface.

The "it can be used in standard libraries" ends up being a very
powerful thing. It doesn't have to be libc - if something like a glib
or a big graphical interface uses them, they can get very popular. But
if you have to have actual config options (autoconf or similar) to
enable the feature on Linux, along with a compatibility case (because
older kernels don't even support it, so it's not even "linux", it's
"linux newer than xyz"), then very very few applications end up using
it.

And security issues in particular are often *very* subtle. For
example, something like a system call filter sounds like an obviously
safe thing: it can only limit what you do, right?

Except no, not right at all. Imagine that you're limiting a suid
application, and the one operation you limit is "setuid()". Imagine
that the suid application explicitly drops privileges in order to run
safely as the user. Imagine, further, that it doesn't even check the
return value, because it *knows* that if it is root, it will succeed,
and if it isn't root, then it wasn't suid to begin with and doesn't
need to do anything about it.

Unlikely? Hell no. That's standard practice. And if you allow filter
setup that survives fork+exec, you just opened a HUGE security hole.

Fixable? Yes, easily. And I haven't looked at the current patches, but
I would not be AT ALL surprised if they had exactly the above huge
security hole.

My point being that (a) I'm very dubious about new non-standard
features, because historically they seldom get used very widely and
(b) I'm doubly dubious about security things because it turns out it's
damn easy to get it wrong in all kinds of small subtle details.

                            Linus

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-26 15:03                                                                                         ` Linus Torvalds
@ 2011-05-26 15:28                                                                                           ` Colin Walters
  2011-05-26 16:33                                                                                           ` Will Drewry
  1 sibling, 0 replies; 406+ messages in thread
From: Colin Walters @ 2011-05-26 15:28 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Kees Cook, Thomas Gleixner, Ingo Molnar, Peter Zijlstra,
	Will Drewry, Steven Rostedt, linux-kernel

On Thu, May 26, 2011 at 11:03 AM, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
> On Thu, May 26, 2011 at 7:37 AM, Colin Walters <walters@verbum.org> wrote:
>>
>> I'm curious which features you feel are esoteric and cool but unused?
>
> Just about anything linux-specific. Ranging from the totally new
> concepts (epoll/clone/splice/signalfd) to just simple cleanups and
> extensions of reasonably standard stuff (sync_file_range/sendpage).

epoll's very widely used via frameworks I'd say; at least the Apache
runtime uses it, libevent does, and apparently the Sun JDK does:
http://www.google.com/codesearch/p?hl=en#ih5hvYJNSIA/src/solaris/classes/sun/nio/ch/EPollPort.java&q=epoll&sa=N&cd=32&ct=rc
And here's an entry on that: http://blogs.oracle.com/alanb/entry/epoll

(Why doesn't glib?  It's hard since the priority design was kind of a
mistake: https://bugzilla.gnome.org/show_bug.cgi?id=156048 )

> The "it can be used in standard libraries" ends up being a very
> powerful thing. It doesn't have to be libc - if something like a glib
> or a big graphical interface uses them, they can get very popular.

Right, that's the distinction I was trying to make.

> But
> if you have to have actual config options (autoconf or similar) to
> enable the feature on Linux, along with a compatibility case (because
> older kernels don't even support it, so it's not even "linux", it's
> "linux newer than xyz"), then very very few applications end up using
> it.

From my experience as a framework developer, it hasn't been hard at
all to keep track of new Linux features; we talk about them a lot =)
The fallback code is often obvious, like for splice(), though for
signalfd it's going to be much messier to keep around the legacy
helper thread case.

> Unlikely? Hell no. That's standard practice. And if you allow filter
> setup that survives fork+exec, you just opened a HUGE security hole.

Oh definitely, setuid and process inheritance has been a source of
many problems over the years, and I agree it'd be very dangerous for
the syscall filters to stay open across execve.  Of course in practice
glibc secure mode exists to mitigate these things too; it could abort
if one was in place in that case.

But I was more curious about your views on Linux-specific interfaces,
and you answered that; thanks!

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-26 15:03                                                                                         ` Linus Torvalds
  2011-05-26 15:28                                                                                           ` Colin Walters
@ 2011-05-26 16:33                                                                                           ` Will Drewry
  2011-05-26 16:46                                                                                             ` Linus Torvalds
  1 sibling, 1 reply; 406+ messages in thread
From: Will Drewry @ 2011-05-26 16:33 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Colin Walters, Kees Cook, Thomas Gleixner, Ingo Molnar,
	Peter Zijlstra, Steven Rostedt, linux-kernel, James Morris

On Thu, May 26, 2011 at 10:03 AM, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
> On Thu, May 26, 2011 at 7:37 AM, Colin Walters <walters@verbum.org> wrote:
>>
>> I'm curious which features you feel are esoteric and cool but unused?
>
> Just about anything linux-specific. Ranging from the totally new
> concepts (epoll/clone/splice/signalfd) to just simple cleanups and
> extensions of reasonably standard stuff (sync_file_range/sendpage).
>
> Sure, there's almost always *somebody* who uses them, but they are
> seldom actually worth it.
>
> The one thing that works well is when you expose it as a standard
> interface. So futexes are linux-specific, but they are exposed as the
> standard pthreads condition variables etc to apps - very few actually
> use them as futexes. But because glibc uses them for the pthreads
> synchronization, I think they ended up being used inside glibc for
> low-level stuff too, so I think futexes ended up being an unqualified
> success - much better than the standard interface.
>
> The "it can be used in standard libraries" ends up being a very
> powerful thing. It doesn't have to be libc - if something like a glib
> or a big graphical interface uses them, they can get very popular. But
> if you have to have actual config options (autoconf or similar) to
> enable the feature on Linux, along with a compatibility case (because
> older kernels don't even support it, so it's not even "linux", it's
> "linux newer than xyz"), then very very few applications end up using
> it.
>
> And security issues in particular are often *very* subtle. For
> example, something like a system call filter sounds like an obviously
> safe thing: it can only limit what you do, right?
>
> Except no, not right at all. Imagine that you're limiting a suid
> application, and the one operation you limit is "setuid()". Imagine
> that the suid application explicitly drops privileges in order to run
> safely as the user. Imagine, further, that it doesn't even check the
> return value, because it *knows* that if it is root, it will succeed,
> and if it isn't root, then it wasn't suid to begin with and doesn't
> need to do anything about it.
>
> Unlikely? Hell no. That's standard practice. And if you allow filter
> setup that survives fork+exec, you just opened a HUGE security hole.
>
> Fixable? Yes, easily. And I haven't looked at the current patches, but
> I would not be AT ALL surprised if they had exactly the above huge
> security hole.

FWIW, none of the patches deal with privilege escalation via setuid
files or file capabilities.

> My point being that (a) I'm very dubious about new non-standard
> features, because historically they seldom get used very widely and
> (b) I'm doubly dubious about security things because it turns out it's
> damn easy to get it wrong in all kinds of small subtle details.

I agree with both points, so I'm being a bit hypocritical, I suspect.

At present, I'm not aware of any platforms that support system call
restriction in a non-platform-specific fashion: mac has seatbelt,
freebsd has things like capsicum, linux has seccomp :)  This led me to
the proposal around expanding seccomp since it was already a Linux-ism
for this functionality and, ideally, could be minimal to help limit
the subtle-bug-exposure.   However, any form of kernel attack surface
reduction would be great, but I'm unaware of any that integrate with
glibc smoothly (even if some do have nicer programming interfaces).

I've used system call filtering in the past with good effect in server
environments, and I believe the Chromium renderer example is a robust
one for Linux desktops, even without glibc integration.  If the
Linux-specific, non-automatic (glibc) interface is a no-go, then I'll
go back to the drawing board.  I'm not sure how to avoid something
Linux-specific in general, even if it's just adding syscall hooks to
LSMs, though it could be possible to share interfaces with some other
platform's implementation of a broader security system that includes
kernel exposure minimization (like capsicum) which could be built on
whatever existing substrate is available or a new one, as Ingo proposes.

thanks!
will

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-26 16:33                                                                                           ` Will Drewry
@ 2011-05-26 16:46                                                                                             ` Linus Torvalds
  2011-05-26 17:02                                                                                               ` Will Drewry
                                                                                                                 ` (3 more replies)
  0 siblings, 4 replies; 406+ messages in thread
From: Linus Torvalds @ 2011-05-26 16:46 UTC (permalink / raw)
  To: Will Drewry
  Cc: Colin Walters, Kees Cook, Thomas Gleixner, Ingo Molnar,
	Peter Zijlstra, Steven Rostedt, linux-kernel, James Morris

On Thu, May 26, 2011 at 9:33 AM, Will Drewry <wad@chromium.org> wrote:
>
> FWIW, none of the patches deal with privilege escalation via setuid
> files or file capabilities.

That is NOT AT ALL what I'm talking about.

I'm talking about the "setuid()" system call (and all its cousins:
setgid/setreuid etc). And the whole thread has been about filtering
system calls, no?

Do a google code search for setuid.

In good code, it will look something like

  uid = getuid();

  if (setuid(uid)) {
    fprintf(stderr, "Unable to drop privileges\n");
    exit(1);
  }

but I guarantee you that there are cases where people just blindly
drop privileges. google code search found me at least the "heirloom"
source code doing exactly that.

And if you filter system calls, it's entirely possible that you can
attack suid executables through such a vector. Your "limit system
calls for security" security suddenly turned into "avoid the system
call that made things secure"!

See what I'm saying?

                       Linus

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-26 16:46                                                                                             ` Linus Torvalds
@ 2011-05-26 17:02                                                                                               ` Will Drewry
  2011-05-26 17:04                                                                                                 ` Will Drewry
                                                                                                                   ` (2 more replies)
  2011-05-26 17:07                                                                                               ` Steven Rostedt
                                                                                                                 ` (2 subsequent siblings)
  3 siblings, 3 replies; 406+ messages in thread
From: Will Drewry @ 2011-05-26 17:02 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Colin Walters, Kees Cook, Thomas Gleixner, Ingo Molnar,
	Peter Zijlstra, Steven Rostedt, linux-kernel, James Morris

On Thu, May 26, 2011 at 11:46 AM, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
> On Thu, May 26, 2011 at 9:33 AM, Will Drewry <wad@chromium.org> wrote:
>>
>> FWIW, none of the patches deal with privilege escalation via setuid
>> files or file capabilities.
>
> That is NOT AT ALL what I'm talking about.
>
> I'm talking about the "setuid()" system call (and all its cousins:
> setgid/setreuid etc). And the whole thread has been about filtering
> system calls, no?
>
> Do a google code search for setuid.
>
> In good code, it will look something like
>
>  uid = getuid();
>
>  if (setuid(uid)) {
>    fprintf(stderr, "Unable to drop privileges\n");
>    exit(1);
>  }
>
> but I guarantee you that there are cases where people just blindly
> drop privileges. google code search found me at least the "heirloom"
> source code doing exactly that.
>
> And if you filter system calls, it's entirely possible that you can
> attack suid executables through such a vector. Your "limit system
> calls for security" security suddenly turned into "avoid the system
> call that made things secure"!
>
> See what I'm saying?

Absolutely - that was what I meant :/  The patches do not currently
check creds at creation or again at use, which would lead to
unprivileged filters being used in a privileged context.  Right now,
though, if setuid() is not allowed by the seccomp-filter, the process
will be immediately killed with do_exit(SIGKILL) on call -- thus
avoiding a silent failure. I mentioned file capabilities because they
can have setuid-like side effects, too.  As long as system call
rejection results in a process death, I *think* it helps with some of
this complexity, but I haven't fully vetted the patches for these
scenarios to be 100% confident.

Sorry I wasn't clear!
will

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-26 17:02                                                                                               ` Will Drewry
@ 2011-05-26 17:04                                                                                                 ` Will Drewry
  2011-05-26 17:17                                                                                                 ` Linus Torvalds
  2011-05-26 17:38                                                                                                 ` [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering Valdis.Kletnieks
  2 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-05-26 17:04 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Colin Walters, Kees Cook, Thomas Gleixner, Ingo Molnar,
	Peter Zijlstra, Steven Rostedt, linux-kernel, James Morris

On Thu, May 26, 2011 at 12:02 PM, Will Drewry <wad@chromium.org> wrote:
> On Thu, May 26, 2011 at 11:46 AM, Linus Torvalds
> <torvalds@linux-foundation.org> wrote:
>> On Thu, May 26, 2011 at 9:33 AM, Will Drewry <wad@chromium.org> wrote:
>>>
>>> FWIW, none of the patches deal with privilege escalation via setuid
>>> files or file capabilities.
>>
>> That is NOT AT ALL what I'm talking about.
>>
>> I'm talking about the "setuid()" system call (and all its cousins:
>> setgid/setreuid etc). And the whole thread has been about filtering
>> system calls, no?
>>
>> Do a google code search for setuid.
>>
>> In good code, it will look something like
>>
>>  uid = getuid();
>>
>>  if (setuid(uid)) {
>>    fprintf(stderr, "Unable to drop privileges\n");
>>    exit(1);
>>  }
>>
>> but I guarantee you that there are cases where people just blindly
>> drop privileges. google code search found me at least the "heirloom"
>> source code doing exactly that.
>>
>> And if you filter system calls, it's entirely possible that you can
>> attack suid executables through such a vector. Your "limit system
>> calls for security" security suddenly turned into "avoid the system
>> call that made things secure"!
>>
>> See what I'm saying?
>
> Absolutely - that was what I meant :/  The patches do not currently
> check creds at creation or again at use, which would lead to
> unprivileged filters being used in a privileged context.  Right now,
> though, if setuid() is not allowed by the seccomp-filter, the process
> will be immediately killed with do_exit(SIGKILL) on call -- thus
> avoiding a silent failure. I mentioned file capabilities because they
> can have setuid-like side effects, too.  As long as system call
> rejection results in a process death, I *think* it helps with some of
> this complexity, but I haven't fully vetted the patches for these
> scenarios to be 100% confident.

Bah - by "setuid-like side effects", I meant suid executable-like side
effects.  And even outside of those scenarios, I think immediate
process death helps resolve coding mistakes that lead to filtering
setuid() calls prior to use.

cheers,
will

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-26 16:46                                                                                             ` Linus Torvalds
  2011-05-26 17:02                                                                                               ` Will Drewry
@ 2011-05-26 17:07                                                                                               ` Steven Rostedt
  2011-05-26 18:43                                                                                                 ` Casey Schaufler
  2011-05-26 18:34                                                                                               ` david
  2011-05-26 18:54                                                                                               ` Ingo Molnar
  3 siblings, 1 reply; 406+ messages in thread
From: Steven Rostedt @ 2011-05-26 17:07 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Will Drewry, Colin Walters, Kees Cook, Thomas Gleixner,
	Ingo Molnar, Peter Zijlstra, linux-kernel, James Morris

On Thu, 2011-05-26 at 09:46 -0700, Linus Torvalds wrote:

> And if you filter system calls, it's entirely possible that you can
> attack suid executables through such a vector. Your "limit system
> calls for security" security suddenly turned into "avoid the system
> call that made things secure"!
> 
> See what I'm saying?

So you are not complaining about this implementation, but the use of
syscall filtering?

There may be some user that says, "oh I don't want my other apps to be
able to call setuid" thinking it will secure their application even
more. But because that application did the brain dead thing to not check
the return code of setuid, and it just happened to be running
privileged, it then execs off another application that can root the box.

Because, originally that setuid would have succeeded if the user did
nothing special, but now with this filtering, and the user thinking that
they could limit their app from doing harm, they just opened up a hole
that caused their app to do the exact opposite and give the exec'd app
full root privileges.

Did I get this right?

-- Steve



^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-26 17:02                                                                                               ` Will Drewry
  2011-05-26 17:04                                                                                                 ` Will Drewry
@ 2011-05-26 17:17                                                                                                 ` Linus Torvalds
  2011-05-26 17:38                                                                                                   ` Will Drewry
  2011-05-26 17:38                                                                                                 ` [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering Valdis.Kletnieks
  2 siblings, 1 reply; 406+ messages in thread
From: Linus Torvalds @ 2011-05-26 17:17 UTC (permalink / raw)
  To: Will Drewry
  Cc: Colin Walters, Kees Cook, Thomas Gleixner, Ingo Molnar,
	Peter Zijlstra, Steven Rostedt, linux-kernel, James Morris

On Thu, May 26, 2011 at 10:02 AM, Will Drewry <wad@chromium.org> wrote:
>
> Absolutely - that was what I meant :/  The patches do not currently
> check creds at creation or again at use, which would lead to
> unprivileged filters being used in a privileged context.  Right now,
> though, if setuid() is not allowed by the seccomp-filter, the process
> will be immediately killed with do_exit(SIGKILL) on call -- thus
> avoiding a silent failure.

Umm.

You do realize that there is a reason we don't allow random kill()
system calls to succeed without privileges either?

So no, "we kill it with sigkill" is not safe *either*. It now is
potentially a way to kill privileged processes that you didn't have
permission to kill.

My point is that it all sounds designed for well-behaved processes.
"kill it if it does something bad" sounds like a *wonderful* idea if
you're doing a sandbox.

But it is suddenly potentially deadly if that capability is used by a
malicious user for a process that isn't ready for it.

One option is to just not ever allow execve() from inside a restricted
environment.

                       Linus

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-26 17:17                                                                                                 ` Linus Torvalds
@ 2011-05-26 17:38                                                                                                   ` Will Drewry
  2011-05-26 18:33                                                                                                     ` Linus Torvalds
  0 siblings, 1 reply; 406+ messages in thread
From: Will Drewry @ 2011-05-26 17:38 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Colin Walters, Kees Cook, Thomas Gleixner, Ingo Molnar,
	Peter Zijlstra, Steven Rostedt, linux-kernel, James Morris

On Thu, May 26, 2011 at 12:17 PM, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
> On Thu, May 26, 2011 at 10:02 AM, Will Drewry <wad@chromium.org> wrote:
>>
>> Absolutely - that was what I meant :/  The patches do not currently
>> check creds at creation or again at use, which would lead to
>> unprivileged filters being used in a privileged context.  Right now,
>> though, if setuid() is not allowed by the seccomp-filter, the process
>> will be immediately killed with do_exit(SIGKILL) on call -- thus
>> avoiding a silent failure.
>
> Umm.
>
> You do realize that there is a reason we don't allow random kill()
> system calls to succeed without privileges either?
>
> So no, "we kill it with sigkill" is not safe *either*. It now is
> potentially a way to kill privileged processes that you didn't have
> permission to kill.
>
> My point is that it all sounds designed for well-behaved processes.
> "kill it if it does something bad" sounds like a *wonderful* idea if
> you're doing a sandbox.

Yeah - we end up in a weird place, because for many suid executables,
the failure would be immediate (at priv drop), but it introduces bugs
that will be less obvious in more complex scenarios.

> But it is suddenly potentially deadly if that capability is used by a
> malicious user for a process that isn't ready for it.
>
> One option is to just not ever allow execve() from inside a restricted
> environment.

That'd certainly be fine with me.  Another option could be adding
cred checking (from set to use) or execve-time checking or ..., but
simple works for me.  I'm not hung up on the implementation details
specifically if the end result is that the syscalls can be _safely_
whitelisted.

Thanks!

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-26 17:02                                                                                               ` Will Drewry
  2011-05-26 17:04                                                                                                 ` Will Drewry
  2011-05-26 17:17                                                                                                 ` Linus Torvalds
@ 2011-05-26 17:38                                                                                                 ` Valdis.Kletnieks
  2011-05-26 18:08                                                                                                   ` Will Drewry
  2 siblings, 1 reply; 406+ messages in thread
From: Valdis.Kletnieks @ 2011-05-26 17:38 UTC (permalink / raw)
  To: Will Drewry
  Cc: Linus Torvalds, Colin Walters, Kees Cook, Thomas Gleixner,
	Ingo Molnar, Peter Zijlstra, Steven Rostedt, linux-kernel,
	James Morris

[-- Attachment #1: Type: text/plain, Size: 1395 bytes --]

On Thu, 26 May 2011 12:02:45 CDT, Will Drewry said:

> Absolutely - that was what I meant :/  The patches do not currently
> check creds at creation or again at use, which would lead to
> unprivileged filters being used in a privileged context.  Right now,
> though, if setuid() is not allowed by the seccomp-filter, the process
> will be immediately killed with do_exit(SIGKILL) on call -- thus
> avoiding a silent failure.

How do you know you have the bounding set correct?

This has been a long-standing issue for SELinux policy writing - it's usually
easy to get 95% of the bounding box right (you need these rules for shared
libraries, you need these rules to access the user's home directory, you need
these other rules to talk TCP to the net, etc).  There's a nice tool that
converts any remaining rejection messages into rules you can add to the policy.

The problem is twofold: (a) that way you can never be sure you got *all* the
rules right and (b) the missing rules are almost always in squirrelly little
error-handling code that gets invoked once in a blue moon.  So in this case,
you end up with trying to debug the SIGKILL that happened when the process was
already in trouble for some other reason...

"Wow. Who would have guessed that program only called gettimeofday() in
the error handler for when it was formatting its crash message?"

Exactly.


[-- Attachment #2: Type: application/pgp-signature, Size: 227 bytes --]

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-26 11:38                                                                                             ` Ingo Molnar
@ 2011-05-26 18:06                                                                                               ` Avi Kivity
  2011-05-26 18:15                                                                                                 ` Ingo Molnar
  0 siblings, 1 reply; 406+ messages in thread
From: Avi Kivity @ 2011-05-26 18:06 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: James Morris, Linus Torvalds, Kees Cook, Thomas Gleixner,
	Peter Zijlstra, Will Drewry, Steven Rostedt, linux-kernel,
	gnatapov, Chris Wright, Pekka Enberg

On 05/26/2011 02:38 PM, Ingo Molnar wrote:
> * Avi Kivity<avi@redhat.com>  wrote:
>
> >  >  The biggest amount of RAM is the guest RAM image - but if that is
> >  >  mmap(SHARED) and mapped using hugepages then the pte overhead
> >  >  from a process model is largely mitigated.
> >
> >  That doesn't work with memory hotplug.
>
> Why not, if we do the sensible thing and restrict the size
> granularity and alignment of plugged/unplugged memory regions to 2MB?

Once forked, you cannot have new shared anonymous memory, can you?

> We can fix guest Linux as well to not be stupid about the sizing of
> memory hotplug requests. It does hotplug based on the memory map we
> pass to it anyway.
>
> Am i missing something obvious here?

Yes, the new mmap() will be only visible in the calling process.

> >  >  Maybe even the isolation and per device access control of
> >  >  *same-class* devices from each other is possible: with careful
> >  >  implementation of the subsystem shared data structures. (which
> >  >  isnt much really)
> >
> >  Right, hardly at all in fact.  The problem comes from the side-band
> >  issues like reset, interrupts, hotplug, and whatnot.
>
> Yeah. There are two good aspects here i think:
>
>   - The sideband IPC overhead does not matter much, it's a side band.
>
>   - Spending the effort to isolate configuration details is worth it:
>     sideband code is a primary breeding ground for bugs and security
>     holes.
>
> The main worry to me would be the maintainability difference: does it
> result in much more complex code? As always i'm cautiously optimistic
> about that: i think once we try it we can find a suitable model ...
> It might even turn out to be more readable and more flexible in the
> end.

I also believe it will be more maintainable, especially if written in a 
language that has explicit support for message passing (e.g. Erlang).  
This is because it is more similar to how hardware actually works.  
However it needs to be designed in, it's not just a matter of switching 
a thread to a process.

-- 
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.


^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-26 17:38                                                                                                 ` [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering Valdis.Kletnieks
@ 2011-05-26 18:08                                                                                                   ` Will Drewry
  2011-05-26 18:22                                                                                                     ` Valdis.Kletnieks
  0 siblings, 1 reply; 406+ messages in thread
From: Will Drewry @ 2011-05-26 18:08 UTC (permalink / raw)
  To: Valdis.Kletnieks
  Cc: Linus Torvalds, Colin Walters, Kees Cook, Thomas Gleixner,
	Ingo Molnar, Peter Zijlstra, Steven Rostedt, linux-kernel,
	James Morris

On Thu, May 26, 2011 at 12:38 PM,  <Valdis.Kletnieks@vt.edu> wrote:
> On Thu, 26 May 2011 12:02:45 CDT, Will Drewry said:
>
>> Absolutely - that was what I meant :/  The patches do not currently
>> check creds at creation or again at use, which would lead to
>> unprivileged filters being used in a privileged context.  Right now,
>> though, if setuid() is not allowed by the seccomp-filter, the process
>> will be immediately killed with do_exit(SIGKILL) on call -- thus
>> avoiding a silent failure.
>
> How do you know you have the bounding set correct?
>
> This has been a long-standing issue for SELinux policy writing - it's usually
> easy to get 95% of the bounding box right (you need these rules for shared
> libraries, you need these rules to access the user's home directory, you need
> these other rules to talk TCP to the net, etc).  There's a nice tool that
> converts any remaining rejection messages into rules you can add to the policy.
>
> The problem is twofold: (a) that way you can never be sure you got *all* the
> rules right and (b) the missing rules are almost always in squirrelly little
> error-handling code that gets invoked once in a blue moon.  So in this case,
> you end up with trying to debug the SIGKILL that happened when the process was
> already in trouble for some other reason...
>
> "Wow. Who would have guessed that program only called gettimeofday() in
> the error handler for when it was formatting its crash message?"
>
> Exactly.

Depending on the need, there is work involved, and there are many ways
to determine your bounding box.  It can be very tight -- where you
analyze normal workloads (perf,strace,objdump) and accept the fact
that pathological workloads may result in process death -- or it can
be quite loose and enable most system calls, just not newer ones,
let's say.  In practice, you might get bit a few times if you're
overly zealous (I know I have), but it's the difference between
failing open and failing closed.  There are some scenarios where you
never, ever want to fail-open even at the cost of process death and
lack of solid insight into a valid failure path.

Hope that makes sense and isn't too general,
will

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-26 18:06                                                                                               ` Avi Kivity
@ 2011-05-26 18:15                                                                                                 ` Ingo Molnar
  2011-05-26 18:20                                                                                                   ` Avi Kivity
  2011-05-26 18:22                                                                                                   ` Peter Zijlstra
  0 siblings, 2 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-26 18:15 UTC (permalink / raw)
  To: Avi Kivity
  Cc: James Morris, Linus Torvalds, Kees Cook, Thomas Gleixner,
	Peter Zijlstra, Will Drewry, Steven Rostedt, linux-kernel,
	gnatapov, Chris Wright, Pekka Enberg


* Avi Kivity <avi@redhat.com> wrote:

> On 05/26/2011 02:38 PM, Ingo Molnar wrote:
> >* Avi Kivity<avi@redhat.com>  wrote:
> >
> >>  >  The biggest amount of RAM is the guest RAM image - but if that is
> >>  >  mmap(SHARED) and mapped using hugepages then the pte overhead
> >>  >  from a process model is largely mitigated.
> >>
> >>  That doesn't work with memory hotplug.
> >
> > Why not, if we do the sensible thing and restrict the size 
> > granularity and alignment of plugged/unplugged memory regions to 
> > 2MB?
> 
> Once forked, you cannot have new shared anonymous memory, can you?

We can have named shared memory.

Incidentally i suggested this to Pekka just yesterday: i think we 
should consider guest RAM images to be named files on the local 
filesystem (prefixed with the disk image's name or so, for easy 
identification), this will help with debugging and with swapping as 
well. (This way guest RAM won't eat up regular anonymous swap space - 
it will be swapped to the filesystem.)

As a sidenote, live migration might also become possible this way: in 
theory we could freeze a guest to its RAM image - which can then be 
copied (together with the disk image) to another box as files and 
restarted there, with some hw configuration state dumped to a 
header portion of that RAM image as well. (outside of the RAM area)

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-26 18:15                                                                                                 ` Ingo Molnar
@ 2011-05-26 18:20                                                                                                   ` Avi Kivity
  2011-05-26 18:36                                                                                                     ` Ingo Molnar
  2011-05-26 18:22                                                                                                   ` Peter Zijlstra
  1 sibling, 1 reply; 406+ messages in thread
From: Avi Kivity @ 2011-05-26 18:20 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: James Morris, Linus Torvalds, Kees Cook, Thomas Gleixner,
	Peter Zijlstra, Will Drewry, Steven Rostedt, linux-kernel,
	gnatapov, Chris Wright, Pekka Enberg

On 05/26/2011 09:15 PM, Ingo Molnar wrote:
> * Avi Kivity<avi@redhat.com>  wrote:
>
> >  On 05/26/2011 02:38 PM, Ingo Molnar wrote:
> >  >* Avi Kivity<avi@redhat.com>   wrote:
> >  >
> >  >>   >   The biggest amount of RAM is the guest RAM image - but if that is
> >  >>   >   mmap(SHARED) and mapped using hugepages then the pte overhead
> >  >>   >   from a process model is largely mitigated.
> >  >>
> >  >>   That doesn't work with memory hotplug.
> >  >
> >  >  Why not, if we do the sensible thing and restrict the size
> >  >  granularity and alignment of plugged/unplugged memory regions to
> >  >  2MB?
> >
> >  Once forked, you cannot have new shared anonymous memory, can you?
>
> We can have named shared memory.

But then the benefit of transparent huge pages goes away.

Of course, if someone is working on extending transparent hugepages, the 
problem is solved.  I know there is interest in this.

> Incidentally i suggested this to Pekka just yesterday: i think we
> should consider guest RAM images to be named files on the local
> filesystem (prefixed with the disk image's name or so, for easy
> identification), this will help with debugging and with swapping as
> well. (This way guest RAM wont eat up regular anonymous swap space -
> it will be swapped to the filesystem.)

Qemu supports this via -mem-path.  The motivation was supporting 
hugetlbfs, before THP.  I can't say it was useful for debugging (but 
then qemu has a built in memory inspector and debugger, and supports 
attaching gdb to the guest).

> As a sidenote, live migration might also become possible this way: in
> theory we could freeze a guest to its RAM image - which can then be
> copied (together with the disk image) to another box as files and
> restarted there, with some hw configuration state dumped to a
> header portion of that RAM image as well. (outside of the RAM area)

Live migration involves the guest running in parallel with its memory 
being copied over.  Even a 1GB guest will take 10s over 1GbE; any 
reasonably sized guest will take forever.

-- 
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.


^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-26 18:15                                                                                                 ` Ingo Molnar
  2011-05-26 18:20                                                                                                   ` Avi Kivity
@ 2011-05-26 18:22                                                                                                   ` Peter Zijlstra
  2011-05-26 18:38                                                                                                     ` Ingo Molnar
  1 sibling, 1 reply; 406+ messages in thread
From: Peter Zijlstra @ 2011-05-26 18:22 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Avi Kivity, James Morris, Linus Torvalds, Kees Cook,
	Thomas Gleixner, Will Drewry, Steven Rostedt, linux-kernel,
	gnatapov, Chris Wright, Pekka Enberg

On Thu, 2011-05-26 at 20:15 +0200, Ingo Molnar wrote:
> Incidentally i suggested this to Pekka just yesterday: i think we 
> should consider guest RAM images to be named files on the local 
> filesystem (prefixed with the disk image's name or so, for easy 
> identification), 

That'll break THP and KSM, both rely on and work on anon only.


^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-26 18:08                                                                                                   ` Will Drewry
@ 2011-05-26 18:22                                                                                                     ` Valdis.Kletnieks
  0 siblings, 0 replies; 406+ messages in thread
From: Valdis.Kletnieks @ 2011-05-26 18:22 UTC (permalink / raw)
  To: Will Drewry
  Cc: Linus Torvalds, Colin Walters, Kees Cook, Thomas Gleixner,
	Ingo Molnar, Peter Zijlstra, Steven Rostedt, linux-kernel,
	James Morris

[-- Attachment #1: Type: text/plain, Size: 1212 bytes --]

On Thu, 26 May 2011 13:08:10 CDT, Will Drewry said:

> Depending on the need, there is work involved, and there are many ways
> to determine your bounding box.  It can be very tight -- where you
> analyze normal workloads (perf,strace,objdump) and accept the fact
> that pathological workloads may result in process death -- or it can
> be quite loose and enable most system calls, just not newer ones,
> let's say.  In practice, you might get bit a few times if you're
> overly zealous (I know I have), but it's the difference between
> failing open and failing closed.  There are some scenarios where you
> never, ever want to fail-open even at the cost of process death and
> lack of solid insight into a valid failure path.

> Hope that makes sense and isn't too general,

Oh, I already understood all that. :)   I'd have to double-check the actual
patch, does it give a (hopefully rate-limited) printk or other hint which
syscall caused the issue, to help in making up the list of needed syscalls?

And we probably want a cleaned-up copy of the quoted paragraph in the
documentation for this feature when it hits the streets.  People tuning in late
will need guidance on how to use this in their projects.


[-- Attachment #2: Type: application/pgp-signature, Size: 227 bytes --]

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-26 17:38                                                                                                   ` Will Drewry
@ 2011-05-26 18:33                                                                                                     ` Linus Torvalds
  2011-05-26 18:47                                                                                                       ` Ingo Molnar
  2011-05-26 18:49                                                                                                       ` Will Drewry
  0 siblings, 2 replies; 406+ messages in thread
From: Linus Torvalds @ 2011-05-26 18:33 UTC (permalink / raw)
  To: Will Drewry
  Cc: Colin Walters, Kees Cook, Thomas Gleixner, Ingo Molnar,
	Peter Zijlstra, Steven Rostedt, linux-kernel, James Morris

On Thu, May 26, 2011 at 10:38 AM, Will Drewry <wad@chromium.org> wrote:
>>
>> One option is to just not ever allow execve() from inside a restricted
>> environment.
>
> That'd certainly be fine with me.

So if it ends up being purely a "internal to the process" thing, then
I'm much happier about it - it not only limits the scope of things
sufficiently that I don't worry too much about security issues, but it
makes it very clear that it's about a process going into "lock-down"
mode on its own.

It also gets rid of all configuration - one of the things that makes
most security frameworks (look at selinux, but also just ACL's etc)
such a crazy rats nest is the whole "set up for other processes". If
it's designed very much to be about just the "self" process (after
initialization etc), then I think that avoids pretty much all the
serious issues.

A lot of server processes could probably use it as a way to say "Hey,
I guarantee that I will only open new files read-only, and will only
write to the socket that was already opened for me by the accept", and
explicitly limit their worker threads that way.

If that is really sufficient for some chrome sandboxing, then hey,
that's all fine.

Sometimes limiting yourself (rather than looking for some bigger
"generic" solution) is the right answer.

                         Linus

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-26 16:46                                                                                             ` Linus Torvalds
  2011-05-26 17:02                                                                                               ` Will Drewry
  2011-05-26 17:07                                                                                               ` Steven Rostedt
@ 2011-05-26 18:34                                                                                               ` david
  2011-05-26 18:54                                                                                               ` Ingo Molnar
  3 siblings, 0 replies; 406+ messages in thread
From: david @ 2011-05-26 18:34 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Will Drewry, Colin Walters, Kees Cook, Thomas Gleixner,
	Ingo Molnar, Peter Zijlstra, Steven Rostedt, linux-kernel,
	James Morris

On Thu, 26 May 2011, Linus Torvalds wrote:
> 
> On Thu, May 26, 2011 at 9:33 AM, Will Drewry <wad@chromium.org> wrote:
>>
>> FWIW, none of the patches deal with privilege escalation via setuid
>> files or file capabilities.
>
> That is NOT AT ALL what I'm talking about.
>
> I'm talking about the "setuid()" system call (and all its cousins:
> setgid/setreuid etc). And the whole thread has been about filtering
> system calls, no?
>
> Do a google code search for setuid.
>
> In good code, it will look something like
>
>  uid = getuid();
>
>  if (setuid(uid)) {
>    fprintf(stderr, "Unable to drop provileges\n");
>    exit(1);
>  }
>
> but I guarantee you that there are cases where people just blindly
> drop privileges. google code search found me at least the "heirloom"
> source code doing exactly that.

I believe that sendmail had this exact vulnerability when capabilities were 
used to control setuid a couple of years ago.

David Lang

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-26 18:20                                                                                                   ` Avi Kivity
@ 2011-05-26 18:36                                                                                                     ` Ingo Molnar
  2011-05-26 18:43                                                                                                       ` Valdis.Kletnieks
  0 siblings, 1 reply; 406+ messages in thread
From: Ingo Molnar @ 2011-05-26 18:36 UTC (permalink / raw)
  To: Avi Kivity
  Cc: James Morris, Linus Torvalds, Kees Cook, Thomas Gleixner,
	Peter Zijlstra, Will Drewry, Steven Rostedt, linux-kernel,
	gnatapov, Chris Wright, Pekka Enberg


* Avi Kivity <avi@redhat.com> wrote:

> Live migration involves the guest running in parallel with its 
> memory being copied over.  Even a 1GB guest will take 10s over 
> 1GbE; any reasonably sized guest will take forever.

I suspect we are really offtopic here, but an initial rsync, then 
stopping the guest, final rsync + restart the guest at the target 
would work with minimal interruption.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-26 18:22                                                                                                   ` Peter Zijlstra
@ 2011-05-26 18:38                                                                                                     ` Ingo Molnar
  2011-05-27  0:12                                                                                                       ` James Morris
  0 siblings, 1 reply; 406+ messages in thread
From: Ingo Molnar @ 2011-05-26 18:38 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Avi Kivity, James Morris, Linus Torvalds, Kees Cook,
	Thomas Gleixner, Will Drewry, Steven Rostedt, linux-kernel,
	gnatapov, Chris Wright, Pekka Enberg


* Peter Zijlstra <peterz@infradead.org> wrote:

> On Thu, 2011-05-26 at 20:15 +0200, Ingo Molnar wrote:
>
> > Incidentally i suggested this to Pekka just yesterday: i think we 
> > should consider guest RAM images to be named files on the local 
> > filesystem (prefixed with the disk image's name or so, for easy 
> > identification),
> 
> That'll break THP and KSM, both rely and work on anon only.

No reason they should be limited to anon only though.

Also, don't we have some sort of anonfs, from which we could get an 
fd, which, if mmap()-ed produces regular anonymous shared memory?

That fd could be passed over to other processes, who could then 
mmap() the new piece of shared-anon memory as well.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-26 17:07                                                                                               ` Steven Rostedt
@ 2011-05-26 18:43                                                                                                 ` Casey Schaufler
  2011-05-26 18:54                                                                                                   ` Steven Rostedt
  0 siblings, 1 reply; 406+ messages in thread
From: Casey Schaufler @ 2011-05-26 18:43 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Linus Torvalds, Will Drewry, Colin Walters, Kees Cook,
	Thomas Gleixner, Ingo Molnar, Peter Zijlstra, linux-kernel,
	James Morris

On 5/26/2011 10:07 AM, Steven Rostedt wrote:
> On Thu, 2011-05-26 at 09:46 -0700, Linus Torvalds wrote:
>
>> And if you filter system calls, it's entirely possible that you can
>> attack suid executables through such a vector. Your "limit system
>> calls for security" security suddenly turned into "avoid the system
>> call that made things secure"!
>>
>> See what I'm saying?
> So you are not complaining about this implementation, but the use of
> syscall filtering?
>
> There may be some user that says, "oh I don't want my other apps to be
> able to call setuid" thinking it will secure their application even
> more. But because that application did the brain dead thing to not check
> the return code of setuid, and it just happened to be running
> privileged, it then execs off another application that can root the box.
>
> Because, originally that setuid would have succeeded if the user did
> nothing special, but now with this filtering, and the user thinking that
> they could limit their app from doing harm, they just opened up a hole
> that caused their app to do the exact opposite and give the exec'd app
> full root privileges.
>
> Did I get this right?

Yes. Some system calls are there so that you can turn off
privilege. There was a major exploit with sendmail when capabilities
were first introduced that brought the potential for this sort of
problem into the public eye. Kernel mechanisms intended to provide
additional security have to be massively careful about the impact
they may have on applications that are currently security aware and
that make use of the existing mechanisms. The ACL mechanism is much
more complicated than it probably ought to be to accommodate chmod()
and capabilities go way over the top to deal with traditional root
behavior.

> -- Steve
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
>
>


^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-26 18:36                                                                                                     ` Ingo Molnar
@ 2011-05-26 18:43                                                                                                       ` Valdis.Kletnieks
  2011-05-26 18:50                                                                                                         ` Ingo Molnar
  0 siblings, 1 reply; 406+ messages in thread
From: Valdis.Kletnieks @ 2011-05-26 18:43 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Avi Kivity, James Morris, Linus Torvalds, Kees Cook,
	Thomas Gleixner, Peter Zijlstra, Will Drewry, Steven Rostedt,
	linux-kernel, gnatapov, Chris Wright, Pekka Enberg

[-- Attachment #1: Type: text/plain, Size: 516 bytes --]

On Thu, 26 May 2011 20:36:35 +0200, Ingo Molnar said:

> I suspect we are really offtopic here, but an initial rsync, then 
> stopping the guest, final rsync + restart the guest at the target 
> would work with minimal interruption.

Actually, after you kick off the migrate, you really want to be tracking in
real time what pages get dirtied while you're doing the initial copy, so that
the second rsync doesn't have to walk through the file finding the differences.
 But that requires some extra instrumentation.


[-- Attachment #2: Type: application/pgp-signature, Size: 227 bytes --]

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-26 18:33                                                                                                     ` Linus Torvalds
@ 2011-05-26 18:47                                                                                                       ` Ingo Molnar
  2011-05-26 19:05                                                                                                         ` david
  2011-05-26 18:49                                                                                                       ` Will Drewry
  1 sibling, 1 reply; 406+ messages in thread
From: Ingo Molnar @ 2011-05-26 18:47 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Will Drewry, Colin Walters, Kees Cook, Thomas Gleixner,
	Peter Zijlstra, Steven Rostedt, linux-kernel, James Morris


* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> It also gets rid of all configuration - one of the things that 
> makes most security frameworks (look at selinux, but also just 
> ACL's etc) such a crazy rats nest is the whole "set up for other 
> processes". If it's designed very much to be about just the "self" 
> process (after initialization etc), then I think that avoids pretty 
> much all the serious issues.

That's how the event filters work currently: even when inherited they 
get removed when exec-ing a setuid task, so they cannot leak into 
privileged context and cannot modify execution there.

Inheritance works when requested, covering only same-credential child 
tasks, not privileged successors.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-26 18:33                                                                                                     ` Linus Torvalds
  2011-05-26 18:47                                                                                                       ` Ingo Molnar
@ 2011-05-26 18:49                                                                                                       ` Will Drewry
  2011-06-01  3:10                                                                                                         ` [PATCH v3 01/13] tracing: split out filter initialization and clean up Will Drewry
                                                                                                                           ` (12 more replies)
  1 sibling, 13 replies; 406+ messages in thread
From: Will Drewry @ 2011-05-26 18:49 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Colin Walters, Kees Cook, Thomas Gleixner, Ingo Molnar,
	Peter Zijlstra, Steven Rostedt, linux-kernel, James Morris

On Thu, May 26, 2011 at 1:33 PM, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
> On Thu, May 26, 2011 at 10:38 AM, Will Drewry <wad@chromium.org> wrote:
>>>
>>> One option is to just not ever allow execve() from inside a restricted
>>> environment.
>>
>> That'd certainly be fine with me.
>
> So if it ends up being purely a "internal to the process" thing, then
> I'm much happier about it - it not only limits the scope of things
> sufficiently that I don't worry too much about security issues, but it
> makes it very clear that it's about a process going into "lock-down"
> mode on its own.
>
> It also gets rid of all configuration - one of the things that makes
> most security frameworks (look at selinux, but also just ACL's etc)
> such a crazy rats nest is the whole "set up for other processes". If
> it's designed very much to be about just the "self" process (after
> initialization etc), then I think that avoids pretty much all the
> serious issues.
>
> A lot of server processes could probably use it as a way to say "Hey,
> I guarantee that I will only open new files read-only, and will only
> write to the socket that was already opened for me by the accept", and
> explicitly limit their worker threads that way.
>
> If that is really sufficient for some chrome sandboxing, then hey,
> that's all fine.

It adds some hoops, but less than exist today.

> Sometimes limiting yourself (rather than looking for some bigger
> "generic" solution) is the right answer.

I will very happily validate usage and repost with a self-limited
patch series.  Doing so makes the change much more explicitly an
expansion of seccomp, which keeps things sane.

Thanks!
will

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-26 18:43                                                                                                       ` Valdis.Kletnieks
@ 2011-05-26 18:50                                                                                                         ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-26 18:50 UTC (permalink / raw)
  To: Valdis.Kletnieks
  Cc: Avi Kivity, James Morris, Linus Torvalds, Kees Cook,
	Thomas Gleixner, Peter Zijlstra, Will Drewry, Steven Rostedt,
	linux-kernel, gnatapov, Chris Wright, Pekka Enberg


* Valdis.Kletnieks@vt.edu <Valdis.Kletnieks@vt.edu> wrote:

> On Thu, 26 May 2011 20:36:35 +0200, Ingo Molnar said:
> 
> > I suspect we are really offtopic here, but an initial rsync, then 
> > stopping the guest, final rsync + restart the guest at the target 
> > would work with minimal interruption.
> 
> Actually, after you kick off the migrate, you really want to be 
> tracking in real time what pages get dirtied while you're doing the 
> initial copy, so that the second rsync doesn't have to walk through 
> the file finding the differences.  But that requires some extra 
> instrumentation.

Yeah, and that's how socket based live migration works - it's 
completely seamless.

But note that the rsync re-scan should not be an issue: both the 
source and the target system will obviously have a *lot* more RAM 
than the guest RAM image size.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-26 18:43                                                                                                 ` Casey Schaufler
@ 2011-05-26 18:54                                                                                                   ` Steven Rostedt
  0 siblings, 0 replies; 406+ messages in thread
From: Steven Rostedt @ 2011-05-26 18:54 UTC (permalink / raw)
  To: Casey Schaufler
  Cc: Linus Torvalds, Will Drewry, Colin Walters, Kees Cook,
	Thomas Gleixner, Ingo Molnar, Peter Zijlstra, linux-kernel,
	James Morris

On Thu, 2011-05-26 at 11:43 -0700, Casey Schaufler wrote:

> > Did I get this right?
> 
> Yes.

Thanks for the validation ;)

-- Steve



^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-26 16:46                                                                                             ` Linus Torvalds
                                                                                                                 ` (2 preceding siblings ...)
  2011-05-26 18:34                                                                                               ` david
@ 2011-05-26 18:54                                                                                               ` Ingo Molnar
  3 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-26 18:54 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Will Drewry, Colin Walters, Kees Cook, Thomas Gleixner,
	Peter Zijlstra, Steven Rostedt, linux-kernel, James Morris


* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> And if you filter system calls, it's entirely possible that you can 
> attack suid executables through such a vector. Your "limit system 
> calls for security" security suddenly turned into "avoid the system 
> call that made things secure"!

That should not be possible with Will's event filter based solution 
(his last submitted patch), due to this code in fs/exec.c (which is 
in your upstream tree as well):

        /*
         * Flush performance counters when crossing a
         * security domain:
         */
        if (!get_dumpable(current->mm))
                perf_event_exit_task(current);

This will drop all filters if a setuid-root (or whatever setuid) 
binary is executed from a filtered environment.

Does this cover the case you were thinking of?

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-26 18:47                                                                                                       ` Ingo Molnar
@ 2011-05-26 19:05                                                                                                         ` david
  2011-05-26 19:09                                                                                                           ` Eric Paris
  2011-05-26 19:46                                                                                                           ` Ingo Molnar
  0 siblings, 2 replies; 406+ messages in thread
From: david @ 2011-05-26 19:05 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Linus Torvalds, Will Drewry, Colin Walters, Kees Cook,
	Thomas Gleixner, Peter Zijlstra, Steven Rostedt, linux-kernel,
	James Morris

On Thu, 26 May 2011, Ingo Molnar wrote:

> * Linus Torvalds <torvalds@linux-foundation.org> wrote:
>
>> It also gets rid of all configuration - one of the things that
>> makes most security frameworks (look at selinux, but also just
>> ACL's etc) such a crazy rats nest is the whole "set up for other
>> processes". If it's designed very much to be about just the "self"
>> process (after initialization etc), then I think that avoids pretty
>> much all the serious issues.
>
> That's how the event filters work currently: even when inherited they
> get removed when exec-ing a setuid task, so they cannot leak into
> privileged context and cannot modify execution there.
>
> Inheritance works when requested, covering only same-credential child
> tasks, not privileged successors.

this is a very reasonable default, but there should be some way of saying 
that you want the restrictions to carry over to the suid task (I really 
know what I'm doing switch)

David Lang

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-26 19:05                                                                                                         ` david
@ 2011-05-26 19:09                                                                                                           ` Eric Paris
  2011-05-26 19:46                                                                                                           ` Ingo Molnar
  1 sibling, 0 replies; 406+ messages in thread
From: Eric Paris @ 2011-05-26 19:09 UTC (permalink / raw)
  To: david
  Cc: Ingo Molnar, Linus Torvalds, Will Drewry, Colin Walters,
	Kees Cook, Thomas Gleixner, Peter Zijlstra, Steven Rostedt,
	linux-kernel, James Morris

On Thu, May 26, 2011 at 3:05 PM,  <david@lang.hm> wrote:
> On Thu, 26 May 2011, Ingo Molnar wrote:
>
>> * Linus Torvalds <torvalds@linux-foundation.org> wrote:
>>
>>> It also gets rid of all configuration - one of the things that
>>> makes most security frameworks (look at selinux, but also just
>>> ACL's etc) such a crazy rats nest is the whole "set up for other
>>> processes". If it's designed very much to be about just the "self"
>>> process (after initialization etc), then I think that avoids pretty
>>> much all the serious issues.
>>
>> That's how the event filters work currently: even when inherited they
>> get removed when exec-ing a setuid task, so they cannot leak into
>> privileged context and cannot modify execution there.
>>
>> Inheritance works when requested, covering only same-credential child
>> tasks, not privileged successors.
>
> this is a very reasonable default, but there should be some way of saying
> that you want the restrictions to carry over to the suid task (I really know
> what I'm doing switch)

You mean the "i'm a hacker and want to be able to learn about tasks I
shouldn't be able to learn about" switch?  No.  You either get out of
the way on SUID or refuse to launch SUID apps.  Those are the only
reasonable choices.

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-26 19:05                                                                                                         ` david
  2011-05-26 19:09                                                                                                           ` Eric Paris
@ 2011-05-26 19:46                                                                                                           ` Ingo Molnar
  2011-05-26 19:49                                                                                                             ` david
  1 sibling, 1 reply; 406+ messages in thread
From: Ingo Molnar @ 2011-05-26 19:46 UTC (permalink / raw)
  To: david
  Cc: Linus Torvalds, Will Drewry, Colin Walters, Kees Cook,
	Thomas Gleixner, Peter Zijlstra, Steven Rostedt, linux-kernel,
	James Morris


* david@lang.hm <david@lang.hm> wrote:

> On Thu, 26 May 2011, Ingo Molnar wrote:
> 
> >* Linus Torvalds <torvalds@linux-foundation.org> wrote:
> >
> >>It also gets rid of all configuration - one of the things that
> >>makes most security frameworks (look at selinux, but also just
> >>ACL's etc) such a crazy rats nest is the whole "set up for other
> >>processes". If it's designed very much to be about just the "self"
> >>process (after initialization etc), then I think that avoids pretty
> >>much all the serious issues.
> >
> >That's how the event filters work currently: even when inherited they
> >get removed when exec-ing a setuid task, so they cannot leak into
> >privileged context and cannot modify execution there.
> >
> >Inheritance works when requested, covering only same-credential child
> >tasks, not privileged successors.
> 
> this is a very reasonable default, but there should be some way of 
> saying that you want the restrictions to carry over to the suid 
> task (I really know what I'm doing switch)

Unless you mean that root should be able to do it, it's a security 
hole both for events and for filters:

 - for example we don't want really fine-grained events to be used to 
   BTS hw-trace sshd and thus enable it to discover cryptographic 
   properties of the private key sshd is using.

 - we do not want to *modify* the execution flow of a setuid program,
   that can lead to exploits: by pushing the privileged codepath into 
   a condition that can never occur on a normal system - and thus can 
   push it into doing something it was not intended to do.

   data damage could be done as well: for example if the privileged 
   code is logging into a system file then modifying execution can 
   damage the log file.

So it's not a good idea in general to allow unprivileged code to 
modify the execution of privileged code. In fact it's not a good idea 
to allow it to simply *observe* privileged code. It must remain a 
black box with very little information leaking outwards.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-26 19:46                                                                                                           ` Ingo Molnar
@ 2011-05-26 19:49                                                                                                             ` david
  0 siblings, 0 replies; 406+ messages in thread
From: david @ 2011-05-26 19:49 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Linus Torvalds, Will Drewry, Colin Walters, Kees Cook,
	Thomas Gleixner, Peter Zijlstra, Steven Rostedt, linux-kernel,
	James Morris

On Thu, 26 May 2011, Ingo Molnar wrote:

> * david@lang.hm <david@lang.hm> wrote:
>
>> On Thu, 26 May 2011, Ingo Molnar wrote:
>>
>>> * Linus Torvalds <torvalds@linux-foundation.org> wrote:
>>>
>>>> It also gets rid of all configuration - one of the things that
>>>> makes most security frameworks (look at selinux, but also just
>>>> ACL's etc) such a crazy rats nest is the whole "set up for other
>>>> processes". If it's designed very much to be about just the "self"
>>>> process (after initialization etc), then I think that avoids pretty
>>>> much all the serious issues.
>>>
>>> That's how the event filters work currently: even when inherited they
>>> get removed when exec-ing a setuid task, so they cannot leak into
>>> privileged context and cannot modify execution there.
>>>
>>> Inheritance works when requested, covering only same-credential child
>>> tasks, not privileged successors.
>>
>> this is a very reasonable default, but there should be some way of
>> saying that you want the restrictions to carry over to the suid
>> task (I really know what I'm doing switch)
>
> Unless you mean that root should be able to do it it's a security
> hole both for events and for filters:
>
> - for example we dont want really finegrained events to be used to
>   BTS hw-trace sshd and thus enable it to discover cryptographic
>   properties of the private key sshd is using.
>
> - we do not want to *modify* the execution flow of a setuid program,
>   that can lead to exploits: by pushing the privileged codepath into
>   a condition that can never occur on a normal system - and thus can
>   push it into doing something it was not intended to do.
>
>   data damage could be done as well: for example if the privileged
>   code is logging into a system file then modifying execution can
>   damage the log file.
>
> So it's not a good idea in general to allow unprivileged code to
> modify the execution of privileged code. In fact it's not a good idea
> to allow it to simply *observe* privileged code. It must remain a
> black box with very few information leaking outwards.

I was thinking of the use case of the real sysadmin (i.e. root) wanting to 
be able to constrain things. I can see why you would not want to allow 
normal users to do this.

David Lang

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-26 18:38                                                                                                     ` Ingo Molnar
@ 2011-05-27  0:12                                                                                                       ` James Morris
  0 siblings, 0 replies; 406+ messages in thread
From: James Morris @ 2011-05-27  0:12 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Peter Zijlstra, Avi Kivity, Linus Torvalds, Kees Cook,
	Thomas Gleixner, Will Drewry, Steven Rostedt, linux-kernel,
	gnatapov, Chris Wright, Pekka Enberg

Btw, if anyone's going to be at Plumbers this year, we have a day set 
aside for the security summit:

https://security.wiki.kernel.org/index.php/LinuxSecuritySummit2011

This may make a good discussion topic.


- James
-- 
James Morris
<jmorris@namei.org>

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-25 18:42                                                                                 ` Linus Torvalds
                                                                                                     ` (2 preceding siblings ...)
  2011-05-26  1:19                                                                                   ` James Morris
@ 2011-05-29 16:51                                                                                   ` Aneesh Kumar K.V
  2011-05-29 17:02                                                                                     ` Linus Torvalds
  3 siblings, 1 reply; 406+ messages in thread
From: Aneesh Kumar K.V @ 2011-05-29 16:51 UTC (permalink / raw)
  To: Linus Torvalds, Kees Cook, Al Viro
  Cc: Thomas Gleixner, Ingo Molnar, Peter Zijlstra, Will Drewry,
	Steven Rostedt, linux-kernel

On Wed, 25 May 2011 11:42:44 -0700, Linus Torvalds <torvalds@linux-foundation.org> wrote:
> On Wed, May 25, 2011 at 11:01 AM, Kees Cook <kees.cook@canonical.com> wrote:
> >
> > Can we just go back to the original spec? A lot of people were excited
> > about the prctl() API as done in Will's earlier patchset, we don't lose the
> > extremely useful "enable_on_exec" feature, and we can get away from all
> > this disagreement.
> 
> .. and quite frankly, I'm not even convinced about the original simpler spec.
> 
> Security is a morass. People come up with cool ideas every day, and
> nobody actually uses them - or if they use them, they are just a
> maintenance nightmare.
> 
> Quite frankly, limiting pathname access by some prefix is "cool", but
> it's basically useless.
> 
> That's not where security problems are.
> 
> Security problems are in the odd corners - ioctl's, /proc files,
> random small interfaces that aren't just about file access.
> 
> And who would *use* this thing in real life? Nobody. In order to sell
> me on a new security interface, give me a real actual use case that is
> security-conscious and relevant to real users.
> 
> For things like web servers that actually want to limit filename
> lookup, we'd be <i>much</i> better off with a few new flags to
> pathname lookup that say "don't follow symlinks" and "don't follow
> '..'". Things like that can actually be beneficial to
> security-conscious programming, with very little overhead. Some of
> those things currently look up pathnames one component at a time,
> because they can't afford to not do so. That's a *much* better model
> for the whole "only limit to this subtree" case that was quoted
> sometime early in this thread.


The "make sure we don't follow symlinks at all" is a real problem in
VirtFS (http://wiki.qemu.org/Documentation/9psetup) that we are fixing
by adding a forked chrooted process to Qemu. If we are open to a new
open flag O_NOFOLLOW_PATH, which would fail with ELOOP if any of the
path components is a symbolic link, that would greatly simplify VirtFS.
Will such a new flag to open be acceptable?


-aneesh


^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-29 16:51                                                                                   ` Aneesh Kumar K.V
@ 2011-05-29 17:02                                                                                     ` Linus Torvalds
  2011-05-29 18:23                                                                                       ` Al Viro
  0 siblings, 1 reply; 406+ messages in thread
From: Linus Torvalds @ 2011-05-29 17:02 UTC (permalink / raw)
  To: Aneesh Kumar K.V
  Cc: Kees Cook, Al Viro, Thomas Gleixner, Ingo Molnar, Peter Zijlstra,
	Will Drewry, Steven Rostedt, linux-kernel

On Sun, May 29, 2011 at 9:51 AM, Aneesh Kumar K.V
<aneesh.kumar@linux.vnet.ibm.com> wrote:
>
> The "make sure we don't follow symlinks at all" is a real problem in
> VirtFS (http://wiki.qemu.org/Documentation/9psetup) that we are fixing
> by adding a forked chrooted process to Qemu. If we are open to a new
> open flag O_NOFOLLOW_PATH, which would fail with ELOOP if any of the
> path component is a symbolic link, that would greatly simplify VirtFS.
> Will such a new flag to open be acceptable ?

Such a flag should be something like 3 lines of actual code (and then
the header file changes to actually add the mask itself, which is apt
to be the bulk of the patch just because we have to have different
values for different architectures).

And yes, it is absolutely acceptable. The only questions in my mind are

 - why haven't we done this long ago?

 - do we have the flag space?

 - should we do a O_NOMNT_PATH flag to do the same for mount-points?

  Some people worry about being confused by bind mounts etc.

 - do we think ".." is worthy of a flag too?

   or is that a "user space can damn well check that itself, even if
it would be absolutely trivial to check in the kernel too"?

Whatever. I think the NOFOLLOW_PATH one is pretty much a no-brainer.
It's not like symlink worries are unusual.

                    Linus

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-29 17:02                                                                                     ` Linus Torvalds
@ 2011-05-29 18:23                                                                                       ` Al Viro
  0 siblings, 0 replies; 406+ messages in thread
From: Al Viro @ 2011-05-29 18:23 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Aneesh Kumar K.V, Kees Cook, Thomas Gleixner, Ingo Molnar,
	Peter Zijlstra, Will Drewry, Steven Rostedt, linux-kernel

On Sun, May 29, 2011 at 10:02:06AM -0700, Linus Torvalds wrote:

> And yes, it is absolutely acceptable. The only questions in my mind are
> 
>  - why haven't we done this long ago?
> 
>  - do we have the flag space?
> 
>  - should we do a O_NOMNT_PATH flag to do the same for mount-points?
> 
>   Some people worry about being confused by bind mounts etc.
> 
>  - do we think ".." is worthy of a flag too?
> 
>    or is that a "user space can damn well check that itself, even if
> it would be absolutely trivial to check in the kernel too"?
> 
> Whatever. I think the NOFOLLOW_PATH one is pretty much a no-brainer.
> It's not like symlink worries are unusual.

It's not *quite* a no-brainer.  Guys, please hold that one off for a while;
we have more massage to do in the area and I *really* want to get atomic
open work finished (== intents gone, revalidation vs mountpoints sanitized,
etc.) before anything else is done to fs/namei.c.  OK?

And as for .. - userland can bloody well check that on its own if it cares.
Let's keep it simple, please - we already have things far too complicated
in there for my taste.

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-25 17:43                                                                               ` Peter Zijlstra
  (?)
@ 2011-05-29 20:17                                                                                 ` Ingo Molnar
  -1 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-29 20:17 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Thomas Gleixner, Will Drewry, Steven Rostedt,
	Frederic Weisbecker, James Morris, linux-kernel, Eric Paris,
	kees.cook, agl, Serge E. Hallyn, Ingo Molnar, Andrew Morton,
	Tejun Heo, Michal Marek, Oleg Nesterov, Jiri Slaby,
	David Howells, Russell King, Michal Simek, Ralf Baechle,
	Benjamin Herrenschmidt, Paul Mackerras, Martin Schwidefsky,
	Heiko Carstens, linux390, Paul Mundt, David S. Miller,
	H. Peter Anvin, x86, linux-arm-kernel, linux-mips, linuxppc-dev,
	linux-s390, linux-sh, sparclinux, Linus Torvalds


* Peter Zijlstra <peterz@infradead.org> wrote:

> But face it, you can argue until you're blue in the face,

That is not a technical argument though - and i considered and 
answered every valid technical argument made by you and Thomas.
You were either not able to or not willing to counter them.

> [...] but both tglx and I will NAK any and all patches that extend 
> perf/ftrace beyond the passive observing role.

The thing is, perf is *already* well beyond the 'passive observer' 
role: we already generate lots of 'action' in response to events. We 
generate notification signals, we write events - all of which can 
(and does) modify program behavior.

So what's your point? There's no "passive observer" role really - 
it's apparently just that you dislike this use of instrumentation 
while you approve of other uses.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-29 20:17                                                                                 ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-29 20:17 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-mips, linux-sh, Frederic Weisbecker, Heiko Carstens,
	Oleg Nesterov, David Howells, Paul Mackerras, Eric Paris,
	H. Peter Anvin, sparclinux, Jiri Slaby, linux-s390, Russell King,
	x86, James Morris, Linus Torvalds, Ingo Molnar, kees.cook,
	Serge E. Hallyn, Steven Rostedt, Martin Schwidefsky,
	Thomas Gleixner, linux-arm-kernel, Michal Marek, Michal Simek,
	Will Drewry, linuxppc-dev, linux-kernel, Ralf Baechle,
	Paul Mundt, Tejun Heo, linux390, Andrew Morton, agl,
	David S. Miller


* Peter Zijlstra <peterz@infradead.org> wrote:

> But face it, you can argue until you're blue in the face,

That is not a technical argument though - and i considered and 
answered every valid technical argument made by you and Thomas.
You were either not able to or not willing to counter them.

> [...] but both tglx and I will NAK any and all patches that extend 
> perf/ftrace beyond the passive observing role.

The thing is, perf is *already* well beyond the 'passive observer' 
role: we already generate lots of 'action' in response to events. We 
generate notification signals, we write events - all of which can 
(and does) modify program behavior.

So what's your point? There's no "passive observer" role really - 
it's apparently just that you dislike this use of instrumentation 
while you approve of other uses.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
@ 2011-05-29 20:17                                                                                 ` Ingo Molnar
  0 siblings, 0 replies; 406+ messages in thread
From: Ingo Molnar @ 2011-05-29 20:17 UTC (permalink / raw)
  To: linux-arm-kernel


* Peter Zijlstra <peterz@infradead.org> wrote:

> But face it, you can argue until you're blue in the face,

That is not a technical argument though - and i considered and 
answered every valid technical argument made by you and Thomas.
You were either not able to or not willing to counter them.

> [...] but both tglx and I will NAK any and all patches that extend 
> perf/ftrace beyond the passive observing role.

The thing is, perf is *already* well beyond the 'passive observer' 
role: we already generate lots of 'action' in response to events. We 
generate notification signals, we write events - all of which can 
(and does) modify program behavior.

So what's your point? There's no "passive observer" role really - 
it's apparently just that you dislike this use of instrumentation 
while you approve of other uses.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* [PATCH v3 01/13] tracing: split out filter initialization and clean up.
  2011-05-26 18:49                                                                                                       ` Will Drewry
@ 2011-06-01  3:10                                                                                                         ` Will Drewry
  2011-06-01  3:10                                                                                                         ` [PATCH v3 02/13] tracing: split out syscall_trace_enter construction Will Drewry
                                                                                                                           ` (11 subsequent siblings)
  12 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-06-01  3:10 UTC (permalink / raw)
  To: linux-kernel
  Cc: kees.cook, torvalds, tglx, mingo, rostedt, jmorris, Will Drewry,
	Peter Zijlstra, Paul Mackerras, Arnaldo Carvalho de Melo,
	Frederic Weisbecker

Moves the perf-specific profile event allocation and freeing code into
kernel/perf_event.c where it is called from and two symbols are exported
via ftrace_event.h for instantiating struct event_filters without
requiring a change to the core tracing code.

The change allows globally registered ftrace events to be used in
event_filter structs.  perf is the current consumer, but a possible
future consumer is a system call filtering using the secure computing
hooks (and the existing syscalls subsystem events).

Signed-off-by: Will Drewry <wad@chromium.org>
---
 include/linux/ftrace_event.h       |    9 +++--
 kernel/perf_event.c                |    7 +++-
 kernel/trace/trace_events_filter.c |   60 ++++++++++++++++++++++--------------
 3 files changed, 48 insertions(+), 28 deletions(-)

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 22b32af..fea9d98 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -216,6 +216,12 @@ extern int filter_current_check_discard(struct ring_buffer *buffer,
 					void *rec,
 					struct ring_buffer_event *event);
 
+extern void ftrace_free_filter(struct event_filter *filter);
+extern int ftrace_parse_filter(struct event_filter **filter,
+			       int event_id,
+			       const char *filter_str);
+extern const char *ftrace_get_filter_string(const struct event_filter *filter);
+
 enum {
 	FILTER_OTHER = 0,
 	FILTER_STATIC_STRING,
@@ -266,9 +272,6 @@ extern int  perf_trace_init(struct perf_event *event);
 extern void perf_trace_destroy(struct perf_event *event);
 extern int  perf_trace_add(struct perf_event *event, int flags);
 extern void perf_trace_del(struct perf_event *event, int flags);
-extern int  ftrace_profile_set_filter(struct perf_event *event, int event_id,
-				     char *filter_str);
-extern void ftrace_profile_free_filter(struct perf_event *event);
 extern void *perf_trace_buf_prepare(int size, unsigned short type,
 				    struct pt_regs *regs, int *rctxp);
 
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 8e81a98..1da45e7 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -5588,7 +5588,8 @@ static int perf_event_set_filter(struct perf_event *event, void __user *arg)
 	if (IS_ERR(filter_str))
 		return PTR_ERR(filter_str);
 
-	ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
+	ret = ftrace_parse_filter(&event->filter, event->attr.config,
+				  filter_str);
 
 	kfree(filter_str);
 	return ret;
@@ -5596,7 +5597,9 @@ static int perf_event_set_filter(struct perf_event *event, void __user *arg)
 
 static void perf_event_free_filter(struct perf_event *event)
 {
-	ftrace_profile_free_filter(event);
+	struct event_filter *filter = event->filter;
+	event->filter = NULL;
+	ftrace_free_filter(filter);
 }
 
 #else
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 8008ddc..787b174 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -522,7 +522,7 @@ static void remove_filter_string(struct event_filter *filter)
 }
 
 static int replace_filter_string(struct event_filter *filter,
-				 char *filter_string)
+				 const char *filter_string)
 {
 	kfree(filter->filter_string);
 	filter->filter_string = kstrdup(filter_string, GFP_KERNEL);
@@ -1936,21 +1936,27 @@ out_unlock:
 	return err;
 }
 
-#ifdef CONFIG_PERF_EVENTS
-
-void ftrace_profile_free_filter(struct perf_event *event)
+/* ftrace_free_filter - frees a parsed filter and its internal structures.
+ *
+ * @filter: pointer to the event_filter to free.
+ */
+void ftrace_free_filter(struct event_filter *filter)
 {
-	struct event_filter *filter = event->filter;
-
-	event->filter = NULL;
-	__free_filter(filter);
+	if (filter)
+		__free_filter(filter);
 }
+EXPORT_SYMBOL_GPL(ftrace_free_filter);
 
-int ftrace_profile_set_filter(struct perf_event *event, int event_id,
-			      char *filter_str)
+/* ftrace_parse_filter - allocates and populates a new event_filter
+ *
+ * @event_id: may be something like syscalls::sys_event_tkill's id.
+ * @filter_str: filter string; it is copied, ownership is NOT taken.
+ */
+int ftrace_parse_filter(struct event_filter **filter,
+			int event_id,
+			const char *filter_str)
 {
 	int err;
-	struct event_filter *filter;
 	struct filter_parse_state *ps;
 	struct ftrace_event_call *call = NULL;
 
@@ -1966,12 +1972,12 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
 		goto out_unlock;
 
 	err = -EEXIST;
-	if (event->filter)
+	if (*filter)
 		goto out_unlock;
 
-	filter = __alloc_filter();
-	if (!filter) {
-		err = PTR_ERR(filter);
+	*filter = __alloc_filter();
+	if (IS_ERR_OR_NULL(*filter)) {
+		err = PTR_ERR(*filter);
 		goto out_unlock;
 	}
 
@@ -1980,14 +1986,14 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
 	if (!ps)
 		goto free_filter;
 
-	parse_init(ps, filter_ops, filter_str);
+	replace_filter_string(*filter, filter_str);
+
+	parse_init(ps, filter_ops, (*filter)->filter_string);
 	err = filter_parse(ps);
 	if (err)
 		goto free_ps;
 
-	err = replace_preds(call, filter, ps, filter_str, false);
-	if (!err)
-		event->filter = filter;
+	err = replace_preds(call, *filter, ps, (*filter)->filter_string, false);
 
 free_ps:
 	filter_opstack_clear(ps);
@@ -1995,14 +2001,22 @@ free_ps:
 	kfree(ps);
 
 free_filter:
-	if (err)
-		__free_filter(filter);
+	if (err) {
+		__free_filter(*filter);
+		*filter = NULL;
+	}
 
 out_unlock:
 	mutex_unlock(&event_mutex);
 
 	return err;
 }
+EXPORT_SYMBOL_GPL(ftrace_parse_filter);
 
-#endif /* CONFIG_PERF_EVENTS */
-
+const char *ftrace_get_filter_string(const struct event_filter *filter)
+{
+	if (!filter)
+		return NULL;
+	return filter->filter_string;
+}
+EXPORT_SYMBOL_GPL(ftrace_get_filter_string);
-- 
1.7.0.4


^ permalink raw reply related	[flat|nested] 406+ messages in thread

* [PATCH v3 02/13] tracing: split out syscall_trace_enter construction
  2011-05-26 18:49                                                                                                       ` Will Drewry
  2011-06-01  3:10                                                                                                         ` [PATCH v3 01/13] tracing: split out filter initialization and clean up Will Drewry
@ 2011-06-01  3:10                                                                                                         ` Will Drewry
  2011-06-01  7:00                                                                                                           ` Ingo Molnar
  2011-06-01  3:10                                                                                                         ` [PATCH v3 03/13] seccomp_filters: new mode with configurable syscall filters Will Drewry
                                                                                                                           ` (10 subsequent siblings)
  12 siblings, 1 reply; 406+ messages in thread
From: Will Drewry @ 2011-06-01  3:10 UTC (permalink / raw)
  To: linux-kernel
  Cc: kees.cook, torvalds, tglx, mingo, rostedt, jmorris, Will Drewry,
	Frederic Weisbecker, Ingo Molnar

perf appears to be the primary consumer of the CONFIG_FTRACE_SYSCALLS
infrastructure.  As such, many of the helpers that target perf can be split
into a perf-focused helper and a generic CONFIG_FTRACE_SYSCALLS
consumer interface.

This change splits out syscall_trace_enter construction from
perf_syscall_enter for current into two helpers:
- ftrace_syscall_enter_state
- ftrace_syscall_enter_state_size

And adds another helper for completeness:
- ftrace_syscall_exit_state_size

These helpers allow for shared code between perf ftrace events and
any other consumers of CONFIG_FTRACE_SYSCALLS events.  The proposed
seccomp_filter patches use this code.

Signed-off-by: Will Drewry <wad@chromium.org>
---
 include/trace/syscall.h       |    4 ++
 kernel/trace/trace_syscalls.c |   96 +++++++++++++++++++++++++++++++++++------
 2 files changed, 86 insertions(+), 14 deletions(-)

diff --git a/include/trace/syscall.h b/include/trace/syscall.h
index 31966a4..242ae04 100644
--- a/include/trace/syscall.h
+++ b/include/trace/syscall.h
@@ -41,6 +41,10 @@ extern int reg_event_syscall_exit(struct ftrace_event_call *call);
 extern void unreg_event_syscall_exit(struct ftrace_event_call *call);
 extern int
 ftrace_format_syscall(struct ftrace_event_call *call, struct trace_seq *s);
+extern int ftrace_syscall_enter_state(u8 *buf, size_t available,
+				      struct trace_entry **entry);
+extern size_t ftrace_syscall_enter_state_size(int nb_args);
+extern size_t ftrace_syscall_exit_state_size(void);
 enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags,
 				      struct trace_event *event);
 enum print_line_t print_syscall_exit(struct trace_iterator *iter, int flags,
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index ee7b5a0..f37f120 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -95,7 +95,7 @@ find_syscall_meta(unsigned long syscall)
 	return NULL;
 }
 
-static struct syscall_metadata *syscall_nr_to_meta(int nr)
+struct syscall_metadata *syscall_nr_to_meta(int nr)
 {
 	if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
 		return NULL;
@@ -498,7 +498,7 @@ static int sys_perf_refcount_exit;
 static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 {
 	struct syscall_metadata *sys_data;
-	struct syscall_trace_enter *rec;
+	void *buf;
 	struct hlist_head *head;
 	int syscall_nr;
 	int rctx;
@@ -513,25 +513,22 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 		return;
 
 	/* get the size after alignment with the u32 buffer size field */
-	size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
-	size = ALIGN(size + sizeof(u32), sizeof(u64));
-	size -= sizeof(u32);
+	size = ftrace_syscall_enter_state_size(sys_data->nb_args);
 
 	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
 		      "perf buffer not large enough"))
 		return;
 
-	rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
-				sys_data->enter_event->event.type, regs, &rctx);
-	if (!rec)
+	buf = perf_trace_buf_prepare(size, sys_data->enter_event->event.type,
+				     regs, &rctx);
+	if (!buf)
 		return;
 
-	rec->nr = syscall_nr;
-	syscall_get_arguments(current, regs, 0, sys_data->nb_args,
-			       (unsigned long *)&rec->args);
+	/* The only error conditions in this helper are handled above. */
+	ftrace_syscall_enter_state(buf, size, NULL);
 
 	head = this_cpu_ptr(sys_data->enter_event->perf_events);
-	perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
+	perf_trace_buf_submit(buf, size, rctx, 0, 1, regs, head);
 }
 
 int perf_sysenter_enable(struct ftrace_event_call *call)
@@ -587,8 +584,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
 		return;
 
 	/* We can probably do that at build time */
-	size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
-	size -= sizeof(u32);
+	size = ftrace_syscall_exit_state_size();
 
 	/*
 	 * Impossible, but be paranoid with the future
@@ -688,3 +684,75 @@ static int syscall_exit_register(struct ftrace_event_call *event,
 	}
 	return 0;
 }
+
+/* ftrace_syscall_enter_state_size - returns the state size required.
+ *
+ * @nb_args: number of system call args expected.
+ *           a negative value implies the maximum allowed.
+ */
+size_t ftrace_syscall_enter_state_size(int nb_args)
+{
+	/* syscall_get_arguments only supports up to 6 arguments. */
+	int arg_count = (nb_args >= 0 ? nb_args : 6);
+	size_t size = (sizeof(unsigned long) * arg_count) +
+		      sizeof(struct syscall_trace_enter);
+	size = ALIGN(size + sizeof(u32), sizeof(u64));
+	size -= sizeof(u32);
+	return size;
+}
+EXPORT_SYMBOL_GPL(ftrace_syscall_enter_state_size);
+
+size_t ftrace_syscall_exit_state_size(void)
+{
+	return ALIGN(sizeof(struct syscall_trace_exit) + sizeof(u32),
+		     sizeof(u64)) - sizeof(u32);
+}
+EXPORT_SYMBOL_GPL(ftrace_syscall_exit_state_size);
+
+/* ftrace_syscall_enter_state - build state for filter matching
+ *
+ * @buf: buffer to populate with current task state for matching
+ * @available: size available for use in the buffer.
+ * @entry: optional pointer to the trace_entry member of the state.
+ *
+ * Returns 0 on success and non-zero otherwise.
+ * If @entry is NULL, it will be ignored.
+ */
+int ftrace_syscall_enter_state(u8 *buf, size_t available,
+			       struct trace_entry **entry)
+{
+	struct syscall_trace_enter *sys_enter;
+	struct syscall_metadata *sys_data;
+	int size;
+	int syscall_nr;
+	struct pt_regs *regs = task_pt_regs(current);
+
+	syscall_nr = syscall_get_nr(current, regs);
+	if (syscall_nr < 0)
+		return -EINVAL;
+
+	sys_data = syscall_nr_to_meta(syscall_nr);
+	if (!sys_data)
+		return -EINVAL;
+
+	/* Determine the actual size needed. */
+	size = sizeof(unsigned long) * sys_data->nb_args +
+	       sizeof(struct syscall_trace_enter);
+	size = ALIGN(size + sizeof(u32), sizeof(u64));
+	size -= sizeof(u32);
+
+	BUG_ON(size > available);
+	sys_enter = (struct syscall_trace_enter *)buf;
+
+	/* Populating the struct trace_sys_enter is left to the caller, but
+	 * a pointer is returned to encourage opacity.
+	 */
+	if (entry)
+		*entry = &sys_enter->ent;
+
+	sys_enter->nr = syscall_nr;
+	syscall_get_arguments(current, regs, 0, sys_data->nb_args,
+			      sys_enter->args);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ftrace_syscall_enter_state);
-- 
1.7.0.4


^ permalink raw reply related	[flat|nested] 406+ messages in thread

* [PATCH v3 03/13] seccomp_filters: new mode with configurable syscall filters
  2011-05-26 18:49                                                                                                       ` Will Drewry
  2011-06-01  3:10                                                                                                         ` [PATCH v3 01/13] tracing: split out filter initialization and clean up Will Drewry
  2011-06-01  3:10                                                                                                         ` [PATCH v3 02/13] tracing: split out syscall_trace_enter construction Will Drewry
@ 2011-06-01  3:10                                                                                                         ` Will Drewry
  2011-06-02 17:36                                                                                                           ` Paul E. McKenney
  2011-06-01  3:10                                                                                                         ` [PATCH v3 04/13] seccomp_filter: add process state reporting Will Drewry
                                                                                                                           ` (9 subsequent siblings)
  12 siblings, 1 reply; 406+ messages in thread
From: Will Drewry @ 2011-06-01  3:10 UTC (permalink / raw)
  To: linux-kernel
  Cc: kees.cook, torvalds, tglx, mingo, rostedt, jmorris, Will Drewry,
	Peter Zijlstra, Frederic Weisbecker, linux-security-module

This change adds a new seccomp mode which specifies the allowed system
calls dynamically.  When in the new mode (2), all system calls are
checked against process-defined filters - first by system call number,
then by a filter string.  If an entry exists for a given system call and
all filter predicates evaluate to true, then the task may proceed.
Otherwise, the task is killed.

Filter string parsing and evaluation is handled by the ftrace filter
engine.  Related patches tweak the perf filter parse and free helpers,
allowing the calls to be shared. Filters inherit their understanding of
types and arguments for each system call from the CONFIG_FTRACE_SYSCALLS
subsystem which already populates this information in the enter_event
(and exit_event) structures associated with syscall_metadata. If
CONFIG_FTRACE_SYSCALLS is not compiled in, only filter strings of "1"
will be allowed.

The net result is a process may have its system calls filtered using the
ftrace filter engine's inherent understanding of system calls.  The set
of filters is specified through the PR_SET_SECCOMP_FILTER argument in
prctl(). For example, a filterset for a process, like pdftotext, that
should only process read-only input could (roughly) look like:
  sprintf(rdonly, "flags == %u", O_RDONLY|O_LARGEFILE);
  prctl(PR_SET_SECCOMP_FILTER, __NR_open, rdonly);
  prctl(PR_SET_SECCOMP_FILTER, __NR__llseek, "1");
  prctl(PR_SET_SECCOMP_FILTER, __NR_brk, "1");
  prctl(PR_SET_SECCOMP_FILTER, __NR_close, "1");
  prctl(PR_SET_SECCOMP_FILTER, __NR_exit_group, "1");
  prctl(PR_SET_SECCOMP_FILTER, __NR_fstat64, "1");
  prctl(PR_SET_SECCOMP_FILTER, __NR_mmap2, "1");
  prctl(PR_SET_SECCOMP_FILTER, __NR_munmap, "1");
  prctl(PR_SET_SECCOMP_FILTER, __NR_read, "1");
  prctl(PR_SET_SECCOMP_FILTER, __NR_write, "(fd == 1 || fd == 2)");
  prctl(PR_SET_SECCOMP, 2);

Subsequent calls to PR_SET_SECCOMP_FILTER for the same system call will
be &&'d together to ensure that attack surface may only be reduced:
  prctl(PR_SET_SECCOMP_FILTER, __NR_write, "fd != 2");

With the earlier example, the active filter becomes:
  "(fd == 1 || fd == 2) && fd != 2"

The patch also adds PR_CLEAR_SECCOMP_FILTER and PR_GET_SECCOMP_FILTER.
The latter returns the current filter for a system call to userspace:

  prctl(PR_GET_SECCOMP_FILTER, __NR_write, buf, bufsize);

while the former clears any filters for a given system call, changing it
back to the default deny:

  prctl(PR_CLEAR_SECCOMP_FILTER, __NR_write);

v3: - always block execve calls (as per linus torvalds)
    - add __NR_seccomp_execve(_32) to seccomp-supporting arches
    - ensure compat tasks can't reach ftrace:syscalls
    - dropped new defines for seccomp modes.
    - two level array instead of hlists (sugg. by olof johansson)
    - added generic Kconfig entry that is not connected.
    - dropped internal seccomp.h
    - move prctl helpers to seccomp_filter
    - killed seccomp_t typedef (as per checkpatch)
v2: - changed to use the existing syscall number ABI.
    - prctl changes to minimize parsing in the kernel:
      prctl(PR_SET_SECCOMP, {0 | 1 | 2 }, { 0 | ON_EXEC });
      prctl(PR_SET_SECCOMP_FILTER, __NR_read, "fd == 5");
      prctl(PR_CLEAR_SECCOMP_FILTER, __NR_read);
      prctl(PR_GET_SECCOMP_FILTER, __NR_read, buf, bufsize);
    - defined PR_SECCOMP_MODE_STRICT and ..._FILTER
    - added flags
    - provide a default fail syscall_nr_to_meta in ftrace
    - provides fallback for unhooked system calls
    - use -ENOSYS and ERR_PTR(-ENOSYS) for stubbed functionality
    - added kernel/seccomp.h to share seccomp.c/seccomp_filter.c
    - moved to a hlist and 4 bit hash of linked lists
    - added support to operate without CONFIG_FTRACE_SYSCALLS
    - moved Kconfig support next to SECCOMP
    - made Kconfig entries dependent on EXPERIMENTAL
    - added macros to avoid ifdefs from kernel/fork.c
    - added compat task/filter matching
    - drop seccomp.h inclusion in sched.h and drop seccomp_t
    - added Filtering to "show" output
    - added on_exec state dup'ing when enabling after a fast-path accept.

Signed-off-by: Will Drewry <wad@chromium.org>
---
 include/linux/prctl.h   |    5 +
 include/linux/sched.h   |    2 +-
 include/linux/seccomp.h |   98 ++++++-
 include/trace/syscall.h |    7 +
 kernel/Makefile         |    3 +
 kernel/fork.c           |    3 +
 kernel/seccomp.c        |   38 ++-
 kernel/seccomp_filter.c |  784 +++++++++++++++++++++++++++++++++++++++++++++++
 kernel/sys.c            |   13 +-
 security/Kconfig        |   17 +
 10 files changed, 954 insertions(+), 16 deletions(-)
 create mode 100644 kernel/seccomp_filter.c

diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index a3baeb2..44723ce 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -64,6 +64,11 @@
 #define PR_GET_SECCOMP	21
 #define PR_SET_SECCOMP	22
 
+/* Get/set process seccomp filters */
+#define PR_GET_SECCOMP_FILTER	35
+#define PR_SET_SECCOMP_FILTER	36
+#define PR_CLEAR_SECCOMP_FILTER	37
+
 /* Get/set the capability bounding set (as per security/commoncap.c) */
 #define PR_CAPBSET_READ 23
 #define PR_CAPBSET_DROP 24
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 18d63ce..3f0bc8d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1374,7 +1374,7 @@ struct task_struct {
 	uid_t loginuid;
 	unsigned int sessionid;
 #endif
-	seccomp_t seccomp;
+	struct seccomp_struct seccomp;
 
 /* Thread group tracking */
    	u32 parent_exec_id;
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index 167c333..f4434ca 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -1,13 +1,33 @@
 #ifndef _LINUX_SECCOMP_H
 #define _LINUX_SECCOMP_H
 
+struct seq_file;
 
 #ifdef CONFIG_SECCOMP
 
+#include <linux/errno.h>
 #include <linux/thread_info.h>
+#include <linux/types.h>
 #include <asm/seccomp.h>
 
-typedef struct { int mode; } seccomp_t;
+struct seccomp_filters;
+/**
+ * struct seccomp_struct - the state of a seccomp'ed process
+ *
+ * @mode:
+ *     if this is 1, the process is under standard seccomp rules
+ *             is 2, the process is only allowed to make system calls where
+ *                   associated filters evaluate successfully.
+ * @filters: Metadata for filters if using CONFIG_SECCOMP_FILTER.
+ *           filters assignment/use should be RCU-protected and its contents
+ *           should never be modified when attached to a seccomp_struct.
+ */
+struct seccomp_struct {
+	uint16_t mode;
+#ifdef CONFIG_SECCOMP_FILTER
+	struct seccomp_filters *filters;
+#endif
+};
 
 extern void __secure_computing(int);
 static inline void secure_computing(int this_syscall)
@@ -16,15 +36,14 @@ static inline void secure_computing(int this_syscall)
 		__secure_computing(this_syscall);
 }
 
-extern long prctl_get_seccomp(void);
 extern long prctl_set_seccomp(unsigned long);
+extern long prctl_get_seccomp(void);
 
 #else /* CONFIG_SECCOMP */
 
 #include <linux/errno.h>
 
-typedef struct { } seccomp_t;
-
+struct seccomp_struct { };
 #define secure_computing(x) do { } while (0)
 
 static inline long prctl_get_seccomp(void)
@@ -32,11 +51,80 @@ static inline long prctl_get_seccomp(void)
 	return -EINVAL;
 }
 
-static inline long prctl_set_seccomp(unsigned long arg2)
+static inline long prctl_set_seccomp(unsigned long a2)
 {
 	return -EINVAL;
 }
 
 #endif /* CONFIG_SECCOMP */
 
+#ifdef CONFIG_SECCOMP_FILTER
+
+#define inherit_tsk_seccomp(_child, _orig) do { \
+	_child->seccomp.mode = _orig->seccomp.mode; \
+	_child->seccomp.filters = get_seccomp_filters(_orig->seccomp.filters); \
+	} while (0)
+#define put_tsk_seccomp(_tsk) put_seccomp_filters(_tsk->seccomp.filters)
+
+extern int seccomp_show_filters(struct seccomp_filters *filters,
+				struct seq_file *);
+extern long seccomp_set_filter(int, char *);
+extern long seccomp_clear_filter(int);
+extern long seccomp_get_filter(int, char *, unsigned long);
+
+extern long prctl_set_seccomp_filter(unsigned long, char __user *);
+extern long prctl_get_seccomp_filter(unsigned long, char __user *,
+				     unsigned long);
+extern long prctl_clear_seccomp_filter(unsigned long);
+
+extern struct seccomp_filters *get_seccomp_filters(struct seccomp_filters *);
+extern void put_seccomp_filters(struct seccomp_filters *);
+
+extern int seccomp_test_filters(int);
+extern void seccomp_filter_log_failure(int);
+
+#else  /* CONFIG_SECCOMP_FILTER */
+
+struct seccomp_filters { };
+#define inherit_tsk_seccomp(_child, _orig) do { } while (0)
+#define put_tsk_seccomp(_tsk) do { } while (0)
+
+static inline int seccomp_show_filters(struct seccomp_filters *filters,
+				       struct seq_file *m)
+{
+	return -ENOSYS;
+}
+
+static inline long seccomp_set_filter(int syscall_nr, char *filter)
+{
+	return -ENOSYS;
+}
+
+static inline long seccomp_clear_filter(int syscall_nr)
+{
+	return -ENOSYS;
+}
+
+static inline long seccomp_get_filter(int syscall_nr,
+				      char *buf, unsigned long available)
+{
+	return -ENOSYS;
+}
+
+static inline long prctl_set_seccomp_filter(unsigned long a2, char __user *a3)
+{
+	return -ENOSYS;
+}
+
+static inline long prctl_clear_seccomp_filter(unsigned long a2)
+{
+	return -ENOSYS;
+}
+
+static inline long prctl_get_seccomp_filter(unsigned long a2, char __user *a3,
+					    unsigned long a4)
+{
+	return -ENOSYS;
+}
+#endif  /* CONFIG_SECCOMP_FILTER */
 #endif /* _LINUX_SECCOMP_H */
diff --git a/include/trace/syscall.h b/include/trace/syscall.h
index 242ae04..e061ad0 100644
--- a/include/trace/syscall.h
+++ b/include/trace/syscall.h
@@ -35,6 +35,8 @@ struct syscall_metadata {
 extern unsigned long arch_syscall_addr(int nr);
 extern int init_syscall_trace(struct ftrace_event_call *call);
 
+extern struct syscall_metadata *syscall_nr_to_meta(int);
+
 extern int reg_event_syscall_enter(struct ftrace_event_call *call);
 extern void unreg_event_syscall_enter(struct ftrace_event_call *call);
 extern int reg_event_syscall_exit(struct ftrace_event_call *call);
@@ -49,6 +51,11 @@ enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags,
 				      struct trace_event *event);
 enum print_line_t print_syscall_exit(struct trace_iterator *iter, int flags,
 				     struct trace_event *event);
+#else
+static inline struct syscall_metadata *syscall_nr_to_meta(int nr)
+{
+	return NULL;
+}
 #endif
 
 #ifdef CONFIG_PERF_EVENTS
diff --git a/kernel/Makefile b/kernel/Makefile
index 85cbfb3..84e7dfb 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -81,6 +81,9 @@ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
 obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
+ifeq ($(CONFIG_SECCOMP_FILTER),y)
+obj-$(CONFIG_SECCOMP) += seccomp_filter.o
+endif
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
 obj-$(CONFIG_TREE_RCU) += rcutree.o
 obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
diff --git a/kernel/fork.c b/kernel/fork.c
index e7548de..6f835e0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -34,6 +34,7 @@
 #include <linux/cgroup.h>
 #include <linux/security.h>
 #include <linux/hugetlb.h>
+#include <linux/seccomp.h>
 #include <linux/swap.h>
 #include <linux/syscalls.h>
 #include <linux/jiffies.h>
@@ -169,6 +170,7 @@ void free_task(struct task_struct *tsk)
 	free_thread_info(tsk->stack);
 	rt_mutex_debug_task_free(tsk);
 	ftrace_graph_exit_task(tsk);
+	put_tsk_seccomp(tsk);
 	free_task_struct(tsk);
 }
 EXPORT_SYMBOL(free_task);
@@ -280,6 +282,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 	if (err)
 		goto out;
 
+	inherit_tsk_seccomp(tsk, orig);
 	setup_thread_stack(tsk, orig);
 	clear_user_return_notifier(tsk);
 	clear_tsk_need_resched(tsk);
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 57d4b13..0a942be 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -2,16 +2,20 @@
  * linux/kernel/seccomp.c
  *
  * Copyright 2004-2005  Andrea Arcangeli <andrea@cpushare.com>
+ * Copyright (C) 2011 The Chromium OS Authors <chromium-os-dev@chromium.org>
  *
  * This defines a simple but solid secure-computing mode.
  */
 
 #include <linux/seccomp.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/compat.h>
+#include <linux/unistd.h>
+#include <linux/ftrace_event.h>
 
+#define SECCOMP_MAX_FILTER_LENGTH MAX_FILTER_STR_VAL
 /* #define SECCOMP_DEBUG 1 */
-#define NR_SECCOMP_MODES 1
 
 /*
  * Secure computing mode 1 allows only read/write/exit/sigreturn.
@@ -32,10 +36,9 @@ static int mode1_syscalls_32[] = {
 
 void __secure_computing(int this_syscall)
 {
-	int mode = current->seccomp.mode;
 	int * syscall;
 
-	switch (mode) {
+	switch (current->seccomp.mode) {
 	case 1:
 		syscall = mode1_syscalls;
 #ifdef CONFIG_COMPAT
@@ -47,6 +50,17 @@ void __secure_computing(int this_syscall)
 				return;
 		} while (*++syscall);
 		break;
+#ifdef CONFIG_SECCOMP_FILTER
+	case 2:
+		if (this_syscall >= NR_syscalls || this_syscall < 0)
+			break;
+
+		if (!seccomp_test_filters(this_syscall))
+			return;
+
+		seccomp_filter_log_failure(this_syscall);
+		break;
+#endif
 	default:
 		BUG();
 	}
@@ -71,16 +85,22 @@ long prctl_set_seccomp(unsigned long seccomp_mode)
 	if (unlikely(current->seccomp.mode))
 		goto out;
 
-	ret = -EINVAL;
-	if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) {
-		current->seccomp.mode = seccomp_mode;
-		set_thread_flag(TIF_SECCOMP);
+	ret = 0;
+	switch (seccomp_mode) {
+	case 1:
 #ifdef TIF_NOTSC
 		disable_TSC();
 #endif
-		ret = 0;
+#ifdef CONFIG_SECCOMP_FILTER
+	case 2:
+#endif
+		current->seccomp.mode = seccomp_mode;
+		set_thread_flag(TIF_SECCOMP);
+		break;
+	default:
+		ret = -EINVAL;
 	}
 
- out:
+out:
 	return ret;
 }
diff --git a/kernel/seccomp_filter.c b/kernel/seccomp_filter.c
new file mode 100644
index 0000000..9782f25
--- /dev/null
+++ b/kernel/seccomp_filter.c
@@ -0,0 +1,784 @@
+/* filter engine-based seccomp system call filtering
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) 2011 The Chromium OS Authors <chromium-os-dev@chromium.org>
+ */
+
+#include <linux/compat.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/ftrace_event.h>
+#include <linux/seccomp.h>
+#include <linux/seq_file.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+#include <asm/syscall.h>
+#include <trace/syscall.h>
+
+
+#define SECCOMP_MAX_FILTER_LENGTH MAX_FILTER_STR_VAL
+
+#define SECCOMP_FILTER_ALLOW "1"
+#define SECCOMP_ACTION_DENY 0xffff
+#define SECCOMP_ACTION_ALLOW 0xfffe
+
+/**
+ * struct seccomp_filters - container for seccomp filterset
+ *
+ * @syscalls: array of 16-bit indices into @event_filters by syscall_nr
+ *            May also be SECCOMP_ACTION_DENY or SECCOMP_ACTION_ALLOW
+ * @event_filters: array of pointers to ftrace event objects
+ * @count: size of @event_filters
+ * @flags: anonymous struct to wrap filters-specific flags
+ * @usage: reference count to simplify use.
+ */
+struct seccomp_filters {
+	uint16_t syscalls[NR_syscalls];
+	struct event_filter **event_filters;
+	uint16_t count;
+	struct {
+		uint32_t compat:1,
+			 __reserved:31;
+	} flags;
+	atomic_t usage;
+};
+
+/* Handle ftrace symbol non-existence */
+#ifdef CONFIG_FTRACE_SYSCALLS
+#define create_event_filter(_ef_pptr, _event_type, _str) \
+	ftrace_parse_filter(_ef_pptr, _event_type, _str)
+#define get_filter_string(_ef) ftrace_get_filter_string(_ef)
+#define free_event_filter(_f) ftrace_free_filter(_f)
+
+#else
+
+#define create_event_filter(_ef_pptr, _event_type, _str) (-ENOSYS)
+#define get_filter_string(_ef) (NULL)
+#define free_event_filter(_f) do { } while (0)
+#endif
+
+/**
+ * seccomp_filters_new - allocates a new filters object
+ * @count: count to allocate for the event_filters array
+ *
+ * Returns ERR_PTR on error or an allocated object.
+ */
+static struct seccomp_filters *seccomp_filters_new(uint16_t count)
+{
+	struct seccomp_filters *f;
+
+	if (count >= SECCOMP_ACTION_ALLOW)
+		return ERR_PTR(-EINVAL);
+
+	f = kzalloc(sizeof(struct seccomp_filters), GFP_KERNEL);
+	if (!f)
+		return ERR_PTR(-ENOMEM);
+
+	/* Lazy SECCOMP_ACTION_DENY assignment. */
+	memset(f->syscalls, 0xff, sizeof(f->syscalls));
+	atomic_set(&f->usage, 1);
+
+	f->event_filters = NULL;
+	f->count = count;
+	if (!count)
+		return f;
+
+	f->event_filters = kzalloc(count * sizeof(struct event_filter *),
+				   GFP_KERNEL);
+	if (!f->event_filters) {
+		kfree(f);
+		f = ERR_PTR(-ENOMEM);
+	}
+	return f;
+}
+
+/**
+ * seccomp_filters_free - cleans up the filter list and frees the table
+ * @filters: NULL or live object to be completely destructed.
+ */
+static void seccomp_filters_free(struct seccomp_filters *filters)
+{
+	uint16_t count = 0;
+	if (!filters)
+		return;
+	while (count < filters->count) {
+		struct event_filter *f = filters->event_filters[count];
+		free_event_filter(f);
+		count++;
+	}
+	kfree(filters->event_filters);
+	kfree(filters);
+}
+
+static void __put_seccomp_filters(struct seccomp_filters *orig)
+{
+	WARN_ON(atomic_read(&orig->usage));
+	seccomp_filters_free(orig);
+}
+
+#define seccomp_filter_allow(_id) ((_id) == SECCOMP_ACTION_ALLOW)
+#define seccomp_filter_deny(_id) ((_id) == SECCOMP_ACTION_DENY)
+#define seccomp_filter_dynamic(_id) \
+	(!seccomp_filter_allow(_id) && !seccomp_filter_deny(_id))
+static inline uint16_t seccomp_filter_id(const struct seccomp_filters *f,
+					 int syscall_nr)
+{
+	if (!f)
+		return SECCOMP_ACTION_DENY;
+	return f->syscalls[syscall_nr];
+}
+
+static inline struct event_filter *seccomp_dynamic_filter(
+		const struct seccomp_filters *filters, uint16_t id)
+{
+	if (!seccomp_filter_dynamic(id))
+		return NULL;
+	return filters->event_filters[id];
+}
+
+static inline void set_seccomp_filter_id(struct seccomp_filters *filters,
+					 int syscall_nr, uint16_t id)
+{
+	filters->syscalls[syscall_nr] = id;
+}
+
+static inline void set_seccomp_filter(struct seccomp_filters *filters,
+				      int syscall_nr, uint16_t id,
+				      struct event_filter *dynamic_filter)
+{
+	filters->syscalls[syscall_nr] = id;
+	if (seccomp_filter_dynamic(id))
+		filters->event_filters[id] = dynamic_filter;
+}
+
+static struct event_filter *alloc_event_filter(int syscall_nr,
+					       const char *filter_string)
+{
+	struct syscall_metadata *data;
+	struct event_filter *filter = NULL;
+	int err;
+
+	data = syscall_nr_to_meta(syscall_nr);
+	/* Argument-based filtering only works on ftrace-hooked syscalls. */
+	err = -ENOSYS;
+	if (!data)
+		goto fail;
+	err = create_event_filter(&filter,
+				  data->enter_event->event.type,
+				  filter_string);
+	if (err)
+		goto fail;
+
+	return filter;
+fail:
+	kfree(filter);
+	return ERR_PTR(err);
+}
+
+/**
+ * seccomp_filters_copy - copies filters from src to dst.
+ *
+ * @dst: seccomp_filters to populate.
+ * @src: table to read from.
+ * @skip: specifies an entry, by system call, to skip.
+ *
+ * Returns non-zero on failure.
+ * Both the source and the destination should have no simultaneous
+ * writers, and dst should be exclusive to the caller.
+ * If @skip is < 0, it is ignored.
+ */
+static int seccomp_filters_copy(struct seccomp_filters *dst,
+				const struct seccomp_filters *src,
+				int skip)
+{
+	int id = 0, ret = 0, nr;
+	memcpy(&dst->flags, &src->flags, sizeof(src->flags));
+	memcpy(dst->syscalls, src->syscalls, sizeof(dst->syscalls));
+	/* Always walk the table: @skip must be denied in dst even when src
+	 * holds no dynamic filters (e.g. clearing a plain "1" entry). */
+	for (nr = 0; nr < NR_syscalls; ++nr) {
+		struct event_filter *filter;
+		const char *str;
+		uint16_t src_id = seccomp_filter_id(src, nr);
+		if (nr == skip) {
+			set_seccomp_filter(dst, nr, SECCOMP_ACTION_DENY,
+					   NULL);
+			continue;
+		}
+		if (!seccomp_filter_dynamic(src_id))
+			continue;
+		if (id >= dst->count) {
+			ret = -EINVAL;
+			goto done;
+		}
+		str = get_filter_string(seccomp_dynamic_filter(src, src_id));
+		filter = alloc_event_filter(nr, str);
+		if (IS_ERR(filter)) {
+			ret = PTR_ERR(filter);
+			goto done;
+		}
+		set_seccomp_filter(dst, nr, id, filter);
+		id++;
+	}
+
+done:
+	return ret;
+}
+
+/**
+ * seccomp_extend_filter - appends more text to a syscall_nr's filter
+ * @filters: unattached filter object to operate on
+ * @syscall_nr: syscall number to update filters for
+ * @filter_string: string to append to the existing filter
+ *
+ * The new string will be &&'d to the original filter string to ensure that it
+ * always matches the existing predicates or less:
+ *   (old_filter) && @filter_string
+ * Returns 0 on success and a negative errno on failure; the filter for
+ * @syscall_nr in @filters is replaced in place.
+ */
+static int seccomp_extend_filter(struct seccomp_filters *filters,
+				 int syscall_nr, char *filter_string)
+{
+	struct event_filter *filter;
+	uint16_t id = seccomp_filter_id(filters, syscall_nr);
+	char *merged = NULL;
+	int ret = -EINVAL, expected;
+
+	/* No extending with a "1". */
+	if (!strcmp(SECCOMP_FILTER_ALLOW, filter_string))
+		goto out;
+
+	filter = seccomp_dynamic_filter(filters, id);
+	ret = -ENOENT;
+	if (!filter)
+		goto out;
+
+	merged = kzalloc(SECCOMP_MAX_FILTER_LENGTH + 1, GFP_KERNEL);
+	ret = -ENOMEM;
+	if (!merged)
+		goto out;
+
+	expected = snprintf(merged, SECCOMP_MAX_FILTER_LENGTH, "(%s) && %s",
+			    get_filter_string(filter), filter_string);
+	ret = -E2BIG;
+	if (expected >= SECCOMP_MAX_FILTER_LENGTH || expected < 0)
+		goto out;
+
+	/* Free the old filter */
+	free_event_filter(filter);
+	set_seccomp_filter(filters, syscall_nr, id, NULL);
+
+	/* Replace it */
+	filter = alloc_event_filter(syscall_nr, merged);
+	if (IS_ERR(filter)) {
+		ret = PTR_ERR(filter);
+		goto out;
+	}
+	set_seccomp_filter(filters, syscall_nr, id, filter);
+	ret = 0;
+
+out:
+	kfree(merged);
+	return ret;
+}
+
+/**
+ * seccomp_add_filter - adds a filter for an unfiltered syscall
+ * @filters: filters object to add a filter/action to
+ * @syscall_nr: system call number to add a filter for
+ * @filter_string: the filter string to apply
+ *
+ * Returns 0 on success and non-zero otherwise.
+ */
+static int seccomp_add_filter(struct seccomp_filters *filters, int syscall_nr,
+			      char *filter_string)
+{
+	struct event_filter *filter;
+	int ret = 0;
+
+	if (!strcmp(SECCOMP_FILTER_ALLOW, filter_string)) {
+		set_seccomp_filter(filters, syscall_nr,
+				   SECCOMP_ACTION_ALLOW, NULL);
+		goto out;
+	}
+
+	filter = alloc_event_filter(syscall_nr, filter_string);
+	if (IS_ERR(filter)) {
+		ret = PTR_ERR(filter);
+		goto out;
+	}
+	/* Always add to the last slot available since additions are
+	 * only done one at a time.
+	 */
+	set_seccomp_filter(filters, syscall_nr, filters->count - 1, filter);
+out:
+	return ret;
+}
+
+/* Wrap optional ftrace syscall support. Returns 1 on match or 0 otherwise. */
+static int filter_match_current(struct event_filter *event_filter)
+{
+	int err = 0;
+#ifdef CONFIG_FTRACE_SYSCALLS
+	uint8_t syscall_state[64];
+
+	memset(syscall_state, 0, sizeof(syscall_state));
+
+	/* The generic tracing entry can remain zeroed. */
+	err = ftrace_syscall_enter_state(syscall_state, sizeof(syscall_state),
+					 NULL);
+	if (err)
+		return 0;
+
+	err = filter_match_preds(event_filter, syscall_state);
+#endif
+	return err;
+}
+
+static const char *syscall_nr_to_name(int syscall)
+{
+	const char *syscall_name = "unknown";
+	struct syscall_metadata *data = syscall_nr_to_meta(syscall);
+	if (data)
+		syscall_name = data->name;
+	return syscall_name;
+}
+
+static void filters_set_compat(struct seccomp_filters *filters)
+{
+#ifdef CONFIG_COMPAT
+	if (is_compat_task())
+		filters->flags.compat = 1;
+#endif
+}
+
+static inline int filters_compat_mismatch(struct seccomp_filters *filters)
+{
+	int ret = 0;
+	if (!filters)
+		return 0;
+#ifdef CONFIG_COMPAT
+	if (!!(is_compat_task()) != filters->flags.compat)
+		ret = 1;
+#endif
+	return ret;
+}
+
+static inline int syscall_is_execve(int syscall)
+{
+	int nr = __NR_execve;
+#ifdef CONFIG_COMPAT
+	if (is_compat_task())
+		nr = __NR_seccomp_execve_32;
+#endif
+	return syscall == nr;
+}
+
+#ifndef KSTK_EIP
+#define KSTK_EIP(x) 0L
+#endif
+
+void seccomp_filter_log_failure(int syscall)
+{
+	pr_info("%s[%d]: system call %d (%s) blocked at 0x%lx\n",
+		current->comm, task_pid_nr(current), syscall,
+		syscall_nr_to_name(syscall), KSTK_EIP(current));
+}
+
+/* put_seccomp_filters - drops a ref on @orig; ignores NULL and ERR_PTR. */
+void put_seccomp_filters(struct seccomp_filters *orig)
+{
+	if (IS_ERR_OR_NULL(orig))
+		return;
+
+	if (atomic_dec_and_test(&orig->usage))
+		__put_seccomp_filters(orig);
+}
+
+/* get_seccomp_filters - increments the reference count of @orig */
+struct seccomp_filters *get_seccomp_filters(struct seccomp_filters *orig)
+{
+	if (!orig)
+		return NULL;
+	atomic_inc(&orig->usage);
+	return orig;
+}
+
+/**
+ * seccomp_test_filters - tests 'current' against the given syscall
+ * @syscall: number of the system call to test
+ *
+ * Uses the filters attached to current.
+ * Returns 0 on ok and non-zero on error/failure.
+ */
+int seccomp_test_filters(int syscall)
+{
+	uint16_t id;
+	struct event_filter *filter;
+	struct seccomp_filters *filters;
+	int ret = -EACCES;
+
+	rcu_read_lock();
+	filters = get_seccomp_filters(current->seccomp.filters);
+	rcu_read_unlock();
+
+	if (!filters)
+		goto out;
+
+	if (filters_compat_mismatch(filters)) {
+		pr_info("%s[%d]: seccomp_filter compat() mismatch.\n",
+			current->comm, task_pid_nr(current));
+		goto out;
+	}
+
+	/* execve is never allowed. */
+	if (syscall_is_execve(syscall))
+		goto out;
+
+	ret = 0;
+	id = seccomp_filter_id(filters, syscall);
+	if (seccomp_filter_allow(id))
+		goto out;
+
+	ret = -EACCES;
+	if (!seccomp_filter_dynamic(id))
+		goto out;
+
+	filter = seccomp_dynamic_filter(filters, id);
+	if (filter && filter_match_current(filter))
+		ret = 0;
+out:
+	put_seccomp_filters(filters);
+	return ret;
+}
+
+/**
+ * seccomp_show_filters - prints the current filter state to a seq_file
+ * @filters: properly get()'d filters object
+ * @m: the prepared seq_file to receive the data
+ *
+ * Returns 0 on a successful write.
+ */
+int seccomp_show_filters(struct seccomp_filters *filters, struct seq_file *m)
+{
+	int syscall;
+	seq_printf(m, "Mode: %d\n", current->seccomp.mode);
+	if (!filters)
+		goto out;
+
+	for (syscall = 0; syscall < NR_syscalls; ++syscall) {
+		uint16_t id = seccomp_filter_id(filters, syscall);
+		const char *filter_string = SECCOMP_FILTER_ALLOW;
+		if (seccomp_filter_deny(id))
+			continue;
+		seq_printf(m, "%d (%s): ",
+			      syscall,
+			      syscall_nr_to_name(syscall));
+		if (seccomp_filter_dynamic(id))
+			filter_string = get_filter_string(
+					  seccomp_dynamic_filter(filters, id));
+		seq_printf(m, "%s\n", filter_string);
+	}
+out:
+	return 0;
+}
+EXPORT_SYMBOL_GPL(seccomp_show_filters);
+
+/**
+ * seccomp_get_filter - copies the filter_string into "buf"
+ * @syscall_nr: system call number to look up
+ * @buf: destination buffer
+ * @bufsize: available space in the buffer.
+ *
+ * Context: User context only. This function may sleep on allocation and
+ *          operates on current. current must be attempting a system call
+ *          when this is called.
+ *
+ * Looks up the filter for the given system call number on current.  If found,
+ * the string length of the NUL-terminated buffer is returned and < 0 is
+ * returned on error. The NUL byte is not included in the length.
+ */
+long seccomp_get_filter(int syscall_nr, char *buf, unsigned long bufsize)
+{
+	struct seccomp_filters *filters;
+	struct event_filter *filter;
+	long ret = -EINVAL;
+	uint16_t id;
+
+	if (bufsize > SECCOMP_MAX_FILTER_LENGTH)
+		bufsize = SECCOMP_MAX_FILTER_LENGTH;
+
+	rcu_read_lock();
+	filters = get_seccomp_filters(current->seccomp.filters);
+	rcu_read_unlock();
+
+	if (!filters)
+		goto out;
+
+	ret = -ENOENT;
+	id = seccomp_filter_id(filters, syscall_nr);
+	if (seccomp_filter_deny(id))
+		goto out;
+
+	if (seccomp_filter_allow(id)) {
+		ret = strlcpy(buf, SECCOMP_FILTER_ALLOW, bufsize);
+		goto copied;
+	}
+
+	filter = seccomp_dynamic_filter(filters, id);
+	if (!filter)
+		goto out;
+	ret = strlcpy(buf, get_filter_string(filter), bufsize);
+
+copied:
+	if (ret >= bufsize) {
+		ret = -ENOSPC;
+		goto out;
+	}
+	/* Zero out any remaining buffer, just in case. */
+	memset(buf + ret, 0, bufsize - ret);
+out:
+	put_seccomp_filters(filters);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(seccomp_get_filter);
+
+/**
+ * seccomp_clear_filter: clears the seccomp filter for a syscall.
+ * @syscall_nr: the system call number to clear filters for.
+ *
+ * Context: User context only. This function may sleep on allocation and
+ *          operates on current. current must be attempting a system call
+ *          when this is called.
+ *
+ * Returns 0 on success.
+ */
+long seccomp_clear_filter(int syscall_nr)
+{
+	struct seccomp_filters *filters = NULL, *orig_filters;
+	uint16_t id;
+	int ret = -EINVAL;
+
+	rcu_read_lock();
+	orig_filters = get_seccomp_filters(current->seccomp.filters);
+	rcu_read_unlock();
+
+	if (!orig_filters)
+		goto out;
+
+	if (filters_compat_mismatch(orig_filters))
+		goto out;
+
+	id = seccomp_filter_id(orig_filters, syscall_nr);
+	if (seccomp_filter_deny(id))
+		goto out;
+
+	/* Create a new filters object for the task */
+	if (seccomp_filter_dynamic(id))
+		filters = seccomp_filters_new(orig_filters->count - 1);
+	else
+		filters = seccomp_filters_new(orig_filters->count);
+
+	if (IS_ERR(filters)) {
+		ret = PTR_ERR(filters);
+		goto out;
+	}
+
+	/* Copy, but drop the requested entry. */
+	ret = seccomp_filters_copy(filters, orig_filters, syscall_nr);
+	if (ret)
+		goto out;
+	get_seccomp_filters(filters);  /* simplify the out: path */
+
+	rcu_assign_pointer(current->seccomp.filters, filters);
+	synchronize_rcu();
+	put_seccomp_filters(orig_filters);  /* for the task */
+out:
+	put_seccomp_filters(orig_filters);  /* for the get */
+	put_seccomp_filters(filters);  /* for the extra get */
+	return ret;
+}
+EXPORT_SYMBOL_GPL(seccomp_clear_filter);
+
+/**
+ * seccomp_set_filter: - Adds/extends a seccomp filter for a syscall.
+ * @syscall_nr: system call number to apply the filter to.
+ * @filter: ftrace filter string to apply.
+ *
+ * Context: User context only. This function may sleep on allocation and
+ *          operates on current. current must be attempting a system call
+ *          when this is called.
+ *
+ * New filters may be added for system calls when the current task is
+ * not in a secure computing mode (seccomp).  Otherwise, existing filters may
+ * be extended.
+ *
+ * Returns 0 on success or an errno on failure.
+ */
+long seccomp_set_filter(int syscall_nr, char *filter)
+{
+	struct seccomp_filters *filters = NULL, *orig_filters = NULL;
+	uint16_t id;
+	long ret = -EINVAL;
+	uint16_t filters_needed;
+
+	if (!filter)
+		goto out;
+
+	filter = strstrip(filter);
+	/* Disallow empty strings. */
+	if (filter[0] == 0)
+		goto out;
+
+	rcu_read_lock();
+	orig_filters = get_seccomp_filters(current->seccomp.filters);
+	rcu_read_unlock();
+
+	/* After the first call, compatibility mode is selected permanently. */
+	ret = -EACCES;
+	if (filters_compat_mismatch(orig_filters))
+		goto out;
+
+	filters_needed = orig_filters ? orig_filters->count : 0;
+	id = seccomp_filter_id(orig_filters, syscall_nr);
+	if (seccomp_filter_deny(id)) {
+		/* Don't allow DENYs to be changed when in a seccomp mode */
+		ret = -EACCES;
+		if (current->seccomp.mode)
+			goto out;
+		filters_needed++;
+	}
+
+	filters = seccomp_filters_new(filters_needed);
+	if (IS_ERR(filters)) {
+		ret = PTR_ERR(filters);
+		goto out;
+	}
+
+	filters_set_compat(filters);
+	if (orig_filters) {
+		ret = seccomp_filters_copy(filters, orig_filters, -1);
+		if (ret)
+			goto out;
+	}
+
+	if (seccomp_filter_deny(id))
+		ret = seccomp_add_filter(filters, syscall_nr, filter);
+	else
+		ret = seccomp_extend_filter(filters, syscall_nr, filter);
+	if (ret)
+		goto out;
+	get_seccomp_filters(filters);  /* simplify the error paths */
+
+	rcu_assign_pointer(current->seccomp.filters, filters);
+	synchronize_rcu();
+	put_seccomp_filters(orig_filters);  /* for the task */
+out:
+	put_seccomp_filters(orig_filters);  /* for the get */
+	put_seccomp_filters(filters);  /* for get or task, on err */
+	return ret;
+}
+EXPORT_SYMBOL_GPL(seccomp_set_filter);
+
+long prctl_set_seccomp_filter(unsigned long syscall_nr,
+			      char __user *user_filter)
+{
+	int nr;
+	long ret;
+	char *filter = NULL;
+
+	ret = -EINVAL;
+	if (syscall_nr >= NR_syscalls)
+		goto out;
+
+	ret = -EFAULT;
+	if (!user_filter)
+		goto out;
+
+	filter = kzalloc(SECCOMP_MAX_FILTER_LENGTH + 1, GFP_KERNEL);
+	ret = -ENOMEM;
+	if (!filter)
+		goto out;
+
+	ret = -EFAULT;
+	if (strncpy_from_user(filter, user_filter,
+			      SECCOMP_MAX_FILTER_LENGTH - 1) < 0)
+		goto out;
+
+	nr = (int) syscall_nr;
+	ret = seccomp_set_filter(nr, filter);
+
+out:
+	kfree(filter);
+	return ret;
+}
+
+long prctl_clear_seccomp_filter(unsigned long syscall_nr)
+{
+	int nr = -1;
+	long ret;
+
+	ret = -EINVAL;
+	if (syscall_nr >= NR_syscalls)
+		goto out;
+
+	nr = (int) syscall_nr;
+	ret = seccomp_clear_filter(nr);
+
+out:
+	return ret;
+}
+
+long prctl_get_seccomp_filter(unsigned long syscall_nr, char __user *dst,
+			      unsigned long available)
+{
+	int ret, nr;
+	unsigned long copied;
+	char *buf = NULL;
+	ret = -EINVAL;
+	if (!available)
+		goto out;
+	/* Ignore extra buffer space. */
+	if (available > SECCOMP_MAX_FILTER_LENGTH)
+		available = SECCOMP_MAX_FILTER_LENGTH;
+
+	ret = -EINVAL;
+	if (syscall_nr >= NR_syscalls)
+		goto out;
+	nr = (int) syscall_nr;
+
+	ret = -ENOMEM;
+	buf = kmalloc(available, GFP_KERNEL);
+	if (!buf)
+		goto out;
+
+	ret = seccomp_get_filter(nr, buf, available);
+	if (ret < 0)
+		goto out;
+
+	/* Include the NUL byte in the copy. */
+	copied = copy_to_user(dst, buf, ret + 1);
+	ret = -ENOSPC;
+	if (copied)
+		goto out;
+	ret = 0;
+out:
+	kfree(buf);
+	return ret;
+}
diff --git a/kernel/sys.c b/kernel/sys.c
index af468ed..ed60d06 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1698,13 +1698,24 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 		case PR_SET_ENDIAN:
 			error = SET_ENDIAN(me, arg2);
 			break;
-
 		case PR_GET_SECCOMP:
 			error = prctl_get_seccomp();
 			break;
 		case PR_SET_SECCOMP:
 			error = prctl_set_seccomp(arg2);
 			break;
+		case PR_SET_SECCOMP_FILTER:
+			error = prctl_set_seccomp_filter(arg2,
+							 (char __user *) arg3);
+			break;
+		case PR_CLEAR_SECCOMP_FILTER:
+			error = prctl_clear_seccomp_filter(arg2);
+			break;
+		case PR_GET_SECCOMP_FILTER:
+			error = prctl_get_seccomp_filter(arg2,
+							 (char __user *) arg3,
+							 arg4);
+			break;
 		case PR_GET_TSC:
 			error = GET_TSC_CTL(arg2);
 			break;
diff --git a/security/Kconfig b/security/Kconfig
index 95accd4..c76adf2 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -2,6 +2,10 @@
 # Security configuration
 #
 
+# Make seccomp filter Kconfig switch below available
+config HAVE_SECCOMP_FILTER
+       bool
+
 menu "Security options"
 
 config KEYS
@@ -82,6 +86,19 @@ config SECURITY_DMESG_RESTRICT
 
 	  If you are unsure how to answer this question, answer N.
 
+config SECCOMP_FILTER
+	bool "Enable seccomp-based system call filtering"
+	select SECCOMP
+	depends on HAVE_SECCOMP_FILTER && EXPERIMENTAL
+	help
+	  This kernel feature expands CONFIG_SECCOMP to allow computing
+	  in environments with reduced kernel access dictated by the
+	  application itself through prctl calls.  If
+	  CONFIG_FTRACE_SYSCALLS is available, then system call
+	  argument-based filtering predicates may be used.
+
+	  See Documentation/prctl/seccomp_filter.txt for more detail.
+
 config SECURITY
 	bool "Enable different security models"
 	depends on SYSFS
-- 
1.7.0.4


^ permalink raw reply related	[flat|nested] 406+ messages in thread

* [PATCH v3 04/13] seccomp_filter: add process state reporting
  2011-05-26 18:49                                                                                                       ` Will Drewry
                                                                                                                           ` (2 preceding siblings ...)
  2011-06-01  3:10                                                                                                         ` [PATCH v3 03/13] seccomp_filters: new mode with configurable syscall filters Will Drewry
@ 2011-06-01  3:10                                                                                                         ` Will Drewry
  2011-06-01  3:10                                                                                                         ` [PATCH v3 05/13] seccomp_filter: Document what seccomp_filter is and how it works Will Drewry
                                                                                                                           ` (8 subsequent siblings)
  12 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-06-01  3:10 UTC (permalink / raw)
  To: linux-kernel
  Cc: kees.cook, torvalds, tglx, mingo, rostedt, jmorris, Will Drewry

Adds seccomp and seccomp_filter status reporting to proc.
/proc/<pid>/seccomp_filter provides the current seccomp mode
and the list of allowed or dynamically filtered system calls.

v3: changed to using filters directly.
v2: removed status entry, added seccomp file.
    (requested by kosaki.motohiro@jp.fujitsu.com)
    allowed S_IRUGO reading of entries
    (requested by viro@zeniv.linux.org.uk)
    added flags
    got rid of the seccomp_t type
    dropped seccomp file

Signed-off-by: Will Drewry <wad@chromium.org>
---
 fs/proc/base.c |   25 +++++++++++++++++++++++++
 1 files changed, 25 insertions(+), 0 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index dfa5327..01473fe 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -73,6 +73,7 @@
 #include <linux/security.h>
 #include <linux/ptrace.h>
 #include <linux/tracehook.h>
+#include <linux/seccomp.h>
 #include <linux/cgroup.h>
 #include <linux/cpuset.h>
 #include <linux/audit.h>
@@ -579,6 +580,24 @@ static int proc_pid_syscall(struct task_struct *task, char *buffer)
 }
 #endif /* CONFIG_HAVE_ARCH_TRACEHOOK */
 
+/*
+ * Print out the current seccomp filter set for the task.
+ */
+#ifdef CONFIG_SECCOMP_FILTER
+int proc_pid_seccomp_filter_show(struct seq_file *m, struct pid_namespace *ns,
+				 struct pid *pid, struct task_struct *task)
+{
+	struct seccomp_filters *filters;
+
+	rcu_read_lock();
+	filters = get_seccomp_filters(task->seccomp.filters);
+	rcu_read_unlock();
+	seccomp_show_filters(filters, m);
+	put_seccomp_filters(filters);
+	return 0;
+}
+#endif /* CONFIG_SECCOMP_FILTER */
+
 /************************************************************************/
 /*                       Here the fs part begins                        */
 /************************************************************************/
@@ -2838,6 +2857,9 @@ static const struct pid_entry tgid_base_stuff[] = {
 #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
 	INF("syscall",    S_IRUGO, proc_pid_syscall),
 #endif
+#ifdef CONFIG_SECCOMP_FILTER
+	ONE("seccomp_filter",     S_IRUGO, proc_pid_seccomp_filter_show),
+#endif
 	INF("cmdline",    S_IRUGO, proc_pid_cmdline),
 	ONE("stat",       S_IRUGO, proc_tgid_stat),
 	ONE("statm",      S_IRUGO, proc_pid_statm),
@@ -3180,6 +3202,9 @@ static const struct pid_entry tid_base_stuff[] = {
 #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
 	INF("syscall",   S_IRUGO, proc_pid_syscall),
 #endif
+#ifdef CONFIG_SECCOMP_FILTER
+	ONE("seccomp_filter",     S_IRUGO, proc_pid_seccomp_filter_show),
+#endif
 	INF("cmdline",   S_IRUGO, proc_pid_cmdline),
 	ONE("stat",      S_IRUGO, proc_tid_stat),
 	ONE("statm",     S_IRUGO, proc_pid_statm),
-- 
1.7.0.4


^ permalink raw reply related	[flat|nested] 406+ messages in thread

* [PATCH v3 05/13] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-05-26 18:49                                                                                                       ` Will Drewry
                                                                                                                           ` (3 preceding siblings ...)
  2011-06-01  3:10                                                                                                         ` [PATCH v3 04/13] seccomp_filter: add process state reporting Will Drewry
@ 2011-06-01  3:10                                                                                                         ` Will Drewry
  2011-06-01 21:23                                                                                                           ` Kees Cook
  2011-06-01  3:10                                                                                                         ` [PATCH v3 06/13] x86: add HAVE_SECCOMP_FILTER and seccomp_execve Will Drewry
                                                                                                                           ` (7 subsequent siblings)
  12 siblings, 1 reply; 406+ messages in thread
From: Will Drewry @ 2011-06-01  3:10 UTC (permalink / raw)
  To: linux-kernel
  Cc: kees.cook, torvalds, tglx, mingo, rostedt, jmorris, Will Drewry,
	Randy Dunlap, linux-doc

 Adds a text file covering what CONFIG_SECCOMP_FILTER is, how it is
 implemented presently, and what it may be used for.  In addition,
 the limitations and caveats of the proposed implementation are
 included.

 v3: a little more cleanup
 v2: moved to prctl/
     updated for the v2 syntax.
     adds a note about compat behavior

 Signed-off-by: Will Drewry <wad@chromium.org>
---
 Documentation/prctl/seccomp_filter.txt |  145 ++++++++++++++++++++++++++++++++
 1 files changed, 145 insertions(+), 0 deletions(-)
 create mode 100644 Documentation/prctl/seccomp_filter.txt

diff --git a/Documentation/prctl/seccomp_filter.txt b/Documentation/prctl/seccomp_filter.txt
new file mode 100644
index 0000000..27ac5af
--- /dev/null
+++ b/Documentation/prctl/seccomp_filter.txt
@@ -0,0 +1,145 @@
+		Seccomp filtering
+		=================
+
+Introduction
+------------
+
+A large number of system calls are exposed to every userland process
+with many of them going unused for the entire lifetime of the process.
+As system calls change and mature, bugs are found and eradicated.  A
+certain subset of userland applications benefit by having a reduced set
+of available system calls.  The reduced set reduces the total kernel
+surface exposed to the application.  System call filtering is meant for
+use with those applications.
+
+The implementation currently leverages both the existing seccomp
+infrastructure and the kernel tracing infrastructure.  By centralizing
+hooks for attack surface reduction in seccomp, it is possible to assure
+attention to security that is less relevant in normal ftrace scenarios,
+such as time-of-check, time-of-use attacks.  However, ftrace provides a
+rich, human-friendly environment for interfacing with system call
+specific arguments.  (As such, this requires FTRACE_SYSCALLS for any
+introspective filtering support.)
+
+
+What it isn't
+-------------
+
+System call filtering isn't a sandbox.  It provides a clearly defined
+mechanism for minimizing the exposed kernel surface.  Beyond that,
+policy for logical behavior and information flow should be managed with
+a combination of other system hardening techniques and, potentially, an
+LSM of your choosing.  Expressive, dynamic filters based on the ftrace
+filter engine provide further options down this path (avoiding
+pathological sizes or selecting which of the multiplexed system calls in
+socketcall() is allowed, for instance) which could be construed,
+incorrectly, as a more complete sandboxing solution.
+
+
+Usage
+-----
+
+An additional seccomp mode is exposed through mode '2'.
+This mode depends on CONFIG_SECCOMP_FILTER.  By default, it provides
+only the most trivial filter support: "1" or cleared.  However, if
+CONFIG_FTRACE_SYSCALLS is enabled, the ftrace filter engine may be used
+for more expressive filters.
+
+A collection of filters may be supplied via prctl, and the current set
+of filters is exposed in /proc/<pid>/seccomp_filter.
+
+Interacting with seccomp filters can be done through three new prctl calls
+and one existing one.
+
+PR_SET_SECCOMP:
+	A pre-existing option for enabling strict seccomp mode (1) or
+	filtering seccomp (2).
+
+	Usage:
+		prctl(PR_SET_SECCOMP, 1);  /* strict */
+		prctl(PR_SET_SECCOMP, 2);  /* filters */
+
+PR_SET_SECCOMP_FILTER:
+	Allows the specification of a new filter for a given system
+	call, by number, and filter string. If CONFIG_FTRACE_SYSCALLS is
+	supported, the filter string may be any valid value for the
+	given system call.  If it is not supported, the filter string
+	may only be "1".
+
+	All calls to PR_SET_SECCOMP_FILTER for a given system
+	call will append the supplied string to any existing filters.
+	Filter construction looks as follows:
+		(Nothing) + "fd == 1 || fd == 2" => fd == 1 || fd == 2
+		... + "fd != 2" => (fd == 1 || fd == 2) && fd != 2
+		... + "size < 100" =>
+			((fd == 1 || fd == 2) && fd != 2) && size < 100
+	If there is no filter and the seccomp mode has already
+	transitioned to filtering, additions cannot be made.  Filters
+	may only be added that reduce the available kernel surface.
+
+	Usage (per the construction example above):
+		prctl(PR_SET_SECCOMP_FILTER, __NR_write, "fd == 1 || fd == 2");
+		prctl(PR_SET_SECCOMP_FILTER, __NR_write, "fd != 2");
+		prctl(PR_SET_SECCOMP_FILTER, __NR_write, "size < 100");
+
+PR_CLEAR_SECCOMP_FILTER:
+	Removes all filter entries for a given system call number.  When
+	called prior to entering seccomp filtering mode, it allows for
+	new filters to be applied to the same system call.  After
+	transition, however, it completely drops access to the call.
+
+	Usage:
+		prctl(PR_CLEAR_SECCOMP_FILTER, __NR_open);
+
+PR_GET_SECCOMP_FILTER: Returns the aggregated filter string for a system
+	call into a user-supplied buffer of a given length.
+
+	Usage:
+		prctl(PR_GET_SECCOMP_FILTER, __NR_write, buf,
+		      sizeof(buf));
+
+All of the above calls return 0 on success and non-zero on error.
+
+
+Example
+-------
+
+Assume a process would like to cleanly read and write to stdin/out/err
+as well as access its filters after seccomp enforcement begins.  This
+may be done as follows:
+
+  prctl(PR_SET_SECCOMP_FILTER, __NR_read, "fd == 0");
+  prctl(PR_SET_SECCOMP_FILTER, __NR_write, "fd == 1 || fd == 2");
+  prctl(PR_SET_SECCOMP_FILTER, __NR_exit, "1");
+  prctl(PR_SET_SECCOMP_FILTER, __NR_prctl, "1");
+
+  prctl(PR_SET_SECCOMP, PR_SECCOMP_MODE_FILTER, 0);
+
+  /* Do stuff with fdset . . .*/
+
+  /* Drop read access and keep only write access to fd 1. */
+  prctl(PR_CLEAR_SECCOMP_FILTER, __NR_read);
+  prctl(PR_SET_SECCOMP_FILTER, __NR_write, "fd != 2");
+
+  /* Perform any final processing . . . */
+  syscall(__NR_exit, 0);
+
+
+Caveats
+-------
+
+- The filter event subsystem comes from CONFIG_TRACE_EVENTS, and the
+system call events come from CONFIG_FTRACE_SYSCALLS.  However, if
+neither is available, a filter string of "1" will be honored, and it may
+be removed using PR_CLEAR_SECCOMP_FILTER.  With ftrace filtering,
+calling PR_SET_SECCOMP_FILTER with a filter of "0" would have a similar
+effect but would not be consistent on a kernel without the support.
+
+- Some platforms support a 32-bit userspace with 64-bit kernels.  In
+these cases (CONFIG_COMPAT), system call numbers may not match across
+64-bit and 32-bit system calls.  When the first PR_SET_SECCOMP_FILTER
+is called, the in-memory filters state is annotated with whether the
+call has been made via the compat interface.  All subsequent calls will
+be checked for compat call mismatch.  In the long run, it may make sense
+to store compat and non-compat filters separately, but that is not
+supported at present.
-- 
1.7.0.4


^ permalink raw reply related	[flat|nested] 406+ messages in thread

* [PATCH v3 06/13] x86: add HAVE_SECCOMP_FILTER and seccomp_execve
  2011-05-26 18:49                                                                                                       ` Will Drewry
                                                                                                                           ` (4 preceding siblings ...)
  2011-06-01  3:10                                                                                                         ` [PATCH v3 05/13] seccomp_filter: Document what seccomp_filter is and how it works Will Drewry
@ 2011-06-01  3:10                                                                                                         ` Will Drewry
  2011-06-01  3:10                                                                                                           ` Will Drewry
                                                                                                                           ` (6 subsequent siblings)
  12 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-06-01  3:10 UTC (permalink / raw)
  To: linux-kernel
  Cc: kees.cook, torvalds, tglx, mingo, rostedt, jmorris, Will Drewry,
	Ingo Molnar, H. Peter Anvin, x86

Adds support to the x86 architecture by providing a compatibility
mode wrapper for sys_execve's number and selecting HAVE_SECCOMP_FILTER

Signed-off-by: Will Drewry <wad@chromium.org>
---
 arch/x86/Kconfig                   |    1 +
 arch/x86/include/asm/ia32_unistd.h |    1 +
 arch/x86/include/asm/seccomp_64.h  |    2 ++
 3 files changed, 4 insertions(+), 0 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index cc6c53a..1843d17 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -64,6 +64,7 @@ config X86
 	select HAVE_TEXT_POKE_SMP
 	select HAVE_GENERIC_HARDIRQS
 	select HAVE_SPARSE_IRQ
+	select HAVE_SECCOMP_FILTER
 	select GENERIC_FIND_FIRST_BIT
 	select GENERIC_FIND_NEXT_BIT
 	select GENERIC_IRQ_PROBE
diff --git a/arch/x86/include/asm/ia32_unistd.h b/arch/x86/include/asm/ia32_unistd.h
index 976f6ec..8ed2922 100644
--- a/arch/x86/include/asm/ia32_unistd.h
+++ b/arch/x86/include/asm/ia32_unistd.h
@@ -12,6 +12,7 @@
 #define __NR_ia32_exit		  1
 #define __NR_ia32_read		  3
 #define __NR_ia32_write		  4
+#define __NR_ia32_execve	 11
 #define __NR_ia32_sigreturn	119
 #define __NR_ia32_rt_sigreturn	173
 
diff --git a/arch/x86/include/asm/seccomp_64.h b/arch/x86/include/asm/seccomp_64.h
index 84ec1bd..85c4219 100644
--- a/arch/x86/include/asm/seccomp_64.h
+++ b/arch/x86/include/asm/seccomp_64.h
@@ -8,10 +8,12 @@
 #define __NR_seccomp_write __NR_write
 #define __NR_seccomp_exit __NR_exit
 #define __NR_seccomp_sigreturn __NR_rt_sigreturn
+#define __NR_seccomp_execve __NR_execve
 
 #define __NR_seccomp_read_32 __NR_ia32_read
 #define __NR_seccomp_write_32 __NR_ia32_write
 #define __NR_seccomp_exit_32 __NR_ia32_exit
 #define __NR_seccomp_sigreturn_32 __NR_ia32_sigreturn
+#define __NR_seccomp_execve_32 __NR_ia32_execve
 
 #endif /* _ASM_X86_SECCOMP_64_H */
-- 
1.7.0.4


^ permalink raw reply related	[flat|nested] 406+ messages in thread

* [PATCH v3 07/13] arm: select HAVE_SECCOMP_FILTER
  2011-05-26 18:49                                                                                                       ` Will Drewry
@ 2011-06-01  3:10                                                                                                           ` Will Drewry
  2011-06-01  3:10                                                                                                         ` [PATCH v3 02/13] tracing: split out syscall_trace_enter construction Will Drewry
                                                                                                                             ` (11 subsequent siblings)
  12 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-06-01  3:10 UTC (permalink / raw)
  To: linux-kernel
  Cc: kees.cook, torvalds, tglx, mingo, rostedt, jmorris, Will Drewry,
	Russell King, linux-arm-kernel

Enable support for CONFIG_SECCOMP_FILTER by selecting HAVE_SECCOMP_FILTER by
default.

Signed-off-by: Will Drewry <wad@chromium.org>
---
 arch/arm/Kconfig |    1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 377a7a5..4725fbc 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -16,6 +16,7 @@ config ARM
 	select HAVE_FTRACE_MCOUNT_RECORD if (!XIP_KERNEL)
 	select HAVE_DYNAMIC_FTRACE if (!XIP_KERNEL)
 	select HAVE_FUNCTION_GRAPH_TRACER if (!THUMB2_KERNEL)
+	select HAVE_SECCOMP_FILTER
 	select HAVE_GENERIC_DMA_COHERENT
 	select HAVE_KERNEL_GZIP
 	select HAVE_KERNEL_LZO
-- 
1.7.0.4


^ permalink raw reply related	[flat|nested] 406+ messages in thread

* [PATCH v3 07/13] arm: select HAVE_SECCOMP_FILTER
@ 2011-06-01  3:10                                                                                                           ` Will Drewry
  0 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-06-01  3:10 UTC (permalink / raw)
  To: linux-arm-kernel

Enable support for CONFIG_SECCOMP_FILTER by selecting HAVE_SECCOMP_FILTER by
default.

Signed-off-by: Will Drewry <wad@chromium.org>
---
 arch/arm/Kconfig |    1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 377a7a5..4725fbc 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -16,6 +16,7 @@ config ARM
 	select HAVE_FTRACE_MCOUNT_RECORD if (!XIP_KERNEL)
 	select HAVE_DYNAMIC_FTRACE if (!XIP_KERNEL)
 	select HAVE_FUNCTION_GRAPH_TRACER if (!THUMB2_KERNEL)
+	select HAVE_SECCOMP_FILTER
 	select HAVE_GENERIC_DMA_COHERENT
 	select HAVE_KERNEL_GZIP
 	select HAVE_KERNEL_LZO
-- 
1.7.0.4

^ permalink raw reply related	[flat|nested] 406+ messages in thread

* [PATCH v3 08/13] microblaze: select HAVE_SECCOMP_FILTER and provide seccomp_execve
  2011-05-26 18:49                                                                                                       ` Will Drewry
                                                                                                                           ` (6 preceding siblings ...)
  2011-06-01  3:10                                                                                                           ` Will Drewry
@ 2011-06-01  3:10                                                                                                         ` Will Drewry
  2011-06-01  5:37                                                                                                           ` Michal Simek
  2011-06-01  3:10                                                                                                         ` [PATCH v3 09/13] mips: " Will Drewry
                                                                                                                           ` (4 subsequent siblings)
  12 siblings, 1 reply; 406+ messages in thread
From: Will Drewry @ 2011-06-01  3:10 UTC (permalink / raw)
  To: linux-kernel
  Cc: kees.cook, torvalds, tglx, mingo, rostedt, jmorris, Will Drewry,
	Michal Simek, microblaze-uclinux

Facilitate the use of CONFIG_SECCOMP_FILTER by wrapping compatibility
system call numbering for execve and selecting HAVE_SECCOMP_FILTER.

Signed-off-by: Will Drewry <wad@chromium.org>
---
 arch/microblaze/Kconfig               |    1 +
 arch/microblaze/include/asm/seccomp.h |    2 ++
 2 files changed, 3 insertions(+), 0 deletions(-)

diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig
index eccdefe..30ef677 100644
--- a/arch/microblaze/Kconfig
+++ b/arch/microblaze/Kconfig
@@ -1,6 +1,7 @@
 config MICROBLAZE
 	def_bool y
 	select HAVE_MEMBLOCK
+	select HAVE_SECCOMP_FILTER
 	select HAVE_FUNCTION_TRACER
 	select HAVE_FUNCTION_TRACE_MCOUNT_TEST
 	select HAVE_FUNCTION_GRAPH_TRACER
diff --git a/arch/microblaze/include/asm/seccomp.h b/arch/microblaze/include/asm/seccomp.h
index 0d91275..0e38eed 100644
--- a/arch/microblaze/include/asm/seccomp.h
+++ b/arch/microblaze/include/asm/seccomp.h
@@ -7,10 +7,12 @@
 #define __NR_seccomp_write		__NR_write
 #define __NR_seccomp_exit		__NR_exit
 #define __NR_seccomp_sigreturn		__NR_sigreturn
+#define __NR_seccomp_execve		__NR_execve
 
 #define __NR_seccomp_read_32		__NR_read
 #define __NR_seccomp_write_32		__NR_write
 #define __NR_seccomp_exit_32		__NR_exit
 #define __NR_seccomp_sigreturn_32	__NR_sigreturn
+#define __NR_seccomp_execve_32		__NR_execve
 
 #endif	/* _ASM_MICROBLAZE_SECCOMP_H */
-- 
1.7.0.4


^ permalink raw reply related	[flat|nested] 406+ messages in thread

* [PATCH v3 09/13] mips: select HAVE_SECCOMP_FILTER and provide seccomp_execve
  2011-05-26 18:49                                                                                                       ` Will Drewry
                                                                                                                           ` (7 preceding siblings ...)
  2011-06-01  3:10                                                                                                         ` [PATCH v3 08/13] microblaze: select HAVE_SECCOMP_FILTER and provide seccomp_execve Will Drewry
@ 2011-06-01  3:10                                                                                                         ` Will Drewry
  2011-06-01  3:10                                                                                                         ` [PATCH v3 10/13] s390: " Will Drewry
                                                                                                                           ` (3 subsequent siblings)
  12 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-06-01  3:10 UTC (permalink / raw)
  To: linux-kernel
  Cc: kees.cook, torvalds, tglx, mingo, rostedt, jmorris, Will Drewry,
	Ralf Baechle, linux-mips

Facilitate the use of CONFIG_SECCOMP_FILTER by wrapping compatibility
system call numbering for execve and selecting HAVE_SECCOMP_FILTER.

Signed-off-by: Will Drewry <wad@chromium.org>
---
 arch/mips/Kconfig               |    1 +
 arch/mips/include/asm/seccomp.h |    3 +++
 2 files changed, 4 insertions(+), 0 deletions(-)

diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 8e256cc..d376f68 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -10,6 +10,7 @@ config MIPS
 	select HAVE_ARCH_KGDB
 	select HAVE_FUNCTION_TRACER
 	select HAVE_FUNCTION_TRACE_MCOUNT_TEST
+	select HAVE_SECCOMP_FILTER
 	select HAVE_DYNAMIC_FTRACE
 	select HAVE_FTRACE_MCOUNT_RECORD
 	select HAVE_C_RECORDMCOUNT
diff --git a/arch/mips/include/asm/seccomp.h b/arch/mips/include/asm/seccomp.h
index ae6306e..4014a3a 100644
--- a/arch/mips/include/asm/seccomp.h
+++ b/arch/mips/include/asm/seccomp.h
@@ -6,6 +6,7 @@
 #define __NR_seccomp_write __NR_write
 #define __NR_seccomp_exit __NR_exit
 #define __NR_seccomp_sigreturn __NR_rt_sigreturn
+#define __NR_seccomp_execve __NR_execve
 
 /*
  * Kludge alert:
@@ -19,6 +20,7 @@
 #define __NR_seccomp_write_32		4004
 #define __NR_seccomp_exit_32		4001
 #define __NR_seccomp_sigreturn_32	4193	/* rt_sigreturn */
+#define __NR_seccomp_execve_32		4011
 
 #elif defined(CONFIG_MIPS32_N32)
 
@@ -26,6 +28,7 @@
 #define __NR_seccomp_write_32		6001
 #define __NR_seccomp_exit_32		6058
 #define __NR_seccomp_sigreturn_32	6211	/* rt_sigreturn */
+#define __NR_seccomp_execve_32		6057
 
 #endif /* CONFIG_MIPS32_O32 */
 
-- 
1.7.0.4


^ permalink raw reply related	[flat|nested] 406+ messages in thread

* [PATCH v3 10/13] s390: select HAVE_SECCOMP_FILTER and provide seccomp_execve
  2011-05-26 18:49                                                                                                       ` Will Drewry
                                                                                                                           ` (8 preceding siblings ...)
  2011-06-01  3:10                                                                                                         ` [PATCH v3 09/13] mips: " Will Drewry
@ 2011-06-01  3:10                                                                                                         ` Will Drewry
  2011-06-01  3:10                                                                                                           ` Will Drewry
                                                                                                                           ` (2 subsequent siblings)
  12 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-06-01  3:10 UTC (permalink / raw)
  To: linux-kernel
  Cc: kees.cook, torvalds, tglx, mingo, rostedt, jmorris, Will Drewry,
	Martin Schwidefsky, Heiko Carstens, linux390, linux-s390

Facilitate the use of CONFIG_SECCOMP_FILTER by wrapping compatibility
system call numbering for execve and selecting HAVE_SECCOMP_FILTER.

Signed-off-by: Will Drewry <wad@chromium.org>
---
 arch/s390/Kconfig               |    1 +
 arch/s390/include/asm/seccomp.h |    3 ++-
 2 files changed, 3 insertions(+), 1 deletions(-)

diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 2508a6f..9382198 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -64,6 +64,7 @@ config ARCH_SUPPORTS_DEBUG_PAGEALLOC
 config S390
 	def_bool y
 	select USE_GENERIC_SMP_HELPERS if SMP
+	select HAVE_SECCOMP_FILTER
 	select HAVE_SYSCALL_WRAPPERS
 	select HAVE_FUNCTION_TRACER
 	select HAVE_FUNCTION_TRACE_MCOUNT_TEST
diff --git a/arch/s390/include/asm/seccomp.h b/arch/s390/include/asm/seccomp.h
index 781a9cf..e5792f5 100644
--- a/arch/s390/include/asm/seccomp.h
+++ b/arch/s390/include/asm/seccomp.h
@@ -7,10 +7,11 @@
 #define __NR_seccomp_write	__NR_write
 #define __NR_seccomp_exit	__NR_exit
 #define __NR_seccomp_sigreturn	__NR_sigreturn
+#define __NR_seccomp_execve	__NR_execve
 
 #define __NR_seccomp_read_32	__NR_read
 #define __NR_seccomp_write_32	__NR_write
 #define __NR_seccomp_exit_32	__NR_exit
-#define __NR_seccomp_sigreturn_32 __NR_sigreturn
+#define __NR_seccomp_execve_32	__NR_execve
 
 #endif	/* _ASM_S390_SECCOMP_H */
-- 
1.7.0.4


^ permalink raw reply related	[flat|nested] 406+ messages in thread

* [PATCH v3 11/13] powerpc: select HAVE_SECCOMP_FILTER and provide seccomp_execve
  2011-05-26 18:49                                                                                                       ` Will Drewry
@ 2011-06-01  3:10                                                                                                           ` Will Drewry
  2011-06-01  3:10                                                                                                         ` [PATCH v3 02/13] tracing: split out syscall_trace_enter construction Will Drewry
                                                                                                                             ` (11 subsequent siblings)
  12 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-06-01  3:10 UTC (permalink / raw)
  To: linux-kernel
  Cc: kees.cook, torvalds, tglx, mingo, rostedt, jmorris, Will Drewry,
	Benjamin Herrenschmidt, Paul Mackerras, linuxppc-dev

Facilitate the use of CONFIG_SECCOMP_FILTER by wrapping compatibility
system call numbering for execve and selecting HAVE_SECCOMP_FILTER.

Signed-off-by: Will Drewry <wad@chromium.org>
---
 arch/powerpc/Kconfig               |    1 +
 arch/powerpc/include/asm/seccomp.h |    2 ++
 2 files changed, 3 insertions(+), 0 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 8f4d50b..0bd4574 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -137,6 +137,7 @@ config PPC
 	select HAVE_HW_BREAKPOINT if PERF_EVENTS && PPC_BOOK3S_64
 	select HAVE_GENERIC_HARDIRQS
 	select HAVE_SPARSE_IRQ
+	select HAVE_SECCOMP_FILTER
 	select IRQ_PER_CPU
 	select GENERIC_IRQ_SHOW
 	select GENERIC_IRQ_SHOW_LEVEL
diff --git a/arch/powerpc/include/asm/seccomp.h b/arch/powerpc/include/asm/seccomp.h
index 00c1d91..3cb9cc1 100644
--- a/arch/powerpc/include/asm/seccomp.h
+++ b/arch/powerpc/include/asm/seccomp.h
@@ -7,10 +7,12 @@
 #define __NR_seccomp_write __NR_write
 #define __NR_seccomp_exit __NR_exit
 #define __NR_seccomp_sigreturn __NR_rt_sigreturn
+#define __NR_seccomp_execve __NR_execve
 
 #define __NR_seccomp_read_32 __NR_read
 #define __NR_seccomp_write_32 __NR_write
 #define __NR_seccomp_exit_32 __NR_exit
 #define __NR_seccomp_sigreturn_32 __NR_sigreturn
+#define __NR_seccomp_execve_32 __NR_execve
 
 #endif	/* _ASM_POWERPC_SECCOMP_H */
-- 
1.7.0.4


^ permalink raw reply related	[flat|nested] 406+ messages in thread

* [PATCH v3 11/13] powerpc: select HAVE_SECCOMP_FILTER and provide seccomp_execve
@ 2011-06-01  3:10                                                                                                           ` Will Drewry
  0 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-06-01  3:10 UTC (permalink / raw)
  To: linux-kernel
  Cc: Will Drewry, linuxppc-dev, jmorris, rostedt, tglx,
	Paul Mackerras, kees.cook, torvalds, mingo

Facilitate the use of CONFIG_SECCOMP_FILTER by wrapping compatibility
system call numbering for execve and selecting HAVE_SECCOMP_FILTER.

Signed-off-by: Will Drewry <wad@chromium.org>
---
 arch/powerpc/Kconfig               |    1 +
 arch/powerpc/include/asm/seccomp.h |    2 ++
 2 files changed, 3 insertions(+), 0 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 8f4d50b..0bd4574 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -137,6 +137,7 @@ config PPC
 	select HAVE_HW_BREAKPOINT if PERF_EVENTS && PPC_BOOK3S_64
 	select HAVE_GENERIC_HARDIRQS
 	select HAVE_SPARSE_IRQ
+	select HAVE_SECCOMP_FILTER
 	select IRQ_PER_CPU
 	select GENERIC_IRQ_SHOW
 	select GENERIC_IRQ_SHOW_LEVEL
diff --git a/arch/powerpc/include/asm/seccomp.h b/arch/powerpc/include/asm/seccomp.h
index 00c1d91..3cb9cc1 100644
--- a/arch/powerpc/include/asm/seccomp.h
+++ b/arch/powerpc/include/asm/seccomp.h
@@ -7,10 +7,12 @@
 #define __NR_seccomp_write __NR_write
 #define __NR_seccomp_exit __NR_exit
 #define __NR_seccomp_sigreturn __NR_rt_sigreturn
+#define __NR_seccomp_execve __NR_execve
 
 #define __NR_seccomp_read_32 __NR_read
 #define __NR_seccomp_write_32 __NR_write
 #define __NR_seccomp_exit_32 __NR_exit
 #define __NR_seccomp_sigreturn_32 __NR_sigreturn
+#define __NR_seccomp_execve_32 __NR_execve
 
 #endif	/* _ASM_POWERPC_SECCOMP_H */
-- 
1.7.0.4

^ permalink raw reply related	[flat|nested] 406+ messages in thread

* [PATCH v3 12/13] sparc: select HAVE_SECCOMP_FILTER and provide seccomp_execve
  2011-05-26 18:49                                                                                                       ` Will Drewry
@ 2011-06-01  3:10                                                                                                           ` Will Drewry
  2011-06-01  3:10                                                                                                         ` [PATCH v3 02/13] tracing: split out syscall_trace_enter construction Will Drewry
                                                                                                                             ` (11 subsequent siblings)
  12 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-06-01  3:10 UTC (permalink / raw)
  To: linux-kernel
  Cc: kees.cook, torvalds, tglx, mingo, rostedt, jmorris, Will Drewry,
	David S. Miller, sparclinux

Facilitate the use of CONFIG_SECCOMP_FILTER by wrapping compatibility
system call numbering for execve and selecting HAVE_SECCOMP_FILTER.

Signed-off-by: Will Drewry <wad@chromium.org>
---
 arch/sparc/Kconfig               |    2 ++
 arch/sparc/include/asm/seccomp.h |    2 ++
 2 files changed, 4 insertions(+), 0 deletions(-)

diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index e560d10..5249760 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -25,6 +25,7 @@ config SPARC
 	select HAVE_DMA_ATTRS
 	select HAVE_DMA_API_DEBUG
 	select HAVE_ARCH_JUMP_LABEL
+	select HAVE_SECCOMP_FILTER
 
 config SPARC32
 	def_bool !64BIT
@@ -39,6 +40,7 @@ config SPARC64
 	select HAVE_KRETPROBES
 	select HAVE_KPROBES
 	select HAVE_MEMBLOCK
+	select HAVE_SECCOMP_FILTER
 	select HAVE_SYSCALL_WRAPPERS
 	select HAVE_DYNAMIC_FTRACE
 	select HAVE_FTRACE_MCOUNT_RECORD
diff --git a/arch/sparc/include/asm/seccomp.h b/arch/sparc/include/asm/seccomp.h
index adca1bc..a1dac08 100644
--- a/arch/sparc/include/asm/seccomp.h
+++ b/arch/sparc/include/asm/seccomp.h
@@ -6,10 +6,12 @@
 #define __NR_seccomp_write __NR_write
 #define __NR_seccomp_exit __NR_exit
 #define __NR_seccomp_sigreturn __NR_rt_sigreturn
+#define __NR_seccomp_execve __NR_execve
 
 #define __NR_seccomp_read_32 __NR_read
 #define __NR_seccomp_write_32 __NR_write
 #define __NR_seccomp_exit_32 __NR_exit
 #define __NR_seccomp_sigreturn_32 __NR_sigreturn
+#define __NR_seccomp_execve_32 __NR_execve
 
 #endif /* _ASM_SECCOMP_H */
-- 
1.7.0.4


^ permalink raw reply related	[flat|nested] 406+ messages in thread

* [PATCH v3 12/13] sparc: select HAVE_SECCOMP_FILTER and provide seccomp_execve
@ 2011-06-01  3:10                                                                                                           ` Will Drewry
  0 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-06-01  3:10 UTC (permalink / raw)
  To: linux-kernel
  Cc: kees.cook, torvalds, tglx, mingo, rostedt, jmorris, Will Drewry,
	David S. Miller, sparclinux

Facilitate the use of CONFIG_SECCOMP_FILTER by wrapping compatibility
system call numbering for execve and selecting HAVE_SECCOMP_FILTER.

Signed-off-by: Will Drewry <wad@chromium.org>
---
 arch/sparc/Kconfig               |    2 ++
 arch/sparc/include/asm/seccomp.h |    2 ++
 2 files changed, 4 insertions(+), 0 deletions(-)

diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index e560d10..5249760 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -25,6 +25,7 @@ config SPARC
 	select HAVE_DMA_ATTRS
 	select HAVE_DMA_API_DEBUG
 	select HAVE_ARCH_JUMP_LABEL
+	select HAVE_SECCOMP_FILTER
 
 config SPARC32
 	def_bool !64BIT
@@ -39,6 +40,7 @@ config SPARC64
 	select HAVE_KRETPROBES
 	select HAVE_KPROBES
 	select HAVE_MEMBLOCK
+	select HAVE_SECCOMP_FILTER
 	select HAVE_SYSCALL_WRAPPERS
 	select HAVE_DYNAMIC_FTRACE
 	select HAVE_FTRACE_MCOUNT_RECORD
diff --git a/arch/sparc/include/asm/seccomp.h b/arch/sparc/include/asm/seccomp.h
index adca1bc..a1dac08 100644
--- a/arch/sparc/include/asm/seccomp.h
+++ b/arch/sparc/include/asm/seccomp.h
@@ -6,10 +6,12 @@
 #define __NR_seccomp_write __NR_write
 #define __NR_seccomp_exit __NR_exit
 #define __NR_seccomp_sigreturn __NR_rt_sigreturn
+#define __NR_seccomp_execve __NR_execve
 
 #define __NR_seccomp_read_32 __NR_read
 #define __NR_seccomp_write_32 __NR_write
 #define __NR_seccomp_exit_32 __NR_exit
 #define __NR_seccomp_sigreturn_32 __NR_sigreturn
+#define __NR_seccomp_execve_32 __NR_execve
 
 #endif /* _ASM_SECCOMP_H */
-- 
1.7.0.4


^ permalink raw reply related	[flat|nested] 406+ messages in thread

* [PATCH v3 13/13] sh: select HAVE_SECCOMP_FILTER
  2011-05-26 18:49                                                                                                       ` Will Drewry
@ 2011-06-01  3:10                                                                                                           ` Will Drewry
  2011-06-01  3:10                                                                                                         ` [PATCH v3 02/13] tracing: split out syscall_trace_enter construction Will Drewry
                                                                                                                             ` (11 subsequent siblings)
  12 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-06-01  3:10 UTC (permalink / raw)
  To: linux-kernel
  Cc: kees.cook, torvalds, tglx, mingo, rostedt, jmorris, Will Drewry,
	Paul Mundt, linux-sh

Add support for CONFIG_SECCOMP_FILTER by selecting HAVE_SECCOMP_FILTER.
---
 arch/sh/Kconfig |    1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 4b89da2..41ea3a7 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -10,6 +10,7 @@ config SUPERH
 	select HAVE_DMA_API_DEBUG
 	select HAVE_DMA_ATTRS
 	select HAVE_IRQ_WORK
+	select HAVE_SECCOMP_FILTER
 	select HAVE_PERF_EVENTS
 	select PERF_USE_VMALLOC
 	select HAVE_KERNEL_GZIP
-- 
1.7.0.4


^ permalink raw reply related	[flat|nested] 406+ messages in thread

* [PATCH v3 13/13] sh: select HAVE_SECCOMP_FILTER
@ 2011-06-01  3:10                                                                                                           ` Will Drewry
  0 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-06-01  3:10 UTC (permalink / raw)
  To: linux-kernel
  Cc: kees.cook, torvalds, tglx, mingo, rostedt, jmorris, Will Drewry,
	Paul Mundt, linux-sh

Add support for CONFIG_SECCOMP_FILTER by selecting HAVE_SECCOMP_FILTER.
---
 arch/sh/Kconfig |    1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 4b89da2..41ea3a7 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -10,6 +10,7 @@ config SUPERH
 	select HAVE_DMA_API_DEBUG
 	select HAVE_DMA_ATTRS
 	select HAVE_IRQ_WORK
+	select HAVE_SECCOMP_FILTER
 	select HAVE_PERF_EVENTS
 	select PERF_USE_VMALLOC
 	select HAVE_KERNEL_GZIP
-- 
1.7.0.4


^ permalink raw reply related	[flat|nested] 406+ messages in thread

* Re: [PATCH v3 12/13] sparc: select HAVE_SECCOMP_FILTER and provide seccomp_execve
  2011-06-01  3:10                                                                                                           ` Will Drewry
@ 2011-06-01  3:35                                                                                                             ` David Miller
  -1 siblings, 0 replies; 406+ messages in thread
From: David Miller @ 2011-06-01  3:35 UTC (permalink / raw)
  To: wad
  Cc: linux-kernel, kees.cook, torvalds, tglx, mingo, rostedt, jmorris,
	sparclinux

From: Will Drewry <wad@chromium.org>
Date: Tue, 31 May 2011 22:10:44 -0500

> Facilitate the use of CONFIG_SECCOMP_FILTER by wrapping compatibility
> system call numbering for execve and selecting HAVE_SECCOMP_FILTER.
> 
> Signed-off-by: Will Drewry <wad@chromium.org>

Acked-by: David S. Miller <davem@davemloft.net>

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH v3 12/13] sparc: select HAVE_SECCOMP_FILTER and provide
@ 2011-06-01  3:35                                                                                                             ` David Miller
  0 siblings, 0 replies; 406+ messages in thread
From: David Miller @ 2011-06-01  3:35 UTC (permalink / raw)
  To: wad
  Cc: linux-kernel, kees.cook, torvalds, tglx, mingo, rostedt, jmorris,
	sparclinux

From: Will Drewry <wad@chromium.org>
Date: Tue, 31 May 2011 22:10:44 -0500

> Facilitate the use of CONFIG_SECCOMP_FILTER by wrapping compatibility
> system call numbering for execve and selecting HAVE_SECCOMP_FILTER.
> 
> Signed-off-by: Will Drewry <wad@chromium.org>

Acked-by: David S. Miller <davem@davemloft.net>

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH v3 08/13] microblaze: select HAVE_SECCOMP_FILTER and provide seccomp_execve
  2011-06-01  3:10                                                                                                         ` [PATCH v3 08/13] microblaze: select HAVE_SECCOMP_FILTER and provide seccomp_execve Will Drewry
@ 2011-06-01  5:37                                                                                                           ` Michal Simek
  0 siblings, 0 replies; 406+ messages in thread
From: Michal Simek @ 2011-06-01  5:37 UTC (permalink / raw)
  To: Will Drewry
  Cc: linux-kernel, kees.cook, torvalds, tglx, mingo, rostedt, jmorris,
	microblaze-uclinux

Will Drewry wrote:
> Facilitate the use of CONFIG_SECCOMP_FILTER by wrapping compatibility
> system call numbering for execve and selecting HAVE_SECCOMP_FILTER.
> 
> Signed-off-by: Will Drewry <wad@chromium.org>
> ---
>  arch/microblaze/Kconfig               |    1 +
>  arch/microblaze/include/asm/seccomp.h |    2 ++
>  2 files changed, 3 insertions(+), 0 deletions(-)

Acked-by: Michal Simek <monstr@monstr.eu>

-- 
Michal Simek, Ing. (M.Eng)
w: www.monstr.eu p: +42-0-721842854
Maintainer of Linux kernel 2.6 Microblaze Linux - http://www.monstr.eu/fdt/
Microblaze U-BOOT custodian

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH v3 02/13] tracing: split out syscall_trace_enter construction
  2011-06-01  3:10                                                                                                         ` [PATCH v3 02/13] tracing: split out syscall_trace_enter construction Will Drewry
@ 2011-06-01  7:00                                                                                                           ` Ingo Molnar
  2011-06-01 17:15                                                                                                             ` Will Drewry
  0 siblings, 1 reply; 406+ messages in thread
From: Ingo Molnar @ 2011-06-01  7:00 UTC (permalink / raw)
  To: Will Drewry
  Cc: linux-kernel, kees.cook, torvalds, tglx, rostedt, jmorris,
	Frederic Weisbecker, Ingo Molnar


* Will Drewry <wad@chromium.org> wrote:

> perf appears to be the primary consumer of the CONFIG_FTRACE_SYSCALLS
> infrastructure.  As such, many of the helpers that target perf can be split
> into a perf-focused helper and a generic CONFIG_FTRACE_SYSCALLS
> consumer interface.
> 
> This change splits out syscall_trace_enter construction from
> perf_syscall_enter for current into two helpers:
> - ftrace_syscall_enter_state
> - ftrace_syscall_enter_state_size
> 
> And adds another helper for completeness:
> - ftrace_syscall_exit_state_size
> 
> These helpers allow for shared code between perf ftrace events and
> any other consumers of CONFIG_FTRACE_SYSCALLS events.  The proposed
> seccomp_filter patches use this code.
> 
> Signed-off-by: Will Drewry <wad@chromium.org>
> ---
>  include/trace/syscall.h       |    4 ++
>  kernel/trace/trace_syscalls.c |   96 +++++++++++++++++++++++++++++++++++------
>  2 files changed, 86 insertions(+), 14 deletions(-)

So, looking at the diffstat comparison again:

       bitmask (2009):  6 files changed,  194 insertions(+), 22 deletions(-)
 filter engine (2010): 18 files changed, 1100 insertions(+), 21 deletions(-)
 event filters (2011):  5 files changed,   82 insertions(+), 16 deletions(-)

you went back to the middle solution again which is the worst of them 
- why?

If you want this to be a stupid, limited hack then go for the v1 
bitmask.

If you agree with my observation that filters allow the clean 
user-space implementation of LSM equivalent security solutions (of 
which sandboxes are just a *narrow special case*) then please use the 
main highlevel abstraction we have defined around them: event 
filters.

Now, my observation was not uncontested so let me try to sum up the
rather large discussion that erupted around it, as i see it.

I saw four main counter arguments:

 - "Sandboxing is special and should stay separate from LSMs."

   I think this is a technically bogus argument, see:

         https://lkml.org/lkml/2011/5/26/85

   That answer of mine went unchallenged.

 - "Events should only be observers."

   Even ignoring the question of why on earth it should be a problem 
   for a willing call-site to use event filtering results sensibly, 
   this argument misses the plain fact that events are *already* 
   active participants, see:

         http://www.spinics.net/lists/mips/msg41075.html

   That answer of mine went unchallenged too.

 - "This feature is too simplistic."

   That's wrong i think, the feature is highly flexible:

         http://www.mail-archive.com/linuxppc-dev@lists.ozlabs.org/msg51387.html

   This reply of mine went unchallenged as well.

 - "Is this feature actually useful enough for applications, does it 
    justify the complexity?"

  This is the *only* valid technical counter-argument i saw, and it's 
  a crucial one that is not fully answered yet. Since i think the feature
  is an LSM equivalent i think it's at least as useful as any LSM is.

 - [ if i missed any important argument then someone please insert it 
     here. ]

But what you do here is to use the filter engine directly which is 
both a limited hack *and* complex (beyond the linecount it doubles 
our ABI exposure, amongst other things), so i find that approach 
rather counter-productive, now that i've seen the real thing.

Will this feature be just another example of the LSM status quo 
dragging down a newcomer into the mud, until it's just as sucky and 
limited as any existing LSMs? That would be a sad outcome!

Thanks,

	Ingo

ps. Please start a new discussion thread for the next iteration!
    This one is *way* too deep already.

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH v3 02/13] tracing: split out syscall_trace_enter construction
  2011-06-01  7:00                                                                                                           ` Ingo Molnar
@ 2011-06-01 17:15                                                                                                             ` Will Drewry
  2011-06-02 14:29                                                                                                               ` Ingo Molnar
  0 siblings, 1 reply; 406+ messages in thread
From: Will Drewry @ 2011-06-01 17:15 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: linux-kernel, kees.cook, torvalds, tglx, rostedt, jmorris,
	Frederic Weisbecker, Ingo Molnar

On Wed, Jun 1, 2011 at 2:00 AM, Ingo Molnar <mingo@elte.hu> wrote:
>
> * Will Drewry <wad@chromium.org> wrote:
>
>> perf appears to be the primary consumer of the CONFIG_FTRACE_SYSCALLS
>> infrastructure.  As such, many of the helpers that target perf can be split
>> into a perf-focused helper and a generic CONFIG_FTRACE_SYSCALLS
>> consumer interface.
>>
>> This change splits out syscall_trace_enter construction from
>> perf_syscall_enter for current into two helpers:
>> - ftrace_syscall_enter_state
>> - ftrace_syscall_enter_state_size
>>
>> And adds another helper for completeness:
>> - ftrace_syscall_exit_state_size
>>
>> These helpers allow for shared code between perf ftrace events and
>> any other consumers of CONFIG_FTRACE_SYSCALLS events.  The proposed
>> seccomp_filter patches use this code.
>>
>> Signed-off-by: Will Drewry <wad@chromium.org>
>> ---
>>  include/trace/syscall.h       |    4 ++
>>  kernel/trace/trace_syscalls.c |   96 +++++++++++++++++++++++++++++++++++------
>>  2 files changed, 86 insertions(+), 14 deletions(-)
>
> So, looking at the diffstat comparison again:
>
>       bitmask (2009):  6 files changed,  194 insertions(+), 22 deletions(-)
>  filter engine (2010): 18 files changed, 1100 insertions(+), 21 deletions(-)
>  event filters (2011):  5 files changed,   82 insertions(+), 16 deletions(-)
>
> you went back to the middle solution again which is the worst of them
> - why?

In short, design for the future and implement now.  I'll elaborate a
bit more below.

> If you want this to be a stupid, limited hack then go for the v1
> bitmask.

I only aim for the finest!

(bitmasks were bad for the other consumers of this patch series:
socketcall multiplexing issues and ioctl # filtering).

> If you agree with my observation that filters allow the clean
> user-space implementation of LSM equivalent security solutions (of
> which sandboxes are just a *narrow special case*) then please use the
> main highlevel abstraction we have defined around them: event
> filters.

I agree that LSM-equivalent security solutions can be moved over to an
ftrace based infrastructure.  However, LSMs and seccomp have different
semantics.  Reducing the kernel attack surface in a
"sandboxing"-sort-of-way requires a default-deny interface that is
resilient to kernel changes (like new system calls) without
immediately degrading robustness.  LSMs provide a fail-open mechanism
for taking an active role in kernel-defined pinch points.  It is
possible to implement a default-deny LSM, but it requires a "hook" for
every security event and the addition of a security event results in a
hole in the not-so-default-deny infrastructure.  ftrace + event
filters are the same.

Based on my observations while exploring the code, it appears that the
LSM security_* calls could easily become active trace events and the
LSM infrastructure moved over to use those as tracepoints or via
event_filters.  There will be a need for new predicates for the
various new types (inode *, etc), and so on.  However, the
trace_sys_enter/__secure_computing model will still be a special case.
 Even if they fed into security event subsystem or something like
that, the absence of filters on a traced process would need to
default-deny as well as when there are no active matches.  So while a
brand-new shared ABI may be possible (security_event_open,
active_event_open, ?), there will still be trickiness in making the
behaviors not have implicit side effects and ensure that newly added
system calls, for instance, that lack the macro wrapper don't poke a
hole in the "sandbox" model.  There are a lot of options for designing
it though.  Like making TIF_SECCOMP mean that any security_* filter
failure or match count of 0 == process death.  It's just that
designing this new approach will be incredibly hairy, and we really
lack many of the concrete requirements that would be needed, in my
opinion.

> Now, my observation was not uncontested so let me try to sum up the
> rather large discussion that erupted around it, as i see it.
>
> I saw four main counter arguments:
>
>  - "Sandboxing is special and should stay separate from LSMs."
>
>   I think this is a technically bogus argument, see:
>
>         https://lkml.org/lkml/2011/5/26/85
>
>   That answer of mine went unchallenged.

I may have spoken to this above.  I dunno.

>  - "Events should only be observers."
>
>   Even ignoring the question of why on earth it should be a problem
>   for a willing call-site to use event filtering results sensibly,
>   this argument misses the plain fact that events are *already*
>   active participants, see:
>
>         http://www.spinics.net/lists/mips/msg41075.html
>
>   That answer of mine went unchallenged too.
>
>  - "This feature is too simplistic."
>
>   That's wrong i think, the feature is highly flexible:
>
>         http://www.mail-archive.com/linuxppc-dev@lists.ozlabs.org/msg51387.html
>
>   This reply of mine went unchallenged as well.

Well I did only implement a PoC.  It couldn't handle attack surface
reduction after-the-fact, nor did I add a GET_FILTER call, etc.  The
code was minimal in many ways because the functionality was too.

>  - "Is this feature actually useful enough for applications, does it
>    justify the complexity?"
>
>  This is the *only* valid technical counter-argument i saw, and it's
>  a crucial one that is not fully answered yet. Since i think the feature
>  is an LSM equivalent i think it's at least as useful as any LSM is.
>
>  - [ if i missed any important argument then someone please insert it
>     here. ]
>
> But what you do here is to use the filter engine directly which is
> both a limited hack *and* complex (beyond the linecount it doubles
> our ABI exposure, amongst other things), so i find that approach
> rather counter-productive, now that i've seen the real thing.
>
> Will this feature be just another example of the LSM status quo
> dragging down a newcomer into the mud, until it's just as sucky and
> limited as any existing LSMs? That would be a sad outcome!

I hope not.  I believe it will be easy to move the backend of
seccomp_filter over to a per-task ftrace event filter infrastructure
when that comes in the future.  But for now, I'm trying to meet the
needs of possible consumers now: chromium, qemu, lxc, and lay
groundwork for a ftrace-future.

If this is a total fail, then perhaps we should have a separate
discussion over how we can tackle a lot of these needs.  I was hoping
that we could push some of that off to the LinuxSecuritySummit -- I've
proposed/requested a QA panel on this topic :)  But I'd love to not
wait until then for everything.

> ps. Please start a new discussion thread for the next iteration!
>    This one is *way* too deep already.

Sorry - will do!

thanks!
will

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH v3 05/13] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-06-01  3:10                                                                                                         ` [PATCH v3 05/13] seccomp_filter: Document what seccomp_filter is and how it works Will Drewry
@ 2011-06-01 21:23                                                                                                           ` Kees Cook
  2011-06-01 23:03                                                                                                             ` Will Drewry
  0 siblings, 1 reply; 406+ messages in thread
From: Kees Cook @ 2011-06-01 21:23 UTC (permalink / raw)
  To: Will Drewry
  Cc: linux-kernel, torvalds, tglx, mingo, rostedt, jmorris,
	Randy Dunlap, linux-doc

Hi Will,

Minor typo corrections below...

On Tue, May 31, 2011 at 10:10:37PM -0500, Will Drewry wrote:
>  Adds a text file covering what CONFIG_SECCOMP_FILTER is, how it is
>  implemented presently, and what it may be used for.  In addition,
>  the limitations and caveats of the proposed implementation are
>  included.
> 
> --- /dev/null
> +++ b/Documentation/prctl/seccomp_filter.txt
> @@ -0,0 +1,145 @@
> ...
> +certain subset of userland applications benefit by having a reduce set
                                                               reduced

> +of available system calls.  The reduced set reduces the total kernel

Maybe "The resulting set reduces ... " ?


-Kees

-- 
Kees Cook
Ubuntu Security Team

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH v3 05/13] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-06-01 21:23                                                                                                           ` Kees Cook
@ 2011-06-01 23:03                                                                                                             ` Will Drewry
  0 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-06-01 23:03 UTC (permalink / raw)
  To: Kees Cook
  Cc: linux-kernel, torvalds, tglx, mingo, rostedt, jmorris,
	Randy Dunlap, linux-doc

On Wed, Jun 1, 2011 at 4:23 PM, Kees Cook <kees.cook@canonical.com> wrote:
> Hi Will,
>
> Minor typo corrections below...
>
> On Tue, May 31, 2011 at 10:10:37PM -0500, Will Drewry wrote:
>>  Adds a text file covering what CONFIG_SECCOMP_FILTER is, how it is
>>  implemented presently, and what it may be used for.  In addition,
>>  the limitations and caveats of the proposed implementation are
>>  included.
>>
>> --- /dev/null
>> +++ b/Documentation/prctl/seccomp_filter.txt
>> @@ -0,0 +1,145 @@
>> ...
>> +certain subset of userland applications benefit by having a reduce set
>                                                               reduced
>
>> +of available system calls.  The reduced set reduces the total kernel
>
> Maybe "The resulting set reduces ... " ?

Cool - I'll clean it up in the next cut.

Thanks!
will

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH v3 13/13] sh: select HAVE_SECCOMP_FILTER
  2011-06-01  3:10                                                                                                           ` Will Drewry
@ 2011-06-02  5:27                                                                                                             ` Paul Mundt
  -1 siblings, 0 replies; 406+ messages in thread
From: Paul Mundt @ 2011-06-02  5:27 UTC (permalink / raw)
  To: Will Drewry
  Cc: linux-kernel, kees.cook, torvalds, tglx, mingo, rostedt, jmorris,
	linux-sh

On Tue, May 31, 2011 at 10:10:45PM -0500, Will Drewry wrote:
> Add support for CONFIG_SECCOMP_FILTER by selecting HAVE_SECCOMP_FILTER.
> ---
>  arch/sh/Kconfig |    1 +
>  1 files changed, 1 insertions(+), 0 deletions(-)
> 
Acked-by: Paul Mundt <lethal@linux-sh.org>

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH v3 13/13] sh: select HAVE_SECCOMP_FILTER
@ 2011-06-02  5:27                                                                                                             ` Paul Mundt
  0 siblings, 0 replies; 406+ messages in thread
From: Paul Mundt @ 2011-06-02  5:27 UTC (permalink / raw)
  To: Will Drewry
  Cc: linux-kernel, kees.cook, torvalds, tglx, mingo, rostedt, jmorris,
	linux-sh

On Tue, May 31, 2011 at 10:10:45PM -0500, Will Drewry wrote:
> Add support for CONFIG_SECCOMP_FILTER by selecting HAVE_SECCOMP_FILTER.
> ---
>  arch/sh/Kconfig |    1 +
>  1 files changed, 1 insertions(+), 0 deletions(-)
> 
Acked-by: Paul Mundt <lethal@linux-sh.org>

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH v3 02/13] tracing: split out syscall_trace_enter construction
  2011-06-01 17:15                                                                                                             ` Will Drewry
@ 2011-06-02 14:29                                                                                                               ` Ingo Molnar
  2011-06-02 15:18                                                                                                                 ` Will Drewry
  0 siblings, 1 reply; 406+ messages in thread
From: Ingo Molnar @ 2011-06-02 14:29 UTC (permalink / raw)
  To: Will Drewry
  Cc: linux-kernel, kees.cook, torvalds, tglx, rostedt, jmorris,
	Frederic Weisbecker, Ingo Molnar


* Will Drewry <wad@chromium.org> wrote:

> > If you agree with my observation that filters allow the clean 
> > user-space implementation of LSM equivalent security solutions 
> > (of which sandboxes are just a *narrow special case*) then please 
> > use the main highlevel abstraction we have defined around them: 
> > event filters.
> 
> I agree that LSM-equivalent security solutions can be moved over to 
> an ftrace based infrastructure.  However, LSMs and seccomp have 
> different semantics.  Reducing the kernel attack surface in a 
> "sandboxing"-sort-of-way requires a default-deny interface that is 
> resilient to kernel changes (like new system calls) without 
> immediately degrading robustness. [...]

Correct. Because seccomp is the user of those syscall-surface events 
it can use them in such a way - i see no problem there: unknown or 
not permitted syscalls get denied for seccomp-mode-2 tasks.

> [...] LSMs provide a fail-open mechanism for taking an active role 
> in kernel-defined pinch points.  It is possible to implement a 
> default-deny LSM, but it requires a "hook" for every security event 
> and the addition of a security event results in a hole in the 
> not-so-default-deny infrastructure.  ftrace + event filters are the 
> same.

Well, i only suggested that it's LSM-equivalent security 
functionality, i did not suggest that you should implement an LSM in 
security/. I do not think the LSM modularization is particularly well 
fit for seccomp.

> Based on my observations while exploring the code, it appears that 
> the LSM security_* calls could easily become active trace events 
> and the LSM infrastructure moved over to use those as tracepoints 
> or via event_filters.  There will be a need for new predicates for 
> the various new types (inode *, etc), and so on.  However, the 
> trace_sys_enter/__secure_computing model will still be a special 
> case.

Yes, and that special event will not go away!

I did not suggest to *replace* those events with the security events. 
I suggested to *combine* them - or at least have a model that 
smoothly extends to those events as well and does not limit itself to 
the syscall surface alone.

We'll want to have both.

But by hardcoding to only those events, and creating a 
syscall-numbering special ABI, a wall will be raised between this 
implementation and any future enhancement to cover other events. My 
suggestion would be to use the event filter approach - that way 
there's not a wall but an open door towards future extensions ;-)

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH v3 02/13] tracing: split out syscall_trace_enter construction
  2011-06-02 14:29                                                                                                               ` Ingo Molnar
@ 2011-06-02 15:18                                                                                                                 ` Will Drewry
  0 siblings, 0 replies; 406+ messages in thread
From: Will Drewry @ 2011-06-02 15:18 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: linux-kernel, kees.cook, torvalds, tglx, rostedt, jmorris,
	Frederic Weisbecker, Ingo Molnar

On Thu, Jun 2, 2011 at 9:29 AM, Ingo Molnar <mingo@elte.hu> wrote:
>
> * Will Drewry <wad@chromium.org> wrote:
>
[...]
>
>> Based on my observations while exploring the code, it appears that
>> the LSM security_* calls could easily become active trace events
>> and the LSM infrastructure moved over to use those as tracepoints
>> or via event_filters.  There will be a need for new predicates for
>> the various new types (inode *, etc), and so on.  However, the
>> trace_sys_enter/__secure_computing model will still be a special
>> case.
>
> Yes, and that special event will not go away!
>
> I did not suggest to *replace* those events with the security events.
> I suggested to *combine* them - or at least have a model that
> smoothly extends to those events as well and does not limit itself to
> the syscall surface alone.
>
> We'll want to have both.
>
> But by hardcoding to only those events, and creating a
> syscall-numbering special ABI, a wall will be raised between this
> implementation and any future enhancement to cover other events. My
> suggestion would be to use the event filter approach - that way
> there's not a wall but an open door towards future extensions ;-)

Yeah, I can definitely see that.  We could have the prctl interface
take in the event id, but that introduces a dependency on
CONFIG_PERF_EVENTS in addition (to get the id exported) and means
we'll have much more limited coverage of syscalls until the syscall
wrapping matures.

Could this be resolved in the proposed change by supporting both
mechanisms? Or is that just asking for trouble?

E.g., it could be an extra field:
  prctl(PR_SET_SECCOMP_FILTER, PR_SECCOMP_FILTER_TYPE_EVENT, event_id,
        filter_string);
  prctl(PR_SET_SECCOMP_FILTER, PR_SECCOMP_FILTER_TYPE_SYSCALL,
        __NR_somesyscall, filter_string);
  [and the same for CLEAR_FILTER and GET_FILTER]

or even reserve negative values for event ids and positive for
syscalls (which feels more hackish).  Adding event_id support wouldn't
be much more additional code (since it's just a layer of
dereferencing).  Since there will likely be syscall-indexed entry
behavior no matter what (like there is for ftrace/perf_sysenter), it
won't necessarily be a large diversion in the future either.

If not, seccomp_filter could depend on both FTRACE_SYSCALLS and
exported PERF_EVENTS (or make "id"s not perf_event specific), then it
could just use the sys_enter event ids.  Doing so does have some other
properties that I'm not as fond of, like requiring debugfs to be
compiled in, mounted, and readable by the caller in order to construct
a filterset, so I can still see some benefit for the syscall number
use in some cases (much easier to deploy on a server without debugfs
access, etc).  Right now, having both interfaces doesn't really give
us anything, but having the field set aside for future exploration
isn't necessarily a bad thing!

What do you think? Would a change to support both be too crazy/dumb or
just crazy/dumb enough?  Or do you see another path that could avoid
isolating any current work from a more fruitful future?

thanks!
will

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH v3 03/13] seccomp_filters: new mode with configurable syscall filters
  2011-06-01  3:10                                                                                                         ` [PATCH v3 03/13] seccomp_filters: new mode with configurable syscall filters Will Drewry
@ 2011-06-02 17:36                                                                                                           ` Paul E. McKenney
  2011-06-02 18:14                                                                                                             ` Will Drewry
  0 siblings, 1 reply; 406+ messages in thread
From: Paul E. McKenney @ 2011-06-02 17:36 UTC (permalink / raw)
  To: Will Drewry
  Cc: linux-kernel, kees.cook, torvalds, tglx, mingo, rostedt, jmorris,
	Peter Zijlstra, Frederic Weisbecker, linux-security-module

On Tue, May 31, 2011 at 10:10:35PM -0500, Will Drewry wrote:
> This change adds a new seccomp mode which specifies the allowed system
> calls dynamically.  When in the new mode (2), all system calls are
> checked against process-defined filters - first by system call number,
> then by a filter string.  If an entry exists for a given system call and
> all filter predicates evaluate to true, then the task may proceed.
> Otherwise, the task is killed.

A few questions below -- I can't say that I understand the RCU usage.

							Thanx, Paul

> Filter string parsing and evaluation is handled by the ftrace filter
> engine.  Related patches tweak the perf filter trace and free,
> allowing the calls to be shared. Filters inherit their understanding of
> types and arguments for each system call from the CONFIG_FTRACE_SYSCALLS
> subsystem which already populates this information in syscall_metadata
> associated enter_event (and exit_event) structures. If
> CONFIG_FTRACE_SYSCALLS is not compiled in, only filter strings of "1"
> will be allowed.
> 
> The net result is a process may have its system calls filtered using the
> ftrace filter engine's inherent understanding of system calls.  The set
> of filters is specified through the PR_SET_SECCOMP_FILTER argument in
> prctl(). For example, a filterset for a process, like pdftotext, that
> should only process read-only input could (roughly) look like:
>   sprintf(rdonly, "flags == %u", O_RDONLY|O_LARGEFILE);
>   prctl(PR_SET_SECCOMP_FILTER, __NR_open, rdonly);
>   prctl(PR_SET_SECCOMP_FILTER, __NR__llseek, "1");
>   prctl(PR_SET_SECCOMP_FILTER, __NR_brk, "1");
>   prctl(PR_SET_SECCOMP_FILTER, __NR_close, "1");
>   prctl(PR_SET_SECCOMP_FILTER, __NR_exit_group, "1");
>   prctl(PR_SET_SECCOMP_FILTER, __NR_fstat64, "1");
>   prctl(PR_SET_SECCOMP_FILTER, __NR_mmap2, "1");
>   prctl(PR_SET_SECCOMP_FILTER, __NR_munmap, "1");
>   prctl(PR_SET_SECCOMP_FILTER, __NR_read, "1");
>   prctl(PR_SET_SECCOMP_FILTER, __NR_write, "(fd == 1 || fd == 2)");
>   prctl(PR_SET_SECCOMP, 2);
> 
> Subsequent calls to PR_SET_SECCOMP_FILTER for the same system call will
> be &&'d together to ensure that attack surface may only be reduced:
>   prctl(PR_SET_SECCOMP_FILTER, __NR_write, "fd != 2");
> 
> With the earlier example, the active filter becomes:
>   "(fd == 1 || fd == 2) && fd != 2"
> 
> The patch also adds PR_CLEAR_SECCOMP_FILTER and PR_GET_SECCOMP_FILTER.
> The latter returns the current filter for a system call to userspace:
> 
>   prctl(PR_GET_SECCOMP_FILTER, __NR_write, buf, bufsize);
> 
> while the former clears any filters for a given system call, changing it
> back to a default deny:
> 
>   prctl(PR_CLEAR_SECCOMP_FILTER, __NR_write);
> 
> v3: - always block execve calls (as per linus torvalds)
>     - add __NR_seccomp_execve(_32) to seccomp-supporting arches
>     - ensure compat tasks can't reach ftrace:syscalls
>     - dropped new defines for seccomp modes.
>     - two level array instead of hlists (sugg. by olof johansson)
>     - added generic Kconfig entry that is not connected.
>     - dropped internal seccomp.h
>     - move prctl helpers to seccomp_filter
>     - killed seccomp_t typedef (as per checkpatch)
> v2: - changed to use the existing syscall number ABI.
>     - prctl changes to minimize parsing in the kernel:
>       prctl(PR_SET_SECCOMP, {0 | 1 | 2 }, { 0 | ON_EXEC });
>       prctl(PR_SET_SECCOMP_FILTER, __NR_read, "fd == 5");
>       prctl(PR_CLEAR_SECCOMP_FILTER, __NR_read);
>       prctl(PR_GET_SECCOMP_FILTER, __NR_read, buf, bufsize);
>     - defined PR_SECCOMP_MODE_STRICT and ..._FILTER
>     - added flags
>     - provide a default fail syscall_nr_to_meta in ftrace
>     - provides fallback for unhooked system calls
>     - use -ENOSYS and ERR_PTR(-ENOSYS) for stubbed functionality
>     - added kernel/seccomp.h to share seccomp.c/seccomp_filter.c
>     - moved to a hlist and 4 bit hash of linked lists
>     - added support to operate without CONFIG_FTRACE_SYSCALLS
>     - moved Kconfig support next to SECCOMP
>     - made Kconfig entries dependent on EXPERIMENTAL
>     - added macros to avoid ifdefs from kernel/fork.c
>     - added compat task/filter matching
>     - drop seccomp.h inclusion in sched.h and drop seccomp_t
>     - added Filtering to "show" output
>     - added on_exec state dup'ing when enabling after a fast-path accept.
> 
> Signed-off-by: Will Drewry <wad@chromium.org>
> ---
>  include/linux/prctl.h   |    5 +
>  include/linux/sched.h   |    2 +-
>  include/linux/seccomp.h |   98 ++++++-
>  include/trace/syscall.h |    7 +
>  kernel/Makefile         |    3 +
>  kernel/fork.c           |    3 +
>  kernel/seccomp.c        |   38 ++-
>  kernel/seccomp_filter.c |  784 +++++++++++++++++++++++++++++++++++++++++++++++
>  kernel/sys.c            |   13 +-
>  security/Kconfig        |   17 +
>  10 files changed, 954 insertions(+), 16 deletions(-)
>  create mode 100644 kernel/seccomp_filter.c
> 
> diff --git a/include/linux/prctl.h b/include/linux/prctl.h
> index a3baeb2..44723ce 100644
> --- a/include/linux/prctl.h
> +++ b/include/linux/prctl.h
> @@ -64,6 +64,11 @@
>  #define PR_GET_SECCOMP	21
>  #define PR_SET_SECCOMP	22
> 
> +/* Get/set process seccomp filters */
> +#define PR_GET_SECCOMP_FILTER	35
> +#define PR_SET_SECCOMP_FILTER	36
> +#define PR_CLEAR_SECCOMP_FILTER	37
> +
>  /* Get/set the capability bounding set (as per security/commoncap.c) */
>  #define PR_CAPBSET_READ 23
>  #define PR_CAPBSET_DROP 24
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 18d63ce..3f0bc8d 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1374,7 +1374,7 @@ struct task_struct {
>  	uid_t loginuid;
>  	unsigned int sessionid;
>  #endif
> -	seccomp_t seccomp;
> +	struct seccomp_struct seccomp;
> 
>  /* Thread group tracking */
>     	u32 parent_exec_id;
> diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
> index 167c333..f4434ca 100644
> --- a/include/linux/seccomp.h
> +++ b/include/linux/seccomp.h
> @@ -1,13 +1,33 @@
>  #ifndef _LINUX_SECCOMP_H
>  #define _LINUX_SECCOMP_H
> 
> +struct seq_file;
> 
>  #ifdef CONFIG_SECCOMP
> 
> +#include <linux/errno.h>
>  #include <linux/thread_info.h>
> +#include <linux/types.h>
>  #include <asm/seccomp.h>
> 
> -typedef struct { int mode; } seccomp_t;
> +struct seccomp_filters;
> +/**
> + * struct seccomp_struct - the state of a seccomp'ed process
> + *
> + * @mode:
> + *     if this is 1, the process is under standard seccomp rules
> + *             is 2, the process is only allowed to make system calls where
> + *                   associated filters evaluate successfully.
> + * @filters: Metadata for filters if using CONFIG_SECCOMP_FILTER.
> + *           filters assignment/use should be RCU-protected and its contents
> + *           should never be modified when attached to a seccomp_struct.
> + */
> +struct seccomp_struct {
> +	uint16_t mode;
> +#ifdef CONFIG_SECCOMP_FILTER
> +	struct seccomp_filters *filters;
> +#endif
> +};
> 
>  extern void __secure_computing(int);
>  static inline void secure_computing(int this_syscall)
> @@ -16,15 +36,14 @@ static inline void secure_computing(int this_syscall)
>  		__secure_computing(this_syscall);
>  }
> 
> -extern long prctl_get_seccomp(void);
>  extern long prctl_set_seccomp(unsigned long);
> +extern long prctl_get_seccomp(void);
> 
>  #else /* CONFIG_SECCOMP */
> 
>  #include <linux/errno.h>
> 
> -typedef struct { } seccomp_t;
> -
> +struct seccomp_struct { };
>  #define secure_computing(x) do { } while (0)
> 
>  static inline long prctl_get_seccomp(void)
> @@ -32,11 +51,80 @@ static inline long prctl_get_seccomp(void)
>  	return -EINVAL;
>  }
> 
> -static inline long prctl_set_seccomp(unsigned long arg2)
> +static inline long prctl_set_seccomp(unsigned long a2);
>  {
>  	return -EINVAL;
>  }
> 
>  #endif /* CONFIG_SECCOMP */
> 
> +#ifdef CONFIG_SECCOMP_FILTER
> +
> +#define inherit_tsk_seccomp(_child, _orig) do { \
> +	_child->seccomp.mode = _orig->seccomp.mode; \
> +	_child->seccomp.filters = get_seccomp_filters(_orig->seccomp.filters); \
> +	} while (0)
> +#define put_tsk_seccomp(_tsk) put_seccomp_filters(_tsk->seccomp.filters)
> +
> +extern int seccomp_show_filters(struct seccomp_filters *filters,
> +				struct seq_file *);
> +extern long seccomp_set_filter(int, char *);
> +extern long seccomp_clear_filter(int);
> +extern long seccomp_get_filter(int, char *, unsigned long);
> +
> +extern long prctl_set_seccomp_filter(unsigned long, char __user *);
> +extern long prctl_get_seccomp_filter(unsigned long, char __user *,
> +				     unsigned long);
> +extern long prctl_clear_seccomp_filter(unsigned long);
> +
> +extern struct seccomp_filters *get_seccomp_filters(struct seccomp_filters *);
> +extern void put_seccomp_filters(struct seccomp_filters *);
> +
> +extern int seccomp_test_filters(int);
> +extern void seccomp_filter_log_failure(int);
> +
> +#else  /* CONFIG_SECCOMP_FILTER */
> +
> +struct seccomp_filters { };
> +#define inherit_tsk_seccomp(_child, _orig) do { } while (0)
> +#define put_tsk_seccomp(_tsk) do { } while (0)
> +
> +static inline int seccomp_show_filters(struct seccomp_filters *filters,
> +				       struct seq_file *m)
> +{
> +	return -ENOSYS;
> +}
> +
> +static inline long seccomp_set_filter(int syscall_nr, char *filter)
> +{
> +	return -ENOSYS;
> +}
> +
> +static inline long seccomp_clear_filter(int syscall_nr)
> +{
> +	return -ENOSYS;
> +}
> +
> +static inline long seccomp_get_filter(int syscall_nr,
> +				      char *buf, unsigned long available)
> +{
> +	return -ENOSYS;
> +}
> +
> +static inline long prctl_set_seccomp_filter(unsigned long a2, char __user *a3)
> +{
> +	return -ENOSYS;
> +}
> +
> +static inline long prctl_clear_seccomp_filter(unsigned long a2)
> +{
> +	return -ENOSYS;
> +}
> +
> +static inline long prctl_get_seccomp_filter(unsigned long a2, char __user *a3,
> +					    unsigned long a4)
> +{
> +	return -ENOSYS;
> +}
> +#endif  /* CONFIG_SECCOMP_FILTER */
>  #endif /* _LINUX_SECCOMP_H */
> diff --git a/include/trace/syscall.h b/include/trace/syscall.h
> index 242ae04..e061ad0 100644
> --- a/include/trace/syscall.h
> +++ b/include/trace/syscall.h
> @@ -35,6 +35,8 @@ struct syscall_metadata {
>  extern unsigned long arch_syscall_addr(int nr);
>  extern int init_syscall_trace(struct ftrace_event_call *call);
> 
> +extern struct syscall_metadata *syscall_nr_to_meta(int);
> +
>  extern int reg_event_syscall_enter(struct ftrace_event_call *call);
>  extern void unreg_event_syscall_enter(struct ftrace_event_call *call);
>  extern int reg_event_syscall_exit(struct ftrace_event_call *call);
> @@ -49,6 +51,11 @@ enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags,
>  				      struct trace_event *event);
>  enum print_line_t print_syscall_exit(struct trace_iterator *iter, int flags,
>  				     struct trace_event *event);
> +#else
> +static inline struct syscall_metadata *syscall_nr_to_meta(int nr)
> +{
> +	return NULL;
> +}
>  #endif
> 
>  #ifdef CONFIG_PERF_EVENTS
> diff --git a/kernel/Makefile b/kernel/Makefile
> index 85cbfb3..84e7dfb 100644
> --- a/kernel/Makefile
> +++ b/kernel/Makefile
> @@ -81,6 +81,9 @@ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
>  obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
>  obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
>  obj-$(CONFIG_SECCOMP) += seccomp.o
> +ifeq ($(CONFIG_SECCOMP_FILTER),y)
> +obj-$(CONFIG_SECCOMP) += seccomp_filter.o
> +endif
>  obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
>  obj-$(CONFIG_TREE_RCU) += rcutree.o
>  obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
> diff --git a/kernel/fork.c b/kernel/fork.c
> index e7548de..6f835e0 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -34,6 +34,7 @@
>  #include <linux/cgroup.h>
>  #include <linux/security.h>
>  #include <linux/hugetlb.h>
> +#include <linux/seccomp.h>
>  #include <linux/swap.h>
>  #include <linux/syscalls.h>
>  #include <linux/jiffies.h>
> @@ -169,6 +170,7 @@ void free_task(struct task_struct *tsk)
>  	free_thread_info(tsk->stack);
>  	rt_mutex_debug_task_free(tsk);
>  	ftrace_graph_exit_task(tsk);
> +	put_tsk_seccomp(tsk);
>  	free_task_struct(tsk);
>  }
>  EXPORT_SYMBOL(free_task);
> @@ -280,6 +282,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
>  	if (err)
>  		goto out;
> 
> +	inherit_tsk_seccomp(tsk, orig);
>  	setup_thread_stack(tsk, orig);
>  	clear_user_return_notifier(tsk);
>  	clear_tsk_need_resched(tsk);
> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> index 57d4b13..0a942be 100644
> --- a/kernel/seccomp.c
> +++ b/kernel/seccomp.c
> @@ -2,16 +2,20 @@
>   * linux/kernel/seccomp.c
>   *
>   * Copyright 2004-2005  Andrea Arcangeli <andrea@cpushare.com>
> + * Copyright (C) 2011 The Chromium OS Authors <chromium-os-dev@chromium.org>
>   *
>   * This defines a simple but solid secure-computing mode.
>   */
> 
>  #include <linux/seccomp.h>
>  #include <linux/sched.h>
> +#include <linux/slab.h>
>  #include <linux/compat.h>
> +#include <linux/unistd.h>
> +#include <linux/ftrace_event.h>
> 
> +#define SECCOMP_MAX_FILTER_LENGTH MAX_FILTER_STR_VAL
>  /* #define SECCOMP_DEBUG 1 */
> -#define NR_SECCOMP_MODES 1
> 
>  /*
>   * Secure computing mode 1 allows only read/write/exit/sigreturn.
> @@ -32,10 +36,9 @@ static int mode1_syscalls_32[] = {
> 
>  void __secure_computing(int this_syscall)
>  {
> -	int mode = current->seccomp.mode;
>  	int * syscall;
> 
> -	switch (mode) {
> +	switch (current->seccomp.mode) {
>  	case 1:
>  		syscall = mode1_syscalls;
>  #ifdef CONFIG_COMPAT
> @@ -47,6 +50,17 @@ void __secure_computing(int this_syscall)
>  				return;
>  		} while (*++syscall);
>  		break;
> +#ifdef CONFIG_SECCOMP_FILTER
> +	case 2:
> +		if (this_syscall >= NR_syscalls || this_syscall < 0)
> +			break;
> +
> +		if (!seccomp_test_filters(this_syscall))
> +			return;
> +
> +		seccomp_filter_log_failure(this_syscall);
> +		break;
> +#endif
>  	default:
>  		BUG();
>  	}
> @@ -71,16 +85,22 @@ long prctl_set_seccomp(unsigned long seccomp_mode)
>  	if (unlikely(current->seccomp.mode))
>  		goto out;
> 
> -	ret = -EINVAL;
> -	if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) {
> -		current->seccomp.mode = seccomp_mode;
> -		set_thread_flag(TIF_SECCOMP);
> +	ret = 0;
> +	switch (seccomp_mode) {
> +	case 1:
>  #ifdef TIF_NOTSC
>  		disable_TSC();
>  #endif
> -		ret = 0;
> +#ifdef CONFIG_SECCOMP_FILTER
> +	case 2:
> +#endif
> +		current->seccomp.mode = seccomp_mode;
> +		set_thread_flag(TIF_SECCOMP);
> +		break;
> +	default:
> +		ret = -EINVAL;
>  	}
> 
> - out:
> +out:
>  	return ret;
>  }
> diff --git a/kernel/seccomp_filter.c b/kernel/seccomp_filter.c
> new file mode 100644
> index 0000000..9782f25
> --- /dev/null
> +++ b/kernel/seccomp_filter.c
> @@ -0,0 +1,784 @@
> +/* filter engine-based seccomp system call filtering
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
> + *
> + * Copyright (C) 2011 The Chromium OS Authors <chromium-os-dev@chromium.org>
> + */
> +
> +#include <linux/compat.h>
> +#include <linux/err.h>
> +#include <linux/errno.h>
> +#include <linux/ftrace_event.h>
> +#include <linux/seccomp.h>
> +#include <linux/seq_file.h>
> +#include <linux/sched.h>
> +#include <linux/slab.h>
> +#include <linux/uaccess.h>
> +
> +#include <asm/syscall.h>
> +#include <trace/syscall.h>
> +
> +
> +#define SECCOMP_MAX_FILTER_LENGTH MAX_FILTER_STR_VAL
> +
> +#define SECCOMP_FILTER_ALLOW "1"
> +#define SECCOMP_ACTION_DENY 0xffff
> +#define SECCOMP_ACTION_ALLOW 0xfffe
> +
> +/**
> + * struct seccomp_filters - container for seccomp filterset
> + *
> + * @syscalls: array of 16-bit indices into @event_filters by syscall_nr
> + *            May also be SECCOMP_ACTION_DENY or SECCOMP_ACTION_ALLOW
> + * @event_filters: array of pointers to ftrace event objects
> + * @count: size of @event_filters
> + * @flags: anonymous struct to wrap filters-specific flags
> + * @usage: reference count to simplify use.
> + */
> +struct seccomp_filters {
> +	uint16_t syscalls[NR_syscalls];
> +	struct event_filter **event_filters;
> +	uint16_t count;
> +	struct {
> +		uint32_t compat:1,
> +			 __reserved:31;
> +	} flags;
> +	atomic_t usage;
> +};
> +
> +/* Handle ftrace symbol non-existence */
> +#ifdef CONFIG_FTRACE_SYSCALLS
> +#define create_event_filter(_ef_pptr, _event_type, _str) \
> +	ftrace_parse_filter(_ef_pptr, _event_type, _str)
> +#define get_filter_string(_ef) ftrace_get_filter_string(_ef)
> +#define free_event_filter(_f) ftrace_free_filter(_f)
> +
> +#else
> +
> +#define create_event_filter(_ef_pptr, _event_type, _str) (-ENOSYS)
> +#define get_filter_string(_ef) (NULL)
> +#define free_event_filter(_f) do { } while (0)
> +#endif
> +
> +/**
> + * seccomp_filters_new - allocates a new filters object
> + * @count: count to allocate for the event_filters array
> + *
> + * Returns ERR_PTR on error or an allocated object.
> + */
> +static struct seccomp_filters *seccomp_filters_new(uint16_t count)
> +{
> +	struct seccomp_filters *f;
> +
> +	if (count >= SECCOMP_ACTION_ALLOW)
> +		return ERR_PTR(-EINVAL);
> +
> +	f = kzalloc(sizeof(struct seccomp_filters), GFP_KERNEL);
> +	if (!f)
> +		return ERR_PTR(-ENOMEM);
> +
> +	/* Lazy SECCOMP_ACTION_DENY assignment. */
> +	memset(f->syscalls, 0xff, sizeof(f->syscalls));
> +	atomic_set(&f->usage, 1);
> +
> +	f->event_filters = NULL;
> +	f->count = count;
> +	if (!count)
> +		return f;
> +
> +	f->event_filters = kzalloc(count * sizeof(struct event_filter *),
> +				   GFP_KERNEL);
> +	if (!f->event_filters) {
> +		kfree(f);
> +		f = ERR_PTR(-ENOMEM);
> +	}
> +	return f;
> +}
> +
> +/**
> + * seccomp_filters_free - cleans up the filter list and frees the table
> + * @filters: NULL or live object to be completely destructed.
> + */
> +static void seccomp_filters_free(struct seccomp_filters *filters)
> +{
> +	uint16_t count = 0;
> +	if (!filters)
> +		return;
> +	while (count < filters->count) {
> +		struct event_filter *f = filters->event_filters[count];
> +		free_event_filter(f);
> +		count++;
> +	}
> +	kfree(filters->event_filters);
> +	kfree(filters);
> +}
> +
> +static void __put_seccomp_filters(struct seccomp_filters *orig)
> +{
> +	WARN_ON(atomic_read(&orig->usage));
> +	seccomp_filters_free(orig);
> +}
> +
> +#define seccomp_filter_allow(_id) ((_id) == SECCOMP_ACTION_ALLOW)
> +#define seccomp_filter_deny(_id) ((_id) == SECCOMP_ACTION_DENY)
> +#define seccomp_filter_dynamic(_id) \
> +	(!seccomp_filter_allow(_id) && !seccomp_filter_deny(_id))
> +static inline uint16_t seccomp_filter_id(const struct seccomp_filters *f,
> +					 int syscall_nr)
> +{
> +	if (!f)
> +		return SECCOMP_ACTION_DENY;
> +	return f->syscalls[syscall_nr];
> +}
> +
> +static inline struct event_filter *seccomp_dynamic_filter(
> +		const struct seccomp_filters *filters, uint16_t id)
> +{
> +	if (!seccomp_filter_dynamic(id))
> +		return NULL;
> +	return filters->event_filters[id];
> +}
> +
> +static inline void set_seccomp_filter_id(struct seccomp_filters *filters,
> +					 int syscall_nr, uint16_t id)
> +{
> +	filters->syscalls[syscall_nr] = id;
> +}
> +
> +static inline void set_seccomp_filter(struct seccomp_filters *filters,
> +				      int syscall_nr, uint16_t id,
> +				      struct event_filter *dynamic_filter)
> +{
> +	filters->syscalls[syscall_nr] = id;
> +	if (seccomp_filter_dynamic(id))
> +		filters->event_filters[id] = dynamic_filter;
> +}
> +
> +static struct event_filter *alloc_event_filter(int syscall_nr,
> +					       const char *filter_string)
> +{
> +	struct syscall_metadata *data;
> +	struct event_filter *filter = NULL;
> +	int err;
> +
> +	data = syscall_nr_to_meta(syscall_nr);
> +	/* Argument-based filtering only works on ftrace-hooked syscalls. */
> +	err = -ENOSYS;
> +	if (!data)
> +		goto fail;
> +	err = create_event_filter(&filter,
> +				  data->enter_event->event.type,
> +				  filter_string);
> +	if (err)
> +		goto fail;
> +
> +	return filter;
> +fail:
> +	kfree(filter);
> +	return ERR_PTR(err);
> +}
> +
> +/**
> + * seccomp_filters_copy - copies filters from src to dst.
> + *
> + * @dst: seccomp_filters to populate.
> + * @src: table to read from.
> + * @skip: specifies an entry, by system call, to skip.
> + *
> + * Returns non-zero on failure.
> + * Both the source and the destination should have no simultaneous
> + * writers, and dst should be exclusive to the caller.
> + * If @skip is < 0, it is ignored.
> + */
> +static int seccomp_filters_copy(struct seccomp_filters *dst,
> +				const struct seccomp_filters *src,
> +				int skip)
> +{
> +	int id = 0, ret = 0, nr;
> +	memcpy(&dst->flags, &src->flags, sizeof(src->flags));
> +	memcpy(dst->syscalls, src->syscalls, sizeof(dst->syscalls));
> +	if (!src->count)
> +		goto done;
> +	for (nr = 0; nr < NR_syscalls; ++nr) {
> +		struct event_filter *filter;
> +		const char *str;
> +		uint16_t src_id = seccomp_filter_id(src, nr);
> +		if (nr == skip) {
> +			set_seccomp_filter(dst, nr, SECCOMP_ACTION_DENY,
> +					   NULL);
> +			continue;
> +		}
> +		if (!seccomp_filter_dynamic(src_id))
> +			continue;
> +		if (id >= dst->count) {
> +			ret = -EINVAL;
> +			goto done;
> +		}
> +		str = get_filter_string(seccomp_dynamic_filter(src, src_id));
> +		filter = alloc_event_filter(nr, str);
> +		if (IS_ERR(filter)) {
> +			ret = PTR_ERR(filter);
> +			goto done;
> +		}
> +		set_seccomp_filter(dst, nr, id, filter);
> +		id++;
> +	}
> +
> +done:
> +	return ret;
> +}
> +
> +/**
> + * seccomp_extend_filter - appends more text to a syscall_nr's filter
> + * @filters: unattached filter object to operate on
> + * @syscall_nr: syscall number to update filters for
> + * @filter_string: string to append to the existing filter
> + *
> + * The new string will be &&'d to the original filter string to ensure that it
> + * always matches the existing predicates or less:
> + *   (old_filter) && @filter_string
> + * A new seccomp_filters instance is returned on success and a ERR_PTR on
> + * failure.
> + */
> +static int seccomp_extend_filter(struct seccomp_filters *filters,
> +				 int syscall_nr, char *filter_string)
> +{
> +	struct event_filter *filter;
> +	uint16_t id = seccomp_filter_id(filters, syscall_nr);
> +	char *merged = NULL;
> +	int ret = -EINVAL, expected;
> +
> +	/* No extending with a "1". */
> +	if (!strcmp(SECCOMP_FILTER_ALLOW, filter_string))
> +		goto out;
> +
> +	filter = seccomp_dynamic_filter(filters, id);
> +	ret = -ENOENT;
> +	if (!filter)
> +		goto out;
> +
> +	merged = kzalloc(SECCOMP_MAX_FILTER_LENGTH + 1, GFP_KERNEL);
> +	ret = -ENOMEM;
> +	if (!merged)
> +		goto out;
> +
> +	expected = snprintf(merged, SECCOMP_MAX_FILTER_LENGTH, "(%s) && %s",
> +			    get_filter_string(filter), filter_string);
> +	ret = -E2BIG;
> +	if (expected >= SECCOMP_MAX_FILTER_LENGTH || expected < 0)
> +		goto out;
> +
> +	/* Free the old filter */
> +	free_event_filter(filter);
> +	set_seccomp_filter(filters, syscall_nr, id, NULL);
> +
> +	/* Replace it */
> +	filter = alloc_event_filter(syscall_nr, merged);
> +	if (IS_ERR(filter)) {
> +		ret = PTR_ERR(filter);
> +		goto out;
> +	}
> +	set_seccomp_filter(filters, syscall_nr, id, filter);
> +	ret = 0;
> +
> +out:
> +	kfree(merged);
> +	return ret;
> +}
> +
> +/**
> + * seccomp_add_filter - adds a filter for an unfiltered syscall
> + * @filters: filters object to add a filter/action to
> + * @syscall_nr: system call number to add a filter for
> + * @filter_string: the filter string to apply
> + *
> + * Returns 0 on success and non-zero otherwise.
> + */
> +static int seccomp_add_filter(struct seccomp_filters *filters, int syscall_nr,
> +			      char *filter_string)
> +{
> +	struct event_filter *filter;
> +	int ret = 0;
> +
> +	if (!strcmp(SECCOMP_FILTER_ALLOW, filter_string)) {
> +		set_seccomp_filter(filters, syscall_nr,
> +				   SECCOMP_ACTION_ALLOW, NULL);
> +		goto out;
> +	}
> +
> +	filter = alloc_event_filter(syscall_nr, filter_string);
> +	if (IS_ERR(filter)) {
> +		ret = PTR_ERR(filter);
> +		goto out;
> +	}
> +	/* Always add to the last slot available since additions are
> +	 * are only done one at a time.
> +	 */
> +	set_seccomp_filter(filters, syscall_nr, filters->count - 1, filter);
> +out:
> +	return ret;
> +}
> +
> +/* Wrap optional ftrace syscall support. Returns 1 on match or 0 otherwise. */
> +static int filter_match_current(struct event_filter *event_filter)
> +{
> +	int err = 0;
> +#ifdef CONFIG_FTRACE_SYSCALLS
> +	uint8_t syscall_state[64];
> +
> +	memset(syscall_state, 0, sizeof(syscall_state));
> +
> +	/* The generic tracing entry can remain zeroed. */
> +	err = ftrace_syscall_enter_state(syscall_state, sizeof(syscall_state),
> +					 NULL);
> +	if (err)
> +		return 0;
> +
> +	err = filter_match_preds(event_filter, syscall_state);
> +#endif
> +	return err;
> +}
> +
> +static const char *syscall_nr_to_name(int syscall)
> +{
> +	const char *syscall_name = "unknown";
> +	struct syscall_metadata *data = syscall_nr_to_meta(syscall);
> +	if (data)
> +		syscall_name = data->name;
> +	return syscall_name;
> +}
> +
> +static void filters_set_compat(struct seccomp_filters *filters)
> +{
> +#ifdef CONFIG_COMPAT
> +	if (is_compat_task())
> +		filters->flags.compat = 1;
> +#endif
> +}
> +
> +static inline int filters_compat_mismatch(struct seccomp_filters *filters)
> +{
> +	int ret = 0;
> +	if (!filters)
> +		return 0;
> +#ifdef CONFIG_COMPAT
> +	if (!!(is_compat_task()) == filters->flags.compat)
> +		ret = 1;
> +#endif
> +	return ret;
> +}
> +
> +static inline int syscall_is_execve(int syscall)
> +{
> +	int nr = __NR_execve;
> +#ifdef CONFIG_COMPAT
> +	if (is_compat_task())
> +		nr = __NR_seccomp_execve_32;
> +#endif
> +	return syscall == nr;
> +}
> +
> +#ifndef KSTK_EIP
> +#define KSTK_EIP(x) 0L
> +#endif
> +
> +void seccomp_filter_log_failure(int syscall)
> +{
> +	pr_info("%s[%d]: system call %d (%s) blocked at 0x%lx\n",
> +		current->comm, task_pid_nr(current), syscall,
> +		syscall_nr_to_name(syscall), KSTK_EIP(current));
> +}
> +
> +/* put_seccomp_state - decrements the reference count of @orig and may free. */
> +void put_seccomp_filters(struct seccomp_filters *orig)
> +{
> +	if (!orig)
> +		return;
> +
> +	if (atomic_dec_and_test(&orig->usage))
> +		__put_seccomp_filters(orig);
> +}
> +
> +/* get_seccomp_state - increments the reference count of @orig */
> +struct seccomp_filters *get_seccomp_filters(struct seccomp_filters *orig)

Nit: the name does not match the comment.

> +{
> +	if (!orig)
> +		return NULL;
> +	atomic_inc(&orig->usage);
> +	return orig;

This is called in an RCU read-side critical section.  What exactly is
RCU protecting?  I would expect an rcu_dereference() or one of the
RCU list-traversal primitives somewhere, either here or at the caller.

> +}
> +
> +/**
> + * seccomp_test_filters - tests 'current' against the given syscall
> + * @state: seccomp_state of current to use.
> + * @syscall: number of the system call to test
> + *
> + * Returns 0 on ok and non-zero on error/failure.
> + */
> +int seccomp_test_filters(int syscall)
> +{
> +	uint16_t id;
> +	struct event_filter *filter;
> +	struct seccomp_filters *filters;
> +	int ret = -EACCES;
> +
> +	rcu_read_lock();
> +	filters = get_seccomp_filters(current->seccomp.filters);
> +	rcu_read_unlock();
> +
> +	if (!filters)
> +		goto out;
> +
> +	if (filters_compat_mismatch(filters)) {
> +		pr_info("%s[%d]: seccomp_filter compat() mismatch.\n",
> +			current->comm, task_pid_nr(current));
> +		goto out;
> +	}
> +
> +	/* execve is never allowed. */
> +	if (syscall_is_execve(syscall))
> +		goto out;
> +
> +	ret = 0;
> +	id = seccomp_filter_id(filters, syscall);
> +	if (seccomp_filter_allow(id))
> +		goto out;
> +
> +	ret = -EACCES;
> +	if (!seccomp_filter_dynamic(id))
> +		goto out;
> +
> +	filter = seccomp_dynamic_filter(filters, id);
> +	if (filter && filter_match_current(filter))
> +		ret = 0;
> +out:
> +	put_seccomp_filters(filters);
> +	return ret;
> +}
> +
> +/**
> + * seccomp_show_filters - prints the current filter state to a seq_file
> + * @filters: properly get()'d filters object
> + * @m: the prepared seq_file to receive the data
> + *
> + * Returns 0 on a successful write.
> + */
> +int seccomp_show_filters(struct seccomp_filters *filters, struct seq_file *m)
> +{
> +	int syscall;
> +	seq_printf(m, "Mode: %d\n", current->seccomp.mode);
> +	if (!filters)
> +		goto out;
> +
> +	for (syscall = 0; syscall < NR_syscalls; ++syscall) {
> +		uint16_t id = seccomp_filter_id(filters, syscall);
> +		const char *filter_string = SECCOMP_FILTER_ALLOW;
> +		if (seccomp_filter_deny(id))
> +			continue;
> +		seq_printf(m, "%d (%s): ",
> +			      syscall,
> +			      syscall_nr_to_name(syscall));
> +		if (seccomp_filter_dynamic(id))
> +			filter_string = get_filter_string(
> +					  seccomp_dynamic_filter(filters, id));
> +		seq_printf(m, "%s\n", filter_string);
> +	}
> +out:
> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(seccomp_show_filters);
> +
> +/**
> + * seccomp_get_filter - copies the filter_string into "buf"
> + * @syscall_nr: system call number to look up
> + * @buf: destination buffer
> + * @bufsize: available space in the buffer.
> + *
> + * Context: User context only. This function may sleep on allocation and
> + *          operates on current. current must be attempting a system call
> + *          when this is called.
> + *
> + * Looks up the filter for the given system call number on current.  If found,
> + * the string length of the NUL-terminated buffer is returned and < 0 is
> + * returned on error. The NUL byte is not included in the length.
> + */
> +long seccomp_get_filter(int syscall_nr, char *buf, unsigned long bufsize)
> +{
> +	struct seccomp_filters *filters;
> +	struct event_filter *filter;
> +	long ret = -EINVAL;
> +	uint16_t id;
> +
> +	if (bufsize > SECCOMP_MAX_FILTER_LENGTH)
> +		bufsize = SECCOMP_MAX_FILTER_LENGTH;
> +
> +	rcu_read_lock();
> +	filters = get_seccomp_filters(current->seccomp.filters);
> +	rcu_read_unlock();
> +
> +	if (!filters)
> +		goto out;
> +
> +	ret = -ENOENT;
> +	id = seccomp_filter_id(filters, syscall_nr);
> +	if (seccomp_filter_deny(id))
> +		goto out;
> +
> +	if (seccomp_filter_allow(id)) {
> +		ret = strlcpy(buf, SECCOMP_FILTER_ALLOW, bufsize);
> +		goto copied;
> +	}
> +
> +	filter = seccomp_dynamic_filter(filters, id);
> +	if (!filter)
> +		goto out;
> +	ret = strlcpy(buf, get_filter_string(filter), bufsize);
> +
> +copied:
> +	if (ret >= bufsize) {
> +		ret = -ENOSPC;
> +		goto out;
> +	}
> +	/* Zero out any remaining buffer, just in case. */
> +	memset(buf + ret, 0, bufsize - ret);
> +out:
> +	put_seccomp_filters(filters);
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(seccomp_get_filter);
> +
> +/**
> + * seccomp_clear_filter: clears the seccomp filter for a syscall.
> + * @syscall_nr: the system call number to clear filters for.
> + *
> + * Context: User context only. This function may sleep on allocation and
> + *          operates on current. current must be attempting a system call
> + *          when this is called.
> + *
> + * Returns 0 on success.
> + */
> +long seccomp_clear_filter(int syscall_nr)
> +{
> +	struct seccomp_filters *filters = NULL, *orig_filters;
> +	uint16_t id;
> +	int ret = -EINVAL;
> +
> +	rcu_read_lock();
> +	orig_filters = get_seccomp_filters(current->seccomp.filters);
> +	rcu_read_unlock();
> +
> +	if (!orig_filters)
> +		goto out;
> +
> +	if (filters_compat_mismatch(orig_filters))
> +		goto out;
> +
> +	id = seccomp_filter_id(orig_filters, syscall_nr);
> +	if (seccomp_filter_deny(id))
> +		goto out;
> +
> +	/* Create a new filters object for the task */
> +	if (seccomp_filter_dynamic(id))
> +		filters = seccomp_filters_new(orig_filters->count - 1);
> +	else
> +		filters = seccomp_filters_new(orig_filters->count);
> +
> +	if (IS_ERR(filters)) {
> +		ret = PTR_ERR(filters);
> +		goto out;
> +	}
> +
> +	/* Copy, but drop the requested entry. */
> +	ret = seccomp_filters_copy(filters, orig_filters, syscall_nr);
> +	if (ret)
> +		goto out;
> +	get_seccomp_filters(filters);  /* simplify the out: path */
> +
> +	rcu_assign_pointer(current->seccomp.filters, filters);

What prevents two copies of seccomp_clear_filter() from running
concurrently?

> +	synchronize_rcu();
> +	put_seccomp_filters(orig_filters);  /* for the task */
> +out:
> +	put_seccomp_filters(orig_filters);  /* for the get */
> +	put_seccomp_filters(filters);  /* for the extra get */
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(seccomp_clear_filter);
> +
> +/**
> + * seccomp_set_filter: - Adds/extends a seccomp filter for a syscall.
> + * @syscall_nr: system call number to apply the filter to.
> + * @filter: ftrace filter string to apply.
> + *
> + * Context: User context only. This function may sleep on allocation and
> + *          operates on current. current must be attempting a system call
> + *          when this is called.
> + *
> + * New filters may be added for system calls when the current task is
> + * not in a secure computing mode (seccomp).  Otherwise, existing filters may
> + * be extended.
> + *
> + * Returns 0 on success or an errno on failure.
> + */
> +long seccomp_set_filter(int syscall_nr, char *filter)
> +{
> +	struct seccomp_filters *filters = NULL, *orig_filters = NULL;
> +	uint16_t id;
> +	long ret = -EINVAL;
> +	uint16_t filters_needed;
> +
> +	if (!filter)
> +		goto out;
> +
> +	filter = strstrip(filter);
> +	/* Disallow empty strings. */
> +	if (filter[0] == 0)
> +		goto out;
> +
> +	rcu_read_lock();
> +	orig_filters = get_seccomp_filters(current->seccomp.filters);
> +	rcu_read_unlock();
> +
> +	/* After the first call, compatibility mode is selected permanently. */
> +	ret = -EACCES;
> +	if (filters_compat_mismatch(orig_filters))
> +		goto out;
> +
> +	filters_needed = orig_filters ? orig_filters->count : 0;
> +	id = seccomp_filter_id(orig_filters, syscall_nr);
> +	if (seccomp_filter_deny(id)) {
> +		/* Don't allow DENYs to be changed when in a seccomp mode */
> +		ret = -EACCES;
> +		if (current->seccomp.mode)
> +			goto out;
> +		filters_needed++;
> +	}
> +
> +	filters = seccomp_filters_new(filters_needed);
> +	if (IS_ERR(filters)) {
> +		ret = PTR_ERR(filters);
> +		goto out;
> +	}
> +
> +	filters_set_compat(filters);
> +	if (orig_filters) {
> +		ret = seccomp_filters_copy(filters, orig_filters, -1);
> +		if (ret)
> +			goto out;
> +	}
> +
> +	if (seccomp_filter_deny(id))
> +		ret = seccomp_add_filter(filters, syscall_nr, filter);
> +	else
> +		ret = seccomp_extend_filter(filters, syscall_nr, filter);
> +	if (ret)
> +		goto out;
> +	get_seccomp_filters(filters);  /* simplify the error paths */
> +
> +	rcu_assign_pointer(current->seccomp.filters, filters);

Again, what prevents two copies of seccomp_set_filter() from running
concurrently?

> +	synchronize_rcu();
> +	put_seccomp_filters(orig_filters);  /* for the task */
> +out:
> +	put_seccomp_filters(orig_filters);  /* for the get */
> +	put_seccomp_filters(filters);  /* for get or task, on err */
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(seccomp_set_filter);
> +
> +long prctl_set_seccomp_filter(unsigned long syscall_nr,
> +			      char __user *user_filter)
> +{
> +	int nr;
> +	long ret;
> +	char *filter = NULL;
> +
> +	ret = -EINVAL;
> +	if (syscall_nr >= NR_syscalls)
> +		goto out;
> +
> +	ret = -EFAULT;
> +	if (!user_filter)
> +		goto out;
> +
> +	filter = kzalloc(SECCOMP_MAX_FILTER_LENGTH + 1, GFP_KERNEL);
> +	ret = -ENOMEM;
> +	if (!filter)
> +		goto out;
> +
> +	ret = -EFAULT;
> +	if (strncpy_from_user(filter, user_filter,
> +			      SECCOMP_MAX_FILTER_LENGTH - 1) < 0)
> +		goto out;
> +
> +	nr = (int) syscall_nr;
> +	ret = seccomp_set_filter(nr, filter);
> +
> +out:
> +	kfree(filter);
> +	return ret;
> +}
> +
> +long prctl_clear_seccomp_filter(unsigned long syscall_nr)
> +{
> +	int nr = -1;
> +	long ret;
> +
> +	ret = -EINVAL;
> +	if (syscall_nr >= NR_syscalls)
> +		goto out;
> +
> +	nr = (int) syscall_nr;
> +	ret = seccomp_clear_filter(nr);
> +
> +out:
> +	return ret;
> +}
> +
> +long prctl_get_seccomp_filter(unsigned long syscall_nr, char __user *dst,
> +			      unsigned long available)
> +{
> +	int ret, nr;
> +	unsigned long copied;
> +	char *buf = NULL;
> +	ret = -EINVAL;
> +	if (!available)
> +		goto out;
> +	/* Ignore extra buffer space. */
> +	if (available > SECCOMP_MAX_FILTER_LENGTH)
> +		available = SECCOMP_MAX_FILTER_LENGTH;
> +
> +	ret = -EINVAL;
> +	if (syscall_nr >= NR_syscalls)
> +		goto out;
> +	nr = (int) syscall_nr;
> +
> +	ret = -ENOMEM;
> +	buf = kmalloc(available, GFP_KERNEL);
> +	if (!buf)
> +		goto out;
> +
> +	ret = seccomp_get_filter(nr, buf, available);
> +	if (ret < 0)
> +		goto out;
> +
> +	/* Include the NUL byte in the copy. */
> +	copied = copy_to_user(dst, buf, ret + 1);
> +	ret = -ENOSPC;
> +	if (copied)
> +		goto out;
> +	ret = 0;
> +out:
> +	kfree(buf);
> +	return ret;
> +}
> diff --git a/kernel/sys.c b/kernel/sys.c
> index af468ed..ed60d06 100644
> --- a/kernel/sys.c
> +++ b/kernel/sys.c
> @@ -1698,13 +1698,24 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
>  		case PR_SET_ENDIAN:
>  			error = SET_ENDIAN(me, arg2);
>  			break;
> -
>  		case PR_GET_SECCOMP:
>  			error = prctl_get_seccomp();
>  			break;
>  		case PR_SET_SECCOMP:
>  			error = prctl_set_seccomp(arg2);
>  			break;
> +		case PR_SET_SECCOMP_FILTER:
> +			error = prctl_set_seccomp_filter(arg2,
> +							 (char __user *) arg3);
> +			break;
> +		case PR_CLEAR_SECCOMP_FILTER:
> +			error = prctl_clear_seccomp_filter(arg2);
> +			break;
> +		case PR_GET_SECCOMP_FILTER:
> +			error = prctl_get_seccomp_filter(arg2,
> +							 (char __user *) arg3,
> +							 arg4);
> +			break;
>  		case PR_GET_TSC:
>  			error = GET_TSC_CTL(arg2);
>  			break;
> diff --git a/security/Kconfig b/security/Kconfig
> index 95accd4..c76adf2 100644
> --- a/security/Kconfig
> +++ b/security/Kconfig
> @@ -2,6 +2,10 @@
>  # Security configuration
>  #
> 
> +# Make seccomp filter Kconfig switch below available
> +config HAVE_SECCOMP_FILTER
> +       bool
> +
>  menu "Security options"
> 
>  config KEYS
> @@ -82,6 +86,19 @@ config SECURITY_DMESG_RESTRICT
> 
>  	  If you are unsure how to answer this question, answer N.
> 
> +config SECCOMP_FILTER
> +	bool "Enable seccomp-based system call filtering"
> +	select SECCOMP
> +	depends on HAVE_SECCOMP_FILTER && EXPERIMENTAL
> +	help
> +	  This kernel feature expands CONFIG_SECCOMP to allow computing
> +	  in environments with reduced kernel access dictated by the
> +	  application itself through prctl calls.  If
> +	  CONFIG_FTRACE_SYSCALLS is available, then system call
> +	  argument-based filtering predicates may be used.
> +
> +	  See Documentation/prctl/seccomp_filter.txt for more detail.
> +
>  config SECURITY
>  	bool "Enable different security models"
>  	depends on SYSFS
> -- 
> 1.7.0.4
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH v3 03/13] seccomp_filters: new mode with configurable syscall filters
  2011-06-02 17:36                                                                                                           ` Paul E. McKenney
@ 2011-06-02 18:14                                                                                                             ` Will Drewry
  2011-06-02 19:42                                                                                                               ` Paul E. McKenney
  0 siblings, 1 reply; 406+ messages in thread
From: Will Drewry @ 2011-06-02 18:14 UTC (permalink / raw)
  To: paulmck
  Cc: linux-kernel, kees.cook, torvalds, tglx, mingo, rostedt, jmorris,
	Peter Zijlstra, Frederic Weisbecker, linux-security-module

On Thu, Jun 2, 2011 at 12:36 PM, Paul E. McKenney
<paulmck@linux.vnet.ibm.com> wrote:
> On Tue, May 31, 2011 at 10:10:35PM -0500, Will Drewry wrote:
>> This change adds a new seccomp mode which specifies the allowed system
>> calls dynamically.  When in the new mode (2), all system calls are
>> checked against process-defined filters - first by system call number,
>> then by a filter string.  If an entry exists for a given system call and
>> all filter predicates evaluate to true, then the task may proceed.
>> Otherwise, the task is killed.
>
> A few questions below -- I can't say that I understand the RCU usage.
>
>                                                        Thanx, Paul
>
>> Filter string parsing and evaluation is handled by the ftrace filter
>> engine.  Related patches tweak the perf filter trace and free helpers,
>> allowing the calls to be shared. Filters inherit their understanding of
>> types and arguments for each system call from the CONFIG_FTRACE_SYSCALLS
>> subsystem which already populates this information in syscall_metadata
>> associated enter_event (and exit_event) structures. If
>> CONFIG_FTRACE_SYSCALLS is not compiled in, only filter strings of "1"
>> will be allowed.
>>
>> The net result is a process may have its system calls filtered using the
>> ftrace filter engine's inherent understanding of system calls.  The set
>> of filters is specified through the PR_SET_SECCOMP_FILTER argument in
>> prctl(). For example, a filterset for a process, like pdftotext, that
>> should only process read-only input could (roughly) look like:
>>   sprintf(rdonly, "flags == %u", O_RDONLY|O_LARGEFILE);
>>   prctl(PR_SET_SECCOMP_FILTER, __NR_open, rdonly);
>>   prctl(PR_SET_SECCOMP_FILTER, __NR__llseek, "1");
>>   prctl(PR_SET_SECCOMP_FILTER, __NR_brk, "1");
>>   prctl(PR_SET_SECCOMP_FILTER, __NR_close, "1");
>>   prctl(PR_SET_SECCOMP_FILTER, __NR_exit_group, "1");
>>   prctl(PR_SET_SECCOMP_FILTER, __NR_fstat64, "1");
>>   prctl(PR_SET_SECCOMP_FILTER, __NR_mmap2, "1");
>>   prctl(PR_SET_SECCOMP_FILTER, __NR_munmap, "1");
>>   prctl(PR_SET_SECCOMP_FILTER, __NR_read, "1");
>>   prctl(PR_SET_SECCOMP_FILTER, __NR_write, "(fd == 1 || fd == 2)");
>>   prctl(PR_SET_SECCOMP, 2);
>>
>> Subsequent calls to PR_SET_SECCOMP_FILTER for the same system call will
>> be &&'d together to ensure that attack surface may only be reduced:
>>   prctl(PR_SET_SECCOMP_FILTER, __NR_write, "fd != 2");
>>
>> With the earlier example, the active filter becomes:
>>   "(fd == 1 || fd == 2) && fd != 2"
>>
>> The patch also adds PR_CLEAR_SECCOMP_FILTER and PR_GET_SECCOMP_FILTER.
>> The latter returns the current filter for a system call to userspace:
>>
>>   prctl(PR_GET_SECCOMP_FILTER, __NR_write, buf, bufsize);
>>
>> while the former clears any filters for a given system call, changing it
>> back to a default deny:
>>
>>   prctl(PR_CLEAR_SECCOMP_FILTER, __NR_write);
>>
>> v3: - always block execve calls (as per linus torvalds)
>>     - add __NR_seccomp_execve(_32) to seccomp-supporting arches
>>     - ensure compat tasks can't reach ftrace:syscalls
>>     - dropped new defines for seccomp modes.
>>     - two level array instead of hlists (sugg. by olof johansson)
>>     - added generic Kconfig entry that is not connected.
>>     - dropped internal seccomp.h
>>     - move prctl helpers to seccomp_filter
>>     - killed seccomp_t typedef (as per checkpatch)
>> v2: - changed to use the existing syscall number ABI.
>>     - prctl changes to minimize parsing in the kernel:
>>       prctl(PR_SET_SECCOMP, {0 | 1 | 2 }, { 0 | ON_EXEC });
>>       prctl(PR_SET_SECCOMP_FILTER, __NR_read, "fd == 5");
>>       prctl(PR_CLEAR_SECCOMP_FILTER, __NR_read);
>>       prctl(PR_GET_SECCOMP_FILTER, __NR_read, buf, bufsize);
>>     - defined PR_SECCOMP_MODE_STRICT and ..._FILTER
>>     - added flags
>>     - provide a default fail syscall_nr_to_meta in ftrace
>>     - provides fallback for unhooked system calls
>>     - use -ENOSYS and ERR_PTR(-ENOSYS) for stubbed functionality
>>     - added kernel/seccomp.h to share seccomp.c/seccomp_filter.c
>>     - moved to a hlist and 4 bit hash of linked lists
>>     - added support to operate without CONFIG_FTRACE_SYSCALLS
>>     - moved Kconfig support next to SECCOMP
>>     - made Kconfig entries dependent on EXPERIMENTAL
>>     - added macros to avoid ifdefs from kernel/fork.c
>>     - added compat task/filter matching
>>     - drop seccomp.h inclusion in sched.h and drop seccomp_t
>>     - added Filtering to "show" output
>>     - added on_exec state dup'ing when enabling after a fast-path accept.
>>
>> Signed-off-by: Will Drewry <wad@chromium.org>
>> ---
>>  include/linux/prctl.h   |    5 +
>>  include/linux/sched.h   |    2 +-
>>  include/linux/seccomp.h |   98 ++++++-
>>  include/trace/syscall.h |    7 +
>>  kernel/Makefile         |    3 +
>>  kernel/fork.c           |    3 +
>>  kernel/seccomp.c        |   38 ++-
>>  kernel/seccomp_filter.c |  784 +++++++++++++++++++++++++++++++++++++++++++++++
>>  kernel/sys.c            |   13 +-
>>  security/Kconfig        |   17 +
>>  10 files changed, 954 insertions(+), 16 deletions(-)
>>  create mode 100644 kernel/seccomp_filter.c
>>
>> diff --git a/include/linux/prctl.h b/include/linux/prctl.h
>> index a3baeb2..44723ce 100644
>> --- a/include/linux/prctl.h
>> +++ b/include/linux/prctl.h
>> @@ -64,6 +64,11 @@
>>  #define PR_GET_SECCOMP       21
>>  #define PR_SET_SECCOMP       22
>>
>> +/* Get/set process seccomp filters */
>> +#define PR_GET_SECCOMP_FILTER        35
>> +#define PR_SET_SECCOMP_FILTER        36
>> +#define PR_CLEAR_SECCOMP_FILTER      37
>> +
>>  /* Get/set the capability bounding set (as per security/commoncap.c) */
>>  #define PR_CAPBSET_READ 23
>>  #define PR_CAPBSET_DROP 24
>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>> index 18d63ce..3f0bc8d 100644
>> --- a/include/linux/sched.h
>> +++ b/include/linux/sched.h
>> @@ -1374,7 +1374,7 @@ struct task_struct {
>>       uid_t loginuid;
>>       unsigned int sessionid;
>>  #endif
>> -     seccomp_t seccomp;
>> +     struct seccomp_struct seccomp;
>>
>>  /* Thread group tracking */
>>       u32 parent_exec_id;
>> diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
>> index 167c333..f4434ca 100644
>> --- a/include/linux/seccomp.h
>> +++ b/include/linux/seccomp.h
>> @@ -1,13 +1,33 @@
>>  #ifndef _LINUX_SECCOMP_H
>>  #define _LINUX_SECCOMP_H
>>
>> +struct seq_file;
>>
>>  #ifdef CONFIG_SECCOMP
>>
>> +#include <linux/errno.h>
>>  #include <linux/thread_info.h>
>> +#include <linux/types.h>
>>  #include <asm/seccomp.h>
>>
>> -typedef struct { int mode; } seccomp_t;
>> +struct seccomp_filters;
>> +/**
>> + * struct seccomp_struct - the state of a seccomp'ed process
>> + *
>> + * @mode:
>> + *     if this is 1, the process is under standard seccomp rules
>> + *             is 2, the process is only allowed to make system calls where
>> + *                   associated filters evaluate successfully.
>> + * @filters: Metadata for filters if using CONFIG_SECCOMP_FILTER.
>> + *           filters assignment/use should be RCU-protected and its contents
>> + *           should never be modified when attached to a seccomp_struct.
>> + */
>> +struct seccomp_struct {
>> +     uint16_t mode;
>> +#ifdef CONFIG_SECCOMP_FILTER
>> +     struct seccomp_filters *filters;
>> +#endif
>> +};
>>
>>  extern void __secure_computing(int);
>>  static inline void secure_computing(int this_syscall)
>> @@ -16,15 +36,14 @@ static inline void secure_computing(int this_syscall)
>>               __secure_computing(this_syscall);
>>  }
>>
>> -extern long prctl_get_seccomp(void);
>>  extern long prctl_set_seccomp(unsigned long);
>> +extern long prctl_get_seccomp(void);
>>
>>  #else /* CONFIG_SECCOMP */
>>
>>  #include <linux/errno.h>
>>
>> -typedef struct { } seccomp_t;
>> -
>> +struct seccomp_struct { };
>>  #define secure_computing(x) do { } while (0)
>>
>>  static inline long prctl_get_seccomp(void)
>> @@ -32,11 +51,80 @@ static inline long prctl_get_seccomp(void)
>>       return -EINVAL;
>>  }
>>
>> -static inline long prctl_set_seccomp(unsigned long arg2)
>> +static inline long prctl_set_seccomp(unsigned long a2);
>>  {
>>       return -EINVAL;
>>  }
>>
>>  #endif /* CONFIG_SECCOMP */
>>
>> +#ifdef CONFIG_SECCOMP_FILTER
>> +
>> +#define inherit_tsk_seccomp(_child, _orig) do { \
>> +     _child->seccomp.mode = _orig->seccomp.mode; \
>> +     _child->seccomp.filters = get_seccomp_filters(_orig->seccomp.filters); \
>> +     } while (0)
>> +#define put_tsk_seccomp(_tsk) put_seccomp_filters(_tsk->seccomp.filters)
>> +
>> +extern int seccomp_show_filters(struct seccomp_filters *filters,
>> +                             struct seq_file *);
>> +extern long seccomp_set_filter(int, char *);
>> +extern long seccomp_clear_filter(int);
>> +extern long seccomp_get_filter(int, char *, unsigned long);
>> +
>> +extern long prctl_set_seccomp_filter(unsigned long, char __user *);
>> +extern long prctl_get_seccomp_filter(unsigned long, char __user *,
>> +                                  unsigned long);
>> +extern long prctl_clear_seccomp_filter(unsigned long);
>> +
>> +extern struct seccomp_filters *get_seccomp_filters(struct seccomp_filters *);
>> +extern void put_seccomp_filters(struct seccomp_filters *);
>> +
>> +extern int seccomp_test_filters(int);
>> +extern void seccomp_filter_log_failure(int);
>> +
>> +#else  /* CONFIG_SECCOMP_FILTER */
>> +
>> +struct seccomp_filters { };
>> +#define inherit_tsk_seccomp(_child, _orig) do { } while (0)
>> +#define put_tsk_seccomp(_tsk) do { } while (0)
>> +
>> +static inline int seccomp_show_filters(struct seccomp_filters *filters,
>> +                                    struct seq_file *m)
>> +{
>> +     return -ENOSYS;
>> +}
>> +
>> +static inline long seccomp_set_filter(int syscall_nr, char *filter)
>> +{
>> +     return -ENOSYS;
>> +}
>> +
>> +static inline long seccomp_clear_filter(int syscall_nr)
>> +{
>> +     return -ENOSYS;
>> +}
>> +
>> +static inline long seccomp_get_filter(int syscall_nr,
>> +                                   char *buf, unsigned long available)
>> +{
>> +     return -ENOSYS;
>> +}
>> +
>> +static inline long prctl_set_seccomp_filter(unsigned long a2, char __user *a3)
>> +{
>> +     return -ENOSYS;
>> +}
>> +
>> +static inline long prctl_clear_seccomp_filter(unsigned long a2)
>> +{
>> +     return -ENOSYS;
>> +}
>> +
>> +static inline long prctl_get_seccomp_filter(unsigned long a2, char __user *a3,
>> +                                         unsigned long a4)
>> +{
>> +     return -ENOSYS;
>> +}
>> +#endif  /* CONFIG_SECCOMP_FILTER */
>>  #endif /* _LINUX_SECCOMP_H */
>> diff --git a/include/trace/syscall.h b/include/trace/syscall.h
>> index 242ae04..e061ad0 100644
>> --- a/include/trace/syscall.h
>> +++ b/include/trace/syscall.h
>> @@ -35,6 +35,8 @@ struct syscall_metadata {
>>  extern unsigned long arch_syscall_addr(int nr);
>>  extern int init_syscall_trace(struct ftrace_event_call *call);
>>
>> +extern struct syscall_metadata *syscall_nr_to_meta(int);
>> +
>>  extern int reg_event_syscall_enter(struct ftrace_event_call *call);
>>  extern void unreg_event_syscall_enter(struct ftrace_event_call *call);
>>  extern int reg_event_syscall_exit(struct ftrace_event_call *call);
>> @@ -49,6 +51,11 @@ enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags,
>>                                     struct trace_event *event);
>>  enum print_line_t print_syscall_exit(struct trace_iterator *iter, int flags,
>>                                    struct trace_event *event);
>> +#else
>> +static inline struct syscall_metadata *syscall_nr_to_meta(int nr)
>> +{
>> +     return NULL;
>> +}
>>  #endif
>>
>>  #ifdef CONFIG_PERF_EVENTS
>> diff --git a/kernel/Makefile b/kernel/Makefile
>> index 85cbfb3..84e7dfb 100644
>> --- a/kernel/Makefile
>> +++ b/kernel/Makefile
>> @@ -81,6 +81,9 @@ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
>>  obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
>>  obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
>>  obj-$(CONFIG_SECCOMP) += seccomp.o
>> +ifeq ($(CONFIG_SECCOMP_FILTER),y)
>> +obj-$(CONFIG_SECCOMP) += seccomp_filter.o
>> +endif
>>  obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
>>  obj-$(CONFIG_TREE_RCU) += rcutree.o
>>  obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
>> diff --git a/kernel/fork.c b/kernel/fork.c
>> index e7548de..6f835e0 100644
>> --- a/kernel/fork.c
>> +++ b/kernel/fork.c
>> @@ -34,6 +34,7 @@
>>  #include <linux/cgroup.h>
>>  #include <linux/security.h>
>>  #include <linux/hugetlb.h>
>> +#include <linux/seccomp.h>
>>  #include <linux/swap.h>
>>  #include <linux/syscalls.h>
>>  #include <linux/jiffies.h>
>> @@ -169,6 +170,7 @@ void free_task(struct task_struct *tsk)
>>       free_thread_info(tsk->stack);
>>       rt_mutex_debug_task_free(tsk);
>>       ftrace_graph_exit_task(tsk);
>> +     put_tsk_seccomp(tsk);
>>       free_task_struct(tsk);
>>  }
>>  EXPORT_SYMBOL(free_task);
>> @@ -280,6 +282,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
>>       if (err)
>>               goto out;
>>
>> +     inherit_tsk_seccomp(tsk, orig);
>>       setup_thread_stack(tsk, orig);
>>       clear_user_return_notifier(tsk);
>>       clear_tsk_need_resched(tsk);
>> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
>> index 57d4b13..0a942be 100644
>> --- a/kernel/seccomp.c
>> +++ b/kernel/seccomp.c
>> @@ -2,16 +2,20 @@
>>   * linux/kernel/seccomp.c
>>   *
>>   * Copyright 2004-2005  Andrea Arcangeli <andrea@cpushare.com>
>> + * Copyright (C) 2011 The Chromium OS Authors <chromium-os-dev@chromium.org>
>>   *
>>   * This defines a simple but solid secure-computing mode.
>>   */
>>
>>  #include <linux/seccomp.h>
>>  #include <linux/sched.h>
>> +#include <linux/slab.h>
>>  #include <linux/compat.h>
>> +#include <linux/unistd.h>
>> +#include <linux/ftrace_event.h>
>>
>> +#define SECCOMP_MAX_FILTER_LENGTH MAX_FILTER_STR_VAL
>>  /* #define SECCOMP_DEBUG 1 */
>> -#define NR_SECCOMP_MODES 1
>>
>>  /*
>>   * Secure computing mode 1 allows only read/write/exit/sigreturn.
>> @@ -32,10 +36,9 @@ static int mode1_syscalls_32[] = {
>>
>>  void __secure_computing(int this_syscall)
>>  {
>> -     int mode = current->seccomp.mode;
>>       int * syscall;
>>
>> -     switch (mode) {
>> +     switch (current->seccomp.mode) {
>>       case 1:
>>               syscall = mode1_syscalls;
>>  #ifdef CONFIG_COMPAT
>> @@ -47,6 +50,17 @@ void __secure_computing(int this_syscall)
>>                               return;
>>               } while (*++syscall);
>>               break;
>> +#ifdef CONFIG_SECCOMP_FILTER
>> +     case 2:
>> +             if (this_syscall >= NR_syscalls || this_syscall < 0)
>> +                     break;
>> +
>> +             if (!seccomp_test_filters(this_syscall))
>> +                     return;
>> +
>> +             seccomp_filter_log_failure(this_syscall);
>> +             break;
>> +#endif
>>       default:
>>               BUG();
>>       }
>> @@ -71,16 +85,22 @@ long prctl_set_seccomp(unsigned long seccomp_mode)
>>       if (unlikely(current->seccomp.mode))
>>               goto out;
>>
>> -     ret = -EINVAL;
>> -     if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) {
>> -             current->seccomp.mode = seccomp_mode;
>> -             set_thread_flag(TIF_SECCOMP);
>> +     ret = 0;
>> +     switch (seccomp_mode) {
>> +     case 1:
>>  #ifdef TIF_NOTSC
>>               disable_TSC();
>>  #endif
>> -             ret = 0;
>> +#ifdef CONFIG_SECCOMP_FILTER
>> +     case 2:
>> +#endif
>> +             current->seccomp.mode = seccomp_mode;
>> +             set_thread_flag(TIF_SECCOMP);
>> +             break;
>> +     default:
>> +             ret = -EINVAL;
>>       }
>>
>> - out:
>> +out:
>>       return ret;
>>  }
>> diff --git a/kernel/seccomp_filter.c b/kernel/seccomp_filter.c
>> new file mode 100644
>> index 0000000..9782f25
>> --- /dev/null
>> +++ b/kernel/seccomp_filter.c
>> @@ -0,0 +1,784 @@
>> +/* filter engine-based seccomp system call filtering
>> + *
>> + * This program is free software; you can redistribute it and/or modify
>> + * it under the terms of the GNU General Public License as published by
>> + * the Free Software Foundation; either version 2 of the License, or
>> + * (at your option) any later version.
>> + *
>> + * This program is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>> + * GNU General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU General Public License
>> + * along with this program; if not, write to the Free Software
>> + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
>> + *
>> + * Copyright (C) 2011 The Chromium OS Authors <chromium-os-dev@chromium.org>
>> + */
>> +
>> +#include <linux/compat.h>
>> +#include <linux/err.h>
>> +#include <linux/errno.h>
>> +#include <linux/ftrace_event.h>
>> +#include <linux/seccomp.h>
>> +#include <linux/seq_file.h>
>> +#include <linux/sched.h>
>> +#include <linux/slab.h>
>> +#include <linux/uaccess.h>
>> +
>> +#include <asm/syscall.h>
>> +#include <trace/syscall.h>
>> +
>> +
>> +#define SECCOMP_MAX_FILTER_LENGTH MAX_FILTER_STR_VAL
>> +
>> +#define SECCOMP_FILTER_ALLOW "1"
>> +#define SECCOMP_ACTION_DENY 0xffff
>> +#define SECCOMP_ACTION_ALLOW 0xfffe
>> +
>> +/**
>> + * struct seccomp_filters - container for seccomp filterset
>> + *
>> + * @syscalls: array of 16-bit indices into @event_filters by syscall_nr
>> + *            May also be SECCOMP_ACTION_DENY or SECCOMP_ACTION_ALLOW
>> + * @event_filters: array of pointers to ftrace event objects
>> + * @count: size of @event_filters
>> + * @flags: anonymous struct to wrap filters-specific flags
>> + * @usage: reference count to simplify use.
>> + */
>> +struct seccomp_filters {
>> +     uint16_t syscalls[NR_syscalls];
>> +     struct event_filter **event_filters;
>> +     uint16_t count;
>> +     struct {
>> +             uint32_t compat:1,
>> +                      __reserved:31;
>> +     } flags;
>> +     atomic_t usage;
>> +};
>> +
>> +/* Handle ftrace symbol non-existence */
>> +#ifdef CONFIG_FTRACE_SYSCALLS
>> +#define create_event_filter(_ef_pptr, _event_type, _str) \
>> +     ftrace_parse_filter(_ef_pptr, _event_type, _str)
>> +#define get_filter_string(_ef) ftrace_get_filter_string(_ef)
>> +#define free_event_filter(_f) ftrace_free_filter(_f)
>> +
>> +#else
>> +
>> +#define create_event_filter(_ef_pptr, _event_type, _str) (-ENOSYS)
>> +#define get_filter_string(_ef) (NULL)
>> +#define free_event_filter(_f) do { } while (0)
>> +#endif
>> +
>> +/**
>> + * seccomp_filters_new - allocates a new filters object
>> + * @count: count to allocate for the event_filters array
>> + *
>> + * Returns ERR_PTR on error or an allocated object.
>> + */
>> +static struct seccomp_filters *seccomp_filters_new(uint16_t count)
>> +{
>> +     struct seccomp_filters *f;
>> +
>> +     if (count >= SECCOMP_ACTION_ALLOW)
>> +             return ERR_PTR(-EINVAL);
>> +
>> +     f = kzalloc(sizeof(struct seccomp_filters), GFP_KERNEL);
>> +     if (!f)
>> +             return ERR_PTR(-ENOMEM);
>> +
>> +     /* Lazy SECCOMP_ACTION_DENY assignment. */
>> +     memset(f->syscalls, 0xff, sizeof(f->syscalls));
>> +     atomic_set(&f->usage, 1);
>> +
>> +     f->event_filters = NULL;
>> +     f->count = count;
>> +     if (!count)
>> +             return f;
>> +
>> +     f->event_filters = kzalloc(count * sizeof(struct event_filter *),
>> +                                GFP_KERNEL);
>> +     if (!f->event_filters) {
>> +             kfree(f);
>> +             f = ERR_PTR(-ENOMEM);
>> +     }
>> +     return f;
>> +}
>> +
>> +/**
>> + * seccomp_filters_free - cleans up the filter list and frees the table
>> + * @filters: NULL or live object to be completely destructed.
>> + */
>> +static void seccomp_filters_free(struct seccomp_filters *filters)
>> +{
>> +     uint16_t count = 0;
>> +     if (!filters)
>> +             return;
>> +     while (count < filters->count) {
>> +             struct event_filter *f = filters->event_filters[count];
>> +             free_event_filter(f);
>> +             count++;
>> +     }
>> +     kfree(filters->event_filters);
>> +     kfree(filters);
>> +}
>> +
>> +static void __put_seccomp_filters(struct seccomp_filters *orig)
>> +{
>> +     WARN_ON(atomic_read(&orig->usage));
>> +     seccomp_filters_free(orig);
>> +}
>> +
>> +#define seccomp_filter_allow(_id) ((_id) == SECCOMP_ACTION_ALLOW)
>> +#define seccomp_filter_deny(_id) ((_id) == SECCOMP_ACTION_DENY)
>> +#define seccomp_filter_dynamic(_id) \
>> +     (!seccomp_filter_allow(_id) && !seccomp_filter_deny(_id))
>> +static inline uint16_t seccomp_filter_id(const struct seccomp_filters *f,
>> +                                      int syscall_nr)
>> +{
>> +     if (!f)
>> +             return SECCOMP_ACTION_DENY;
>> +     return f->syscalls[syscall_nr];
>> +}
>> +
>> +static inline struct event_filter *seccomp_dynamic_filter(
>> +             const struct seccomp_filters *filters, uint16_t id)
>> +{
>> +     if (!seccomp_filter_dynamic(id))
>> +             return NULL;
>> +     return filters->event_filters[id];
>> +}
>> +
>> +static inline void set_seccomp_filter_id(struct seccomp_filters *filters,
>> +                                      int syscall_nr, uint16_t id)
>> +{
>> +     filters->syscalls[syscall_nr] = id;
>> +}
>> +
>> +static inline void set_seccomp_filter(struct seccomp_filters *filters,
>> +                                   int syscall_nr, uint16_t id,
>> +                                   struct event_filter *dynamic_filter)
>> +{
>> +     filters->syscalls[syscall_nr] = id;
>> +     if (seccomp_filter_dynamic(id))
>> +             filters->event_filters[id] = dynamic_filter;
>> +}
>> +
>> +static struct event_filter *alloc_event_filter(int syscall_nr,
>> +                                            const char *filter_string)
>> +{
>> +     struct syscall_metadata *data;
>> +     struct event_filter *filter = NULL;
>> +     int err;
>> +
>> +     data = syscall_nr_to_meta(syscall_nr);
>> +     /* Argument-based filtering only works on ftrace-hooked syscalls. */
>> +     err = -ENOSYS;
>> +     if (!data)
>> +             goto fail;
>> +     err = create_event_filter(&filter,
>> +                               data->enter_event->event.type,
>> +                               filter_string);
>> +     if (err)
>> +             goto fail;
>> +
>> +     return filter;
>> +fail:
>> +     kfree(filter);
>> +     return ERR_PTR(err);
>> +}
>> +
>> +/**
>> + * seccomp_filters_copy - copies filters from src to dst.
>> + *
>> + * @dst: seccomp_filters to populate.
>> + * @src: table to read from.
>> + * @skip: specifies an entry, by system call, to skip.
>> + *
>> + * Returns non-zero on failure.
>> + * Both the source and the destination should have no simultaneous
>> + * writers, and dst should be exclusive to the caller.
>> + * If @skip is < 0, it is ignored.
>> + */
>> +static int seccomp_filters_copy(struct seccomp_filters *dst,
>> +                             const struct seccomp_filters *src,
>> +                             int skip)
>> +{
>> +     int id = 0, ret = 0, nr;
>> +     memcpy(&dst->flags, &src->flags, sizeof(src->flags));
>> +     memcpy(dst->syscalls, src->syscalls, sizeof(dst->syscalls));
>> +     if (!src->count)
>> +             goto done;
>> +     for (nr = 0; nr < NR_syscalls; ++nr) {
>> +             struct event_filter *filter;
>> +             const char *str;
>> +             uint16_t src_id = seccomp_filter_id(src, nr);
>> +             if (nr == skip) {
>> +                     set_seccomp_filter(dst, nr, SECCOMP_ACTION_DENY,
>> +                                        NULL);
>> +                     continue;
>> +             }
>> +             if (!seccomp_filter_dynamic(src_id))
>> +                     continue;
>> +             if (id >= dst->count) {
>> +                     ret = -EINVAL;
>> +                     goto done;
>> +             }
>> +             str = get_filter_string(seccomp_dynamic_filter(src, src_id));
>> +             filter = alloc_event_filter(nr, str);
>> +             if (IS_ERR(filter)) {
>> +                     ret = PTR_ERR(filter);
>> +                     goto done;
>> +             }
>> +             set_seccomp_filter(dst, nr, id, filter);
>> +             id++;
>> +     }
>> +
>> +done:
>> +     return ret;
>> +}
>> +
>> +/**
>> + * seccomp_extend_filter - appends more text to a syscall_nr's filter
>> + * @filters: unattached filter object to operate on
>> + * @syscall_nr: syscall number to update filters for
>> + * @filter_string: string to append to the existing filter
>> + *
>> + * The new string will be &&'d to the original filter string to ensure that it
>> + * always matches the existing predicates or less:
>> + *   (old_filter) && @filter_string
>> + * Returns 0 on success and a negative errno value on
>> + * failure.
>> + */
>> +static int seccomp_extend_filter(struct seccomp_filters *filters,
>> +                              int syscall_nr, char *filter_string)
>> +{
>> +     struct event_filter *filter;
>> +     uint16_t id = seccomp_filter_id(filters, syscall_nr);
>> +     char *merged = NULL;
>> +     int ret = -EINVAL, expected;
>> +
>> +     /* No extending with a "1". */
>> +     if (!strcmp(SECCOMP_FILTER_ALLOW, filter_string))
>> +             goto out;
>> +
>> +     filter = seccomp_dynamic_filter(filters, id);
>> +     ret = -ENOENT;
>> +     if (!filter)
>> +             goto out;
>> +
>> +     merged = kzalloc(SECCOMP_MAX_FILTER_LENGTH + 1, GFP_KERNEL);
>> +     ret = -ENOMEM;
>> +     if (!merged)
>> +             goto out;
>> +
>> +     expected = snprintf(merged, SECCOMP_MAX_FILTER_LENGTH, "(%s) && %s",
>> +                         get_filter_string(filter), filter_string);
>> +     ret = -E2BIG;
>> +     if (expected >= SECCOMP_MAX_FILTER_LENGTH || expected < 0)
>> +             goto out;
>> +
>> +     /* Free the old filter */
>> +     free_event_filter(filter);
>> +     set_seccomp_filter(filters, syscall_nr, id, NULL);
>> +
>> +     /* Replace it */
>> +     filter = alloc_event_filter(syscall_nr, merged);
>> +     if (IS_ERR(filter)) {
>> +             ret = PTR_ERR(filter);
>> +             goto out;
>> +     }
>> +     set_seccomp_filter(filters, syscall_nr, id, filter);
>> +     ret = 0;
>> +
>> +out:
>> +     kfree(merged);
>> +     return ret;
>> +}
>> +
>> +/**
>> + * seccomp_add_filter - adds a filter for an unfiltered syscall
>> + * @filters: filters object to add a filter/action to
>> + * @syscall_nr: system call number to add a filter for
>> + * @filter_string: the filter string to apply
>> + *
>> + * Returns 0 on success and non-zero otherwise.
>> + */
>> +static int seccomp_add_filter(struct seccomp_filters *filters, int syscall_nr,
>> +                           char *filter_string)
>> +{
>> +     struct event_filter *filter;
>> +     int ret = 0;
>> +
>> +     if (!strcmp(SECCOMP_FILTER_ALLOW, filter_string)) {
>> +             set_seccomp_filter(filters, syscall_nr,
>> +                                SECCOMP_ACTION_ALLOW, NULL);
>> +             goto out;
>> +     }
>> +
>> +     filter = alloc_event_filter(syscall_nr, filter_string);
>> +     if (IS_ERR(filter)) {
>> +             ret = PTR_ERR(filter);
>> +             goto out;
>> +     }
>> +     /* Always add to the last slot available since additions are
>> +      * only done one at a time.
>> +      */
>> +     set_seccomp_filter(filters, syscall_nr, filters->count - 1, filter);
>> +out:
>> +     return ret;
>> +}
>> +
>> +/* Wrap optional ftrace syscall support. Returns 1 on match or 0 otherwise. */
>> +static int filter_match_current(struct event_filter *event_filter)
>> +{
>> +     int err = 0;
>> +#ifdef CONFIG_FTRACE_SYSCALLS
>> +     uint8_t syscall_state[64];
>> +
>> +     memset(syscall_state, 0, sizeof(syscall_state));
>> +
>> +     /* The generic tracing entry can remain zeroed. */
>> +     err = ftrace_syscall_enter_state(syscall_state, sizeof(syscall_state),
>> +                                      NULL);
>> +     if (err)
>> +             return 0;
>> +
>> +     err = filter_match_preds(event_filter, syscall_state);
>> +#endif
>> +     return err;
>> +}
>> +
>> +static const char *syscall_nr_to_name(int syscall)
>> +{
>> +     const char *syscall_name = "unknown";
>> +     struct syscall_metadata *data = syscall_nr_to_meta(syscall);
>> +     if (data)
>> +             syscall_name = data->name;
>> +     return syscall_name;
>> +}
>> +
>> +static void filters_set_compat(struct seccomp_filters *filters)
>> +{
>> +#ifdef CONFIG_COMPAT
>> +     if (is_compat_task())
>> +             filters->flags.compat = 1;
>> +#endif
>> +}
>> +
>> +static inline int filters_compat_mismatch(struct seccomp_filters *filters)
>> +{
>> +     int ret = 0;
>> +     if (!filters)
>> +             return 0;
>> +#ifdef CONFIG_COMPAT
>> +     if (!!(is_compat_task()) == filters->flags.compat)
>> +             ret = 1;
>> +#endif
>> +     return ret;
>> +}
>> +
>> +static inline int syscall_is_execve(int syscall)
>> +{
>> +     int nr = __NR_execve;
>> +#ifdef CONFIG_COMPAT
>> +     if (is_compat_task())
>> +             nr = __NR_seccomp_execve_32;
>> +#endif
>> +     return syscall == nr;
>> +}
>> +
>> +#ifndef KSTK_EIP
>> +#define KSTK_EIP(x) 0L
>> +#endif
>> +
>> +void seccomp_filter_log_failure(int syscall)
>> +{
>> +     pr_info("%s[%d]: system call %d (%s) blocked at 0x%lx\n",
>> +             current->comm, task_pid_nr(current), syscall,
>> +             syscall_nr_to_name(syscall), KSTK_EIP(current));
>> +}
>> +
>> +/* put_seccomp_state - decrements the reference count of @orig and may free. */
>> +void put_seccomp_filters(struct seccomp_filters *orig)
>> +{
>> +     if (!orig)
>> +             return;
>> +
>> +     if (atomic_dec_and_test(&orig->usage))
>> +             __put_seccomp_filters(orig);
>> +}
>> +
>> +/* get_seccomp_state - increments the reference count of @orig */
>> +struct seccomp_filters *get_seccomp_filters(struct seccomp_filters *orig)
>
> Nit: the name does not match the comment.

Will fix it here and above. Thanks!

>> +{
>> +     if (!orig)
>> +             return NULL;
>> +     atomic_inc(&orig->usage);
>> +     return orig;
>
> This is called in an RCU read-side critical section.  What exactly is
> RCU protecting?  I would expect an rcu_dereference() or one of the
> RCU list-traversal primitives somewhere, either here or at the caller.

Ah, I spaced on rcu_dereference().  The goal was to make the
assignment and replacement of the seccomp_filters pointer
RCU-protected (in seccomp_state) so there's no concern over it being
partially replaced on platforms where pointer assignments are non-atomic
- such as via /proc/<pid>/seccomp_filters access or a call via the
exported symbols.  Object lifetime is managed by reference counting so
that I don't have to worry about extending the RCU read-side critical
section by much or deal with pre-allocations.

I'll add rcu_dereference() to all the get_seccomp_filters() uses where
it makes sense, so that it is called safely.  Just to make sure, does
it make sense to continue to RCU-protect the specific pointer?

>> +}
>> +
>> +/**
>> + * seccomp_test_filters - tests 'current' against the given syscall
>> + * @state: seccomp_state of current to use.
>> + * @syscall: number of the system call to test
>> + *
>> + * Returns 0 on ok and non-zero on error/failure.
>> + */
>> +int seccomp_test_filters(int syscall)
>> +{
>> +     uint16_t id;
>> +     struct event_filter *filter;
>> +     struct seccomp_filters *filters;
>> +     int ret = -EACCES;
>> +
>> +     rcu_read_lock();
>> +     filters = get_seccomp_filters(current->seccomp.filters);
>> +     rcu_read_unlock();
>> +
>> +     if (!filters)
>> +             goto out;
>> +
>> +     if (filters_compat_mismatch(filters)) {
>> +             pr_info("%s[%d]: seccomp_filter compat() mismatch.\n",
>> +                     current->comm, task_pid_nr(current));
>> +             goto out;
>> +     }
>> +
>> +     /* execve is never allowed. */
>> +     if (syscall_is_execve(syscall))
>> +             goto out;
>> +
>> +     ret = 0;
>> +     id = seccomp_filter_id(filters, syscall);
>> +     if (seccomp_filter_allow(id))
>> +             goto out;
>> +
>> +     ret = -EACCES;
>> +     if (!seccomp_filter_dynamic(id))
>> +             goto out;
>> +
>> +     filter = seccomp_dynamic_filter(filters, id);
>> +     if (filter && filter_match_current(filter))
>> +             ret = 0;
>> +out:
>> +     put_seccomp_filters(filters);
>> +     return ret;
>> +}
>> +
>> +/**
>> + * seccomp_show_filters - prints the current filter state to a seq_file
>> + * @filters: properly get()'d filters object
>> + * @m: the prepared seq_file to receive the data
>> + *
>> + * Returns 0 on a successful write.
>> + */
>> +int seccomp_show_filters(struct seccomp_filters *filters, struct seq_file *m)
>> +{
>> +     int syscall;
>> +     seq_printf(m, "Mode: %d\n", current->seccomp.mode);
>> +     if (!filters)
>> +             goto out;
>> +
>> +     for (syscall = 0; syscall < NR_syscalls; ++syscall) {
>> +             uint16_t id = seccomp_filter_id(filters, syscall);
>> +             const char *filter_string = SECCOMP_FILTER_ALLOW;
>> +             if (seccomp_filter_deny(id))
>> +                     continue;
>> +             seq_printf(m, "%d (%s): ",
>> +                           syscall,
>> +                           syscall_nr_to_name(syscall));
>> +             if (seccomp_filter_dynamic(id))
>> +                     filter_string = get_filter_string(
>> +                                       seccomp_dynamic_filter(filters, id));
>> +             seq_printf(m, "%s\n", filter_string);
>> +     }
>> +out:
>> +     return 0;
>> +}
>> +EXPORT_SYMBOL_GPL(seccomp_show_filters);
>> +
>> +/**
>> + * seccomp_get_filter - copies the filter_string into "buf"
>> + * @syscall_nr: system call number to look up
>> + * @buf: destination buffer
>> + * @bufsize: available space in the buffer.
>> + *
>> + * Context: User context only. This function may sleep on allocation and
>> + *          operates on current. current must be attempting a system call
>> + *          when this is called.
>> + *
>> + * Looks up the filter for the given system call number on current.  If found,
>> + * the string length of the NUL-terminated buffer is returned and < 0 is
>> + * returned on error. The NUL byte is not included in the length.
>> + */
>> +long seccomp_get_filter(int syscall_nr, char *buf, unsigned long bufsize)
>> +{
>> +     struct seccomp_filters *filters;
>> +     struct event_filter *filter;
>> +     long ret = -EINVAL;
>> +     uint16_t id;
>> +
>> +     if (bufsize > SECCOMP_MAX_FILTER_LENGTH)
>> +             bufsize = SECCOMP_MAX_FILTER_LENGTH;
>> +
>> +     rcu_read_lock();
>> +     filters = get_seccomp_filters(current->seccomp.filters);
>> +     rcu_read_unlock();
>> +
>> +     if (!filters)
>> +             goto out;
>> +
>> +     ret = -ENOENT;
>> +     id = seccomp_filter_id(filters, syscall_nr);
>> +     if (seccomp_filter_deny(id))
>> +             goto out;
>> +
>> +     if (seccomp_filter_allow(id)) {
>> +             ret = strlcpy(buf, SECCOMP_FILTER_ALLOW, bufsize);
>> +             goto copied;
>> +     }
>> +
>> +     filter = seccomp_dynamic_filter(filters, id);
>> +     if (!filter)
>> +             goto out;
>> +     ret = strlcpy(buf, get_filter_string(filter), bufsize);
>> +
>> +copied:
>> +     if (ret >= bufsize) {
>> +             ret = -ENOSPC;
>> +             goto out;
>> +     }
>> +     /* Zero out any remaining buffer, just in case. */
>> +     memset(buf + ret, 0, bufsize - ret);
>> +out:
>> +     put_seccomp_filters(filters);
>> +     return ret;
>> +}
>> +EXPORT_SYMBOL_GPL(seccomp_get_filter);
>> +
>> +/**
>> + * seccomp_clear_filter: clears the seccomp filter for a syscall.
>> + * @syscall_nr: the system call number to clear filters for.
>> + *
>> + * Context: User context only. This function may sleep on allocation and
>> + *          operates on current. current must be attempting a system call
>> + *          when this is called.
>> + *
>> + * Returns 0 on success.
>> + */
>> +long seccomp_clear_filter(int syscall_nr)
>> +{
>> +     struct seccomp_filters *filters = NULL, *orig_filters;
>> +     uint16_t id;
>> +     int ret = -EINVAL;
>> +
>> +     rcu_read_lock();
>> +     orig_filters = get_seccomp_filters(current->seccomp.filters);
>> +     rcu_read_unlock();
>> +
>> +     if (!orig_filters)
>> +             goto out;
>> +
>> +     if (filters_compat_mismatch(orig_filters))
>> +             goto out;
>> +
>> +     id = seccomp_filter_id(orig_filters, syscall_nr);
>> +     if (seccomp_filter_deny(id))
>> +             goto out;
>> +
>> +     /* Create a new filters object for the task */
>> +     if (seccomp_filter_dynamic(id))
>> +             filters = seccomp_filters_new(orig_filters->count - 1);
>> +     else
>> +             filters = seccomp_filters_new(orig_filters->count);
>> +
>> +     if (IS_ERR(filters)) {
>> +             ret = PTR_ERR(filters);
>> +             goto out;
>> +     }
>> +
>> +     /* Copy, but drop the requested entry. */
>> +     ret = seccomp_filters_copy(filters, orig_filters, syscall_nr);
>> +     if (ret)
>> +             goto out;
>> +     get_seccomp_filters(filters);  /* simplify the out: path */
>> +
>> +     rcu_assign_pointer(current->seccomp.filters, filters);
>
> What prevents two copies of seccomp_clear_filter() from running
> concurrently?

Nothing - the last one wins the assignment, but the objects themselves
should be internally consistent across the parallel calls.  If that's a
concern, a per-task writer mutex could be used just to ensure
simultaneous calls to clear and set are performed serially.  Would
that make more sense?


>> +     synchronize_rcu();
>> +     put_seccomp_filters(orig_filters);  /* for the task */
>> +out:
>> +     put_seccomp_filters(orig_filters);  /* for the get */
>> +     put_seccomp_filters(filters);  /* for the extra get */
>> +     return ret;
>> +}
>> +EXPORT_SYMBOL_GPL(seccomp_clear_filter);
>> +
>> +/**
>> + * seccomp_set_filter: - Adds/extends a seccomp filter for a syscall.
>> + * @syscall_nr: system call number to apply the filter to.
>> + * @filter: ftrace filter string to apply.
>> + *
>> + * Context: User context only. This function may sleep on allocation and
>> + *          operates on current. current must be attempting a system call
>> + *          when this is called.
>> + *
>> + * New filters may be added for system calls when the current task is
>> + * not in a secure computing mode (seccomp).  Otherwise, existing filters may
>> + * be extended.
>> + *
>> + * Returns 0 on success or an errno on failure.
>> + */
>> +long seccomp_set_filter(int syscall_nr, char *filter)
>> +{
>> +     struct seccomp_filters *filters = NULL, *orig_filters = NULL;
>> +     uint16_t id;
>> +     long ret = -EINVAL;
>> +     uint16_t filters_needed;
>> +
>> +     if (!filter)
>> +             goto out;
>> +
>> +     filter = strstrip(filter);
>> +     /* Disallow empty strings. */
>> +     if (filter[0] == 0)
>> +             goto out;
>> +
>> +     rcu_read_lock();
>> +     orig_filters = get_seccomp_filters(current->seccomp.filters);
>> +     rcu_read_unlock();
>> +
>> +     /* After the first call, compatibility mode is selected permanently. */
>> +     ret = -EACCES;
>> +     if (filters_compat_mismatch(orig_filters))
>> +             goto out;
>> +
>> +     filters_needed = orig_filters ? orig_filters->count : 0;
>> +     id = seccomp_filter_id(orig_filters, syscall_nr);
>> +     if (seccomp_filter_deny(id)) {
>> +             /* Don't allow DENYs to be changed when in a seccomp mode */
>> +             ret = -EACCES;
>> +             if (current->seccomp.mode)
>> +                     goto out;
>> +             filters_needed++;
>> +     }
>> +
>> +     filters = seccomp_filters_new(filters_needed);
>> +     if (IS_ERR(filters)) {
>> +             ret = PTR_ERR(filters);
>> +             goto out;
>> +     }
>> +
>> +     filters_set_compat(filters);
>> +     if (orig_filters) {
>> +             ret = seccomp_filters_copy(filters, orig_filters, -1);
>> +             if (ret)
>> +                     goto out;
>> +     }
>> +
>> +     if (seccomp_filter_deny(id))
>> +             ret = seccomp_add_filter(filters, syscall_nr, filter);
>> +     else
>> +             ret = seccomp_extend_filter(filters, syscall_nr, filter);
>> +     if (ret)
>> +             goto out;
>> +     get_seccomp_filters(filters);  /* simplify the error paths */
>> +
>> +     rcu_assign_pointer(current->seccomp.filters, filters);
>
> Again, what prevents two copies of seccomp_set_filter() from running
> concurrently?

Same deal - nothing, but I'd be happy to add a guard if it makes sense.

Thanks!

>> +     synchronize_rcu();
>> +     put_seccomp_filters(orig_filters);  /* for the task */
>> +out:
>> +     put_seccomp_filters(orig_filters);  /* for the get */
>> +     put_seccomp_filters(filters);  /* for get or task, on err */
>> +     return ret;
>> +}
>> +EXPORT_SYMBOL_GPL(seccomp_set_filter);
>> +
>> +long prctl_set_seccomp_filter(unsigned long syscall_nr,
>> +                           char __user *user_filter)
>> +{
>> +     int nr;
>> +     long ret;
>> +     char *filter = NULL;
>> +
>> +     ret = -EINVAL;
>> +     if (syscall_nr >= NR_syscalls)
>> +             goto out;
>> +
>> +     ret = -EFAULT;
>> +     if (!user_filter)
>> +             goto out;
>> +
>> +     filter = kzalloc(SECCOMP_MAX_FILTER_LENGTH + 1, GFP_KERNEL);
>> +     ret = -ENOMEM;
>> +     if (!filter)
>> +             goto out;
>> +
>> +     ret = -EFAULT;
>> +     if (strncpy_from_user(filter, user_filter,
>> +                           SECCOMP_MAX_FILTER_LENGTH - 1) < 0)
>> +             goto out;
>> +
>> +     nr = (int) syscall_nr;
>> +     ret = seccomp_set_filter(nr, filter);
>> +
>> +out:
>> +     kfree(filter);
>> +     return ret;
>> +}
>> +
>> +long prctl_clear_seccomp_filter(unsigned long syscall_nr)
>> +{
>> +     int nr = -1;
>> +     long ret;
>> +
>> +     ret = -EINVAL;
>> +     if (syscall_nr >= NR_syscalls)
>> +             goto out;
>> +
>> +     nr = (int) syscall_nr;
>> +     ret = seccomp_clear_filter(nr);
>> +
>> +out:
>> +     return ret;
>> +}
>> +
>> +long prctl_get_seccomp_filter(unsigned long syscall_nr, char __user *dst,
>> +                           unsigned long available)
>> +{
>> +     int ret, nr;
>> +     unsigned long copied;
>> +     char *buf = NULL;
>> +     ret = -EINVAL;
>> +     if (!available)
>> +             goto out;
>> +     /* Ignore extra buffer space. */
>> +     if (available > SECCOMP_MAX_FILTER_LENGTH)
>> +             available = SECCOMP_MAX_FILTER_LENGTH;
>> +
>> +     ret = -EINVAL;
>> +     if (syscall_nr >= NR_syscalls)
>> +             goto out;
>> +     nr = (int) syscall_nr;
>> +
>> +     ret = -ENOMEM;
>> +     buf = kmalloc(available, GFP_KERNEL);
>> +     if (!buf)
>> +             goto out;
>> +
>> +     ret = seccomp_get_filter(nr, buf, available);
>> +     if (ret < 0)
>> +             goto out;
>> +
>> +     /* Include the NUL byte in the copy. */
>> +     copied = copy_to_user(dst, buf, ret + 1);
>> +     ret = -ENOSPC;
>> +     if (copied)
>> +             goto out;
>> +     ret = 0;
>> +out:
>> +     kfree(buf);
>> +     return ret;
>> +}
>> diff --git a/kernel/sys.c b/kernel/sys.c
>> index af468ed..ed60d06 100644
>> --- a/kernel/sys.c
>> +++ b/kernel/sys.c
>> @@ -1698,13 +1698,24 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
>>               case PR_SET_ENDIAN:
>>                       error = SET_ENDIAN(me, arg2);
>>                       break;
>> -
>>               case PR_GET_SECCOMP:
>>                       error = prctl_get_seccomp();
>>                       break;
>>               case PR_SET_SECCOMP:
>>                       error = prctl_set_seccomp(arg2);
>>                       break;
>> +             case PR_SET_SECCOMP_FILTER:
>> +                     error = prctl_set_seccomp_filter(arg2,
>> +                                                      (char __user *) arg3);
>> +                     break;
>> +             case PR_CLEAR_SECCOMP_FILTER:
>> +                     error = prctl_clear_seccomp_filter(arg2);
>> +                     break;
>> +             case PR_GET_SECCOMP_FILTER:
>> +                     error = prctl_get_seccomp_filter(arg2,
>> +                                                      (char __user *) arg3,
>> +                                                      arg4);
>> +                     break;
>>               case PR_GET_TSC:
>>                       error = GET_TSC_CTL(arg2);
>>                       break;
>> diff --git a/security/Kconfig b/security/Kconfig
>> index 95accd4..c76adf2 100644
>> --- a/security/Kconfig
>> +++ b/security/Kconfig
>> @@ -2,6 +2,10 @@
>>  # Security configuration
>>  #
>>
>> +# Make seccomp filter Kconfig switch below available
>> +config HAVE_SECCOMP_FILTER
>> +       bool
>> +
>>  menu "Security options"
>>
>>  config KEYS
>> @@ -82,6 +86,19 @@ config SECURITY_DMESG_RESTRICT
>>
>>         If you are unsure how to answer this question, answer N.
>>
>> +config SECCOMP_FILTER
>> +     bool "Enable seccomp-based system call filtering"
>> +     select SECCOMP
>> +     depends on HAVE_SECCOMP_FILTER && EXPERIMENTAL
>> +     help
>> +       This kernel feature expands CONFIG_SECCOMP to allow computing
>> +       in environments with reduced kernel access dictated by the
>> +       application itself through prctl calls.  If
>> +       CONFIG_FTRACE_SYSCALLS is available, then system call
>> +       argument-based filtering predicates may be used.
>> +
>> +       See Documentation/prctl/seccomp_filter.txt for more detail.
>> +
>>  config SECURITY
>>       bool "Enable different security models"
>>       depends on SYSFS
>> --
>> 1.7.0.4
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>> Please read the FAQ at  http://www.tux.org/lkml/
>

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH v3 03/13] seccomp_filters: new mode with configurable syscall filters
  2011-06-02 18:14                                                                                                             ` Will Drewry
@ 2011-06-02 19:42                                                                                                               ` Paul E. McKenney
  2011-06-02 20:28                                                                                                                 ` Will Drewry
  0 siblings, 1 reply; 406+ messages in thread
From: Paul E. McKenney @ 2011-06-02 19:42 UTC (permalink / raw)
  To: Will Drewry
  Cc: linux-kernel, kees.cook, torvalds, tglx, mingo, rostedt, jmorris,
	Peter Zijlstra, Frederic Weisbecker, linux-security-module

On Thu, Jun 02, 2011 at 01:14:54PM -0500, Will Drewry wrote:
> On Thu, Jun 2, 2011 at 12:36 PM, Paul E. McKenney
> <paulmck@linux.vnet.ibm.com> wrote:
> > On Tue, May 31, 2011 at 10:10:35PM -0500, Will Drewry wrote:
> >> This change adds a new seccomp mode which specifies the allowed system
> >> calls dynamically.  When in the new mode (2), all system calls are
> >> checked against process-defined filters - first by system call number,
> >> then by a filter string.  If an entry exists for a given system call and
> >> all filter predicates evaluate to true, then the task may proceed.
> >> Otherwise, the task is killed.
> >
> > A few questions below -- I can't say that I understand the RCU usage.
> >
> >                                                        Thanx, Paul
> >
> >> Filter string parsing and evaluation is handled by the ftrace filter
> >> engine.  Related patches tweak to the perf filter trace and free
> >> allowing the calls to be shared. Filters inherit their understanding of
> >> types and arguments for each system call from the CONFIG_FTRACE_SYSCALLS
> >> subsystem which already populates this information in syscall_metadata
> >> associated enter_event (and exit_event) structures. If
> >> CONFIG_FTRACE_SYSCALLS is not compiled in, only filter strings of "1"
> >> will be allowed.
> >>
> >> The net result is a process may have its system calls filtered using the
> >> ftrace filter engine's inherent understanding of system calls.  The set
> >> of filters is specified through the PR_SET_SECCOMP_FILTER argument in
> >> prctl(). For example, a filterset for a process, like pdftotext, that
> >> should only process read-only input could (roughly) look like:
> >>   sprintf(rdonly, "flags == %u", O_RDONLY|O_LARGEFILE);
> >>   prctl(PR_SET_SECCOMP_FILTER, __NR_open, rdonly);
> >>   prctl(PR_SET_SECCOMP_FILTER, __NR__llseek, "1");
> >>   prctl(PR_SET_SECCOMP_FILTER, __NR_brk, "1");
> >>   prctl(PR_SET_SECCOMP_FILTER, __NR_close, "1");
> >>   prctl(PR_SET_SECCOMP_FILTER, __NR_exit_group, "1");
> >>   prctl(PR_SET_SECCOMP_FILTER, __NR_fstat64, "1");
> >>   prctl(PR_SET_SECCOMP_FILTER, __NR_mmap2, "1");
> >>   prctl(PR_SET_SECCOMP_FILTER, __NR_munmap, "1");
> >>   prctl(PR_SET_SECCOMP_FILTER, __NR_read, "1");
> >>   prctl(PR_SET_SECCOMP_FILTER, __NR_write, "(fd == 1 || fd == 2)");
> >>   prctl(PR_SET_SECCOMP, 2);
> >>
> >> Subsequent calls to PR_SET_SECCOMP_FILTER for the same system call will
> >> be &&'d together to ensure that attack surface may only be reduced:
> >>   prctl(PR_SET_SECCOMP_FILTER, __NR_write, "fd != 2");
> >>
> >> With the earlier example, the active filter becomes:
> >>   "(fd == 1 || fd == 2) && fd != 2"
> >>
> >> The patch also adds PR_CLEAR_SECCOMP_FILTER and PR_GET_SECCOMP_FILTER.
> >> The latter returns the current filter for a system call to userspace:
> >>
> >>   prctl(PR_GET_SECCOMP_FILTER, __NR_write, buf, bufsize);
> >>
> >> while the former clears any filters for a given system call changing it
> >> back to a default deny:
> >>
> >>   prctl(PR_CLEAR_SECCOMP_FILTER, __NR_write);
> >>
> >> v3: - always block execve calls (as per linus torvalds)
> >>     - add __NR_seccomp_execve(_32) to seccomp-supporting arches
> >>     - ensure compat tasks can't reach ftrace:syscalls
> >>     - dropped new defines for seccomp modes.
> >>     - two level array instead of hlists (sugg. by olof johansson)
> >>     - added generic Kconfig entry that is not connected.
> >>     - dropped internal seccomp.h
> >>     - move prctl helpers to seccomp_filter
> >>     - killed seccomp_t typedef (as per checkpatch)
> >> v2: - changed to use the existing syscall number ABI.
> >>     - prctl changes to minimize parsing in the kernel:
> >>       prctl(PR_SET_SECCOMP, {0 | 1 | 2 }, { 0 | ON_EXEC });
> >>       prctl(PR_SET_SECCOMP_FILTER, __NR_read, "fd == 5");
> >>       prctl(PR_CLEAR_SECCOMP_FILTER, __NR_read);
> >>       prctl(PR_GET_SECCOMP_FILTER, __NR_read, buf, bufsize);
> >>     - defined PR_SECCOMP_MODE_STRICT and ..._FILTER
> >>     - added flags
> >>     - provide a default fail syscall_nr_to_meta in ftrace
> >>     - provides fallback for unhooked system calls
> >>     - use -ENOSYS and ERR_PTR(-ENOSYS) for stubbed functionality
> >>     - added kernel/seccomp.h to share seccomp.c/seccomp_filter.c
> >>     - moved to a hlist and 4 bit hash of linked lists
> >>     - added support to operate without CONFIG_FTRACE_SYSCALLS
> >>     - moved Kconfig support next to SECCOMP
> >>     - made Kconfig entries dependent on EXPERIMENTAL
> >>     - added macros to avoid ifdefs from kernel/fork.c
> >>     - added compat task/filter matching
> >>     - drop seccomp.h inclusion in sched.h and drop seccomp_t
> >>     - added Filtering to "show" output
> >>     - added on_exec state dup'ing when enabling after a fast-path accept.
> >>
> >> Signed-off-by: Will Drewry <wad@chromium.org>
> >> ---
> >>  include/linux/prctl.h   |    5 +
> >>  include/linux/sched.h   |    2 +-
> >>  include/linux/seccomp.h |   98 ++++++-
> >>  include/trace/syscall.h |    7 +
> >>  kernel/Makefile         |    3 +
> >>  kernel/fork.c           |    3 +
> >>  kernel/seccomp.c        |   38 ++-
> >>  kernel/seccomp_filter.c |  784 +++++++++++++++++++++++++++++++++++++++++++++++
> >>  kernel/sys.c            |   13 +-
> >>  security/Kconfig        |   17 +
> >>  10 files changed, 954 insertions(+), 16 deletions(-)
> >>  create mode 100644 kernel/seccomp_filter.c
> >>
> >> diff --git a/include/linux/prctl.h b/include/linux/prctl.h
> >> index a3baeb2..44723ce 100644
> >> --- a/include/linux/prctl.h
> >> +++ b/include/linux/prctl.h
> >> @@ -64,6 +64,11 @@
> >>  #define PR_GET_SECCOMP       21
> >>  #define PR_SET_SECCOMP       22
> >>
> >> +/* Get/set process seccomp filters */
> >> +#define PR_GET_SECCOMP_FILTER        35
> >> +#define PR_SET_SECCOMP_FILTER        36
> >> +#define PR_CLEAR_SECCOMP_FILTER      37
> >> +
> >>  /* Get/set the capability bounding set (as per security/commoncap.c) */
> >>  #define PR_CAPBSET_READ 23
> >>  #define PR_CAPBSET_DROP 24
> >> diff --git a/include/linux/sched.h b/include/linux/sched.h
> >> index 18d63ce..3f0bc8d 100644
> >> --- a/include/linux/sched.h
> >> +++ b/include/linux/sched.h
> >> @@ -1374,7 +1374,7 @@ struct task_struct {
> >>       uid_t loginuid;
> >>       unsigned int sessionid;
> >>  #endif
> >> -     seccomp_t seccomp;
> >> +     struct seccomp_struct seccomp;
> >>
> >>  /* Thread group tracking */
> >>       u32 parent_exec_id;
> >> diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
> >> index 167c333..f4434ca 100644
> >> --- a/include/linux/seccomp.h
> >> +++ b/include/linux/seccomp.h
> >> @@ -1,13 +1,33 @@
> >>  #ifndef _LINUX_SECCOMP_H
> >>  #define _LINUX_SECCOMP_H
> >>
> >> +struct seq_file;
> >>
> >>  #ifdef CONFIG_SECCOMP
> >>
> >> +#include <linux/errno.h>
> >>  #include <linux/thread_info.h>
> >> +#include <linux/types.h>
> >>  #include <asm/seccomp.h>
> >>
> >> -typedef struct { int mode; } seccomp_t;
> >> +struct seccomp_filters;
> >> +/**
> >> + * struct seccomp_struct - the state of a seccomp'ed process
> >> + *
> >> + * @mode:
> >> + *     if this is 1, the process is under standard seccomp rules
> >> + *             is 2, the process is only allowed to make system calls where
> >> + *                   associated filters evaluate successfully.
> >> + * @filters: Metadata for filters if using CONFIG_SECCOMP_FILTER.
> >> + *           filters assignment/use should be RCU-protected and its contents
> >> + *           should never be modified when attached to a seccomp_struct.
> >> + */
> >> +struct seccomp_struct {
> >> +     uint16_t mode;
> >> +#ifdef CONFIG_SECCOMP_FILTER
> >> +     struct seccomp_filters *filters;
> >> +#endif
> >> +};
> >>
> >>  extern void __secure_computing(int);
> >>  static inline void secure_computing(int this_syscall)
> >> @@ -16,15 +36,14 @@ static inline void secure_computing(int this_syscall)
> >>               __secure_computing(this_syscall);
> >>  }
> >>
> >> -extern long prctl_get_seccomp(void);
> >>  extern long prctl_set_seccomp(unsigned long);
> >> +extern long prctl_get_seccomp(void);
> >>
> >>  #else /* CONFIG_SECCOMP */
> >>
> >>  #include <linux/errno.h>
> >>
> >> -typedef struct { } seccomp_t;
> >> -
> >> +struct seccomp_struct { };
> >>  #define secure_computing(x) do { } while (0)
> >>
> >>  static inline long prctl_get_seccomp(void)
> >> @@ -32,11 +51,80 @@ static inline long prctl_get_seccomp(void)
> >>       return -EINVAL;
> >>  }
> >>
> >> -static inline long prctl_set_seccomp(unsigned long arg2)
> >> +static inline long prctl_set_seccomp(unsigned long a2)
> >>  {
> >>       return -EINVAL;
> >>  }
> >>
> >>  #endif /* CONFIG_SECCOMP */
> >>
> >> +#ifdef CONFIG_SECCOMP_FILTER
> >> +
> >> +#define inherit_tsk_seccomp(_child, _orig) do { \
> >> +     _child->seccomp.mode = _orig->seccomp.mode; \
> >> +     _child->seccomp.filters = get_seccomp_filters(_orig->seccomp.filters); \
> >> +     } while (0)
> >> +#define put_tsk_seccomp(_tsk) put_seccomp_filters(_tsk->seccomp.filters)
> >> +
> >> +extern int seccomp_show_filters(struct seccomp_filters *filters,
> >> +                             struct seq_file *);
> >> +extern long seccomp_set_filter(int, char *);
> >> +extern long seccomp_clear_filter(int);
> >> +extern long seccomp_get_filter(int, char *, unsigned long);
> >> +
> >> +extern long prctl_set_seccomp_filter(unsigned long, char __user *);
> >> +extern long prctl_get_seccomp_filter(unsigned long, char __user *,
> >> +                                  unsigned long);
> >> +extern long prctl_clear_seccomp_filter(unsigned long);
> >> +
> >> +extern struct seccomp_filters *get_seccomp_filters(struct seccomp_filters *);
> >> +extern void put_seccomp_filters(struct seccomp_filters *);
> >> +
> >> +extern int seccomp_test_filters(int);
> >> +extern void seccomp_filter_log_failure(int);
> >> +
> >> +#else  /* CONFIG_SECCOMP_FILTER */
> >> +
> >> +struct seccomp_filters { };
> >> +#define inherit_tsk_seccomp(_child, _orig) do { } while (0)
> >> +#define put_tsk_seccomp(_tsk) do { } while (0)
> >> +
> >> +static inline int seccomp_show_filters(struct seccomp_filters *filters,
> >> +                                    struct seq_file *m)
> >> +{
> >> +     return -ENOSYS;
> >> +}
> >> +
> >> +static inline long seccomp_set_filter(int syscall_nr, char *filter)
> >> +{
> >> +     return -ENOSYS;
> >> +}
> >> +
> >> +static inline long seccomp_clear_filter(int syscall_nr)
> >> +{
> >> +     return -ENOSYS;
> >> +}
> >> +
> >> +static inline long seccomp_get_filter(int syscall_nr,
> >> +                                   char *buf, unsigned long available)
> >> +{
> >> +     return -ENOSYS;
> >> +}
> >> +
> >> +static inline long prctl_set_seccomp_filter(unsigned long a2, char __user *a3)
> >> +{
> >> +     return -ENOSYS;
> >> +}
> >> +
> >> +static inline long prctl_clear_seccomp_filter(unsigned long a2)
> >> +{
> >> +     return -ENOSYS;
> >> +}
> >> +
> >> +static inline long prctl_get_seccomp_filter(unsigned long a2, char __user *a3,
> >> +                                         unsigned long a4)
> >> +{
> >> +     return -ENOSYS;
> >> +}
> >> +#endif  /* CONFIG_SECCOMP_FILTER */
> >>  #endif /* _LINUX_SECCOMP_H */
> >> diff --git a/include/trace/syscall.h b/include/trace/syscall.h
> >> index 242ae04..e061ad0 100644
> >> --- a/include/trace/syscall.h
> >> +++ b/include/trace/syscall.h
> >> @@ -35,6 +35,8 @@ struct syscall_metadata {
> >>  extern unsigned long arch_syscall_addr(int nr);
> >>  extern int init_syscall_trace(struct ftrace_event_call *call);
> >>
> >> +extern struct syscall_metadata *syscall_nr_to_meta(int);
> >> +
> >>  extern int reg_event_syscall_enter(struct ftrace_event_call *call);
> >>  extern void unreg_event_syscall_enter(struct ftrace_event_call *call);
> >>  extern int reg_event_syscall_exit(struct ftrace_event_call *call);
> >> @@ -49,6 +51,11 @@ enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags,
> >>                                     struct trace_event *event);
> >>  enum print_line_t print_syscall_exit(struct trace_iterator *iter, int flags,
> >>                                    struct trace_event *event);
> >> +#else
> >> +static inline struct syscall_metadata *syscall_nr_to_meta(int nr)
> >> +{
> >> +     return NULL;
> >> +}
> >>  #endif
> >>
> >>  #ifdef CONFIG_PERF_EVENTS
> >> diff --git a/kernel/Makefile b/kernel/Makefile
> >> index 85cbfb3..84e7dfb 100644
> >> --- a/kernel/Makefile
> >> +++ b/kernel/Makefile
> >> @@ -81,6 +81,9 @@ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
> >>  obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
> >>  obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
> >>  obj-$(CONFIG_SECCOMP) += seccomp.o
> >> +ifeq ($(CONFIG_SECCOMP_FILTER),y)
> >> +obj-$(CONFIG_SECCOMP) += seccomp_filter.o
> >> +endif
> >>  obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
> >>  obj-$(CONFIG_TREE_RCU) += rcutree.o
> >>  obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
> >> diff --git a/kernel/fork.c b/kernel/fork.c
> >> index e7548de..6f835e0 100644
> >> --- a/kernel/fork.c
> >> +++ b/kernel/fork.c
> >> @@ -34,6 +34,7 @@
> >>  #include <linux/cgroup.h>
> >>  #include <linux/security.h>
> >>  #include <linux/hugetlb.h>
> >> +#include <linux/seccomp.h>
> >>  #include <linux/swap.h>
> >>  #include <linux/syscalls.h>
> >>  #include <linux/jiffies.h>
> >> @@ -169,6 +170,7 @@ void free_task(struct task_struct *tsk)
> >>       free_thread_info(tsk->stack);
> >>       rt_mutex_debug_task_free(tsk);
> >>       ftrace_graph_exit_task(tsk);
> >> +     put_tsk_seccomp(tsk);
> >>       free_task_struct(tsk);
> >>  }
> >>  EXPORT_SYMBOL(free_task);
> >> @@ -280,6 +282,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
> >>       if (err)
> >>               goto out;
> >>
> >> +     inherit_tsk_seccomp(tsk, orig);
> >>       setup_thread_stack(tsk, orig);
> >>       clear_user_return_notifier(tsk);
> >>       clear_tsk_need_resched(tsk);
> >> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> >> index 57d4b13..0a942be 100644
> >> --- a/kernel/seccomp.c
> >> +++ b/kernel/seccomp.c
> >> @@ -2,16 +2,20 @@
> >>   * linux/kernel/seccomp.c
> >>   *
> >>   * Copyright 2004-2005  Andrea Arcangeli <andrea@cpushare.com>
> >> + * Copyright (C) 2011 The Chromium OS Authors <chromium-os-dev@chromium.org>
> >>   *
> >>   * This defines a simple but solid secure-computing mode.
> >>   */
> >>
> >>  #include <linux/seccomp.h>
> >>  #include <linux/sched.h>
> >> +#include <linux/slab.h>
> >>  #include <linux/compat.h>
> >> +#include <linux/unistd.h>
> >> +#include <linux/ftrace_event.h>
> >>
> >> +#define SECCOMP_MAX_FILTER_LENGTH MAX_FILTER_STR_VAL
> >>  /* #define SECCOMP_DEBUG 1 */
> >> -#define NR_SECCOMP_MODES 1
> >>
> >>  /*
> >>   * Secure computing mode 1 allows only read/write/exit/sigreturn.
> >> @@ -32,10 +36,9 @@ static int mode1_syscalls_32[] = {
> >>
> >>  void __secure_computing(int this_syscall)
> >>  {
> >> -     int mode = current->seccomp.mode;
> >>       int * syscall;
> >>
> >> -     switch (mode) {
> >> +     switch (current->seccomp.mode) {
> >>       case 1:
> >>               syscall = mode1_syscalls;
> >>  #ifdef CONFIG_COMPAT
> >> @@ -47,6 +50,17 @@ void __secure_computing(int this_syscall)
> >>                               return;
> >>               } while (*++syscall);
> >>               break;
> >> +#ifdef CONFIG_SECCOMP_FILTER
> >> +     case 2:
> >> +             if (this_syscall >= NR_syscalls || this_syscall < 0)
> >> +                     break;
> >> +
> >> +             if (!seccomp_test_filters(this_syscall))
> >> +                     return;
> >> +
> >> +             seccomp_filter_log_failure(this_syscall);
> >> +             break;
> >> +#endif
> >>       default:
> >>               BUG();
> >>       }
> >> @@ -71,16 +85,22 @@ long prctl_set_seccomp(unsigned long seccomp_mode)
> >>       if (unlikely(current->seccomp.mode))
> >>               goto out;
> >>
> >> -     ret = -EINVAL;
> >> -     if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) {
> >> -             current->seccomp.mode = seccomp_mode;
> >> -             set_thread_flag(TIF_SECCOMP);
> >> +     ret = 0;
> >> +     switch (seccomp_mode) {
> >> +     case 1:
> >>  #ifdef TIF_NOTSC
> >>               disable_TSC();
> >>  #endif
> >> -             ret = 0;
> >> +#ifdef CONFIG_SECCOMP_FILTER
> >> +     case 2:
> >> +#endif
> >> +             current->seccomp.mode = seccomp_mode;
> >> +             set_thread_flag(TIF_SECCOMP);
> >> +             break;
> >> +     default:
> >> +             ret = -EINVAL;
> >>       }
> >>
> >> - out:
> >> +out:
> >>       return ret;
> >>  }
> >> diff --git a/kernel/seccomp_filter.c b/kernel/seccomp_filter.c
> >> new file mode 100644
> >> index 0000000..9782f25
> >> --- /dev/null
> >> +++ b/kernel/seccomp_filter.c
> >> @@ -0,0 +1,784 @@
> >> +/* filter engine-based seccomp system call filtering
> >> + *
> >> + * This program is free software; you can redistribute it and/or modify
> >> + * it under the terms of the GNU General Public License as published by
> >> + * the Free Software Foundation; either version 2 of the License, or
> >> + * (at your option) any later version.
> >> + *
> >> + * This program is distributed in the hope that it will be useful,
> >> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> >> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> >> + * GNU General Public License for more details.
> >> + *
> >> + * You should have received a copy of the GNU General Public License
> >> + * along with this program; if not, write to the Free Software
> >> + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
> >> + *
> >> + * Copyright (C) 2011 The Chromium OS Authors <chromium-os-dev@chromium.org>
> >> + */
> >> +
> >> +#include <linux/compat.h>
> >> +#include <linux/err.h>
> >> +#include <linux/errno.h>
> >> +#include <linux/ftrace_event.h>
> >> +#include <linux/seccomp.h>
> >> +#include <linux/seq_file.h>
> >> +#include <linux/sched.h>
> >> +#include <linux/slab.h>
> >> +#include <linux/uaccess.h>
> >> +
> >> +#include <asm/syscall.h>
> >> +#include <trace/syscall.h>
> >> +
> >> +
> >> +#define SECCOMP_MAX_FILTER_LENGTH MAX_FILTER_STR_VAL
> >> +
> >> +#define SECCOMP_FILTER_ALLOW "1"
> >> +#define SECCOMP_ACTION_DENY 0xffff
> >> +#define SECCOMP_ACTION_ALLOW 0xfffe
> >> +
> >> +/**
> >> + * struct seccomp_filters - container for seccomp filterset
> >> + *
> >> + * @syscalls: array of 16-bit indices into @event_filters by syscall_nr
> >> + *            May also be SECCOMP_ACTION_DENY or SECCOMP_ACTION_ALLOW
> >> + * @event_filters: array of pointers to ftrace event objects
> >> + * @count: size of @event_filters
> >> + * @flags: anonymous struct to wrap filters-specific flags
> >> + * @usage: reference count to simplify use.
> >> + */
> >> +struct seccomp_filters {
> >> +     uint16_t syscalls[NR_syscalls];
> >> +     struct event_filter **event_filters;
> >> +     uint16_t count;
> >> +     struct {
> >> +             uint32_t compat:1,
> >> +                      __reserved:31;
> >> +     } flags;
> >> +     atomic_t usage;
> >> +};
> >> +
> >> +/* Handle ftrace symbol non-existence */
> >> +#ifdef CONFIG_FTRACE_SYSCALLS
> >> +#define create_event_filter(_ef_pptr, _event_type, _str) \
> >> +     ftrace_parse_filter(_ef_pptr, _event_type, _str)
> >> +#define get_filter_string(_ef) ftrace_get_filter_string(_ef)
> >> +#define free_event_filter(_f) ftrace_free_filter(_f)
> >> +
> >> +#else
> >> +
> >> +#define create_event_filter(_ef_pptr, _event_type, _str) (-ENOSYS)
> >> +#define get_filter_string(_ef) (NULL)
> >> +#define free_event_filter(_f) do { } while (0)
> >> +#endif
> >> +
> >> +/**
> >> + * seccomp_filters_new - allocates a new filters object
> >> + * @count: count to allocate for the event_filters array
> >> + *
> >> + * Returns ERR_PTR on error or an allocated object.
> >> + */
> >> +static struct seccomp_filters *seccomp_filters_new(uint16_t count)
> >> +{
> >> +     struct seccomp_filters *f;
> >> +
> >> +     if (count >= SECCOMP_ACTION_ALLOW)
> >> +             return ERR_PTR(-EINVAL);
> >> +
> >> +     f = kzalloc(sizeof(struct seccomp_filters), GFP_KERNEL);
> >> +     if (!f)
> >> +             return ERR_PTR(-ENOMEM);
> >> +
> >> +     /* Lazy SECCOMP_ACTION_DENY assignment. */
> >> +     memset(f->syscalls, 0xff, sizeof(f->syscalls));
> >> +     atomic_set(&f->usage, 1);
> >> +
> >> +     f->event_filters = NULL;
> >> +     f->count = count;
> >> +     if (!count)
> >> +             return f;
> >> +
> >> +     f->event_filters = kzalloc(count * sizeof(struct event_filter *),
> >> +                                GFP_KERNEL);
> >> +     if (!f->event_filters) {
> >> +             kfree(f);
> >> +             f = ERR_PTR(-ENOMEM);
> >> +     }
> >> +     return f;
> >> +}
> >> +
> >> +/**
> >> + * seccomp_filters_free - cleans up the filter list and frees the table
> >> + * @filters: NULL or live object to be completely destructed.
> >> + */
> >> +static void seccomp_filters_free(struct seccomp_filters *filters)
> >> +{
> >> +     uint16_t count = 0;
> >> +     if (!filters)
> >> +             return;
> >> +     while (count < filters->count) {
> >> +             struct event_filter *f = filters->event_filters[count];
> >> +             free_event_filter(f);
> >> +             count++;
> >> +     }
> >> +     kfree(filters->event_filters);
> >> +     kfree(filters);
> >> +}
> >> +
> >> +static void __put_seccomp_filters(struct seccomp_filters *orig)
> >> +{
> >> +     WARN_ON(atomic_read(&orig->usage));
> >> +     seccomp_filters_free(orig);
> >> +}
> >> +
> >> +#define seccomp_filter_allow(_id) ((_id) == SECCOMP_ACTION_ALLOW)
> >> +#define seccomp_filter_deny(_id) ((_id) == SECCOMP_ACTION_DENY)
> >> +#define seccomp_filter_dynamic(_id) \
> >> +     (!seccomp_filter_allow(_id) && !seccomp_filter_deny(_id))
> >> +static inline uint16_t seccomp_filter_id(const struct seccomp_filters *f,
> >> +                                      int syscall_nr)
> >> +{
> >> +     if (!f)
> >> +             return SECCOMP_ACTION_DENY;
> >> +     return f->syscalls[syscall_nr];
> >> +}
> >> +
> >> +static inline struct event_filter *seccomp_dynamic_filter(
> >> +             const struct seccomp_filters *filters, uint16_t id)
> >> +{
> >> +     if (!seccomp_filter_dynamic(id))
> >> +             return NULL;
> >> +     return filters->event_filters[id];
> >> +}
> >> +
> >> +static inline void set_seccomp_filter_id(struct seccomp_filters *filters,
> >> +                                      int syscall_nr, uint16_t id)
> >> +{
> >> +     filters->syscalls[syscall_nr] = id;
> >> +}
> >> +
> >> +static inline void set_seccomp_filter(struct seccomp_filters *filters,
> >> +                                   int syscall_nr, uint16_t id,
> >> +                                   struct event_filter *dynamic_filter)
> >> +{
> >> +     filters->syscalls[syscall_nr] = id;
> >> +     if (seccomp_filter_dynamic(id))
> >> +             filters->event_filters[id] = dynamic_filter;
> >> +}
> >> +
> >> +static struct event_filter *alloc_event_filter(int syscall_nr,
> >> +                                            const char *filter_string)
> >> +{
> >> +     struct syscall_metadata *data;
> >> +     struct event_filter *filter = NULL;
> >> +     int err;
> >> +
> >> +     data = syscall_nr_to_meta(syscall_nr);
> >> +     /* Argument-based filtering only works on ftrace-hooked syscalls. */
> >> +     err = -ENOSYS;
> >> +     if (!data)
> >> +             goto fail;
> >> +     err = create_event_filter(&filter,
> >> +                               data->enter_event->event.type,
> >> +                               filter_string);
> >> +     if (err)
> >> +             goto fail;
> >> +
> >> +     return filter;
> >> +fail:
> >> +     kfree(filter);
> >> +     return ERR_PTR(err);
> >> +}
> >> +
> >> +/**
> >> + * seccomp_filters_copy - copies filters from src to dst.
> >> + *
> >> + * @dst: seccomp_filters to populate.
> >> + * @src: table to read from.
> >> + * @skip: specifies an entry, by system call, to skip.
> >> + *
> >> + * Returns non-zero on failure.
> >> + * Both the source and the destination should have no simultaneous
> >> + * writers, and dst should be exclusive to the caller.
> >> + * If @skip is < 0, it is ignored.
> >> + */
> >> +static int seccomp_filters_copy(struct seccomp_filters *dst,
> >> +                             const struct seccomp_filters *src,
> >> +                             int skip)
> >> +{
> >> +     int id = 0, ret = 0, nr;
> >> +     memcpy(&dst->flags, &src->flags, sizeof(src->flags));
> >> +     memcpy(dst->syscalls, src->syscalls, sizeof(dst->syscalls));
> >> +     if (!src->count)
> >> +             goto done;
> >> +     for (nr = 0; nr < NR_syscalls; ++nr) {
> >> +             struct event_filter *filter;
> >> +             const char *str;
> >> +             uint16_t src_id = seccomp_filter_id(src, nr);
> >> +             if (nr == skip) {
> >> +                     set_seccomp_filter(dst, nr, SECCOMP_ACTION_DENY,
> >> +                                        NULL);
> >> +                     continue;
> >> +             }
> >> +             if (!seccomp_filter_dynamic(src_id))
> >> +                     continue;
> >> +             if (id >= dst->count) {
> >> +                     ret = -EINVAL;
> >> +                     goto done;
> >> +             }
> >> +             str = get_filter_string(seccomp_dynamic_filter(src, src_id));
> >> +             filter = alloc_event_filter(nr, str);
> >> +             if (IS_ERR(filter)) {
> >> +                     ret = PTR_ERR(filter);
> >> +                     goto done;
> >> +             }
> >> +             set_seccomp_filter(dst, nr, id, filter);
> >> +             id++;
> >> +     }
> >> +
> >> +done:
> >> +     return ret;
> >> +}
> >> +
> >> +/**
> >> + * seccomp_extend_filter - appends more text to a syscall_nr's filter
> >> + * @filters: unattached filter object to operate on
> >> + * @syscall_nr: syscall number to update filters for
> >> + * @filter_string: string to append to the existing filter
> >> + *
> >> + * The new string will be &&'d to the original filter string to ensure that it
> >> + * always matches the existing predicates or less:
> >> + *   (old_filter) && @filter_string
> >> + * Returns 0 on success and a negative errno value on
> >> + * failure.
> >> + */
> >> +static int seccomp_extend_filter(struct seccomp_filters *filters,
> >> +                              int syscall_nr, char *filter_string)
> >> +{
> >> +     struct event_filter *filter;
> >> +     uint16_t id = seccomp_filter_id(filters, syscall_nr);
> >> +     char *merged = NULL;
> >> +     int ret = -EINVAL, expected;
> >> +
> >> +     /* No extending with a "1". */
> >> +     if (!strcmp(SECCOMP_FILTER_ALLOW, filter_string))
> >> +             goto out;
> >> +
> >> +     filter = seccomp_dynamic_filter(filters, id);
> >> +     ret = -ENOENT;
> >> +     if (!filter)
> >> +             goto out;
> >> +
> >> +     merged = kzalloc(SECCOMP_MAX_FILTER_LENGTH + 1, GFP_KERNEL);
> >> +     ret = -ENOMEM;
> >> +     if (!merged)
> >> +             goto out;
> >> +
> >> +     expected = snprintf(merged, SECCOMP_MAX_FILTER_LENGTH, "(%s) && %s",
> >> +                         get_filter_string(filter), filter_string);
> >> +     ret = -E2BIG;
> >> +     if (expected >= SECCOMP_MAX_FILTER_LENGTH || expected < 0)
> >> +             goto out;
> >> +
> >> +     /* Free the old filter */
> >> +     free_event_filter(filter);
> >> +     set_seccomp_filter(filters, syscall_nr, id, NULL);
> >> +
> >> +     /* Replace it */
> >> +     filter = alloc_event_filter(syscall_nr, merged);
> >> +     if (IS_ERR(filter)) {
> >> +             ret = PTR_ERR(filter);
> >> +             goto out;
> >> +     }
> >> +     set_seccomp_filter(filters, syscall_nr, id, filter);
> >> +     ret = 0;
> >> +
> >> +out:
> >> +     kfree(merged);
> >> +     return ret;
> >> +}
> >> +
> >> +/**
> >> + * seccomp_add_filter - adds a filter for an unfiltered syscall
> >> + * @filters: filters object to add a filter/action to
> >> + * @syscall_nr: system call number to add a filter for
> >> + * @filter_string: the filter string to apply
> >> + *
> >> + * Returns 0 on success and non-zero otherwise.
> >> + */
> >> +static int seccomp_add_filter(struct seccomp_filters *filters, int syscall_nr,
> >> +                           char *filter_string)
> >> +{
> >> +     struct event_filter *filter;
> >> +     int ret = 0;
> >> +
> >> +     if (!strcmp(SECCOMP_FILTER_ALLOW, filter_string)) {
> >> +             set_seccomp_filter(filters, syscall_nr,
> >> +                                SECCOMP_ACTION_ALLOW, NULL);
> >> +             goto out;
> >> +     }
> >> +
> >> +     filter = alloc_event_filter(syscall_nr, filter_string);
> >> +     if (IS_ERR(filter)) {
> >> +             ret = PTR_ERR(filter);
> >> +             goto out;
> >> +     }
> >> +     /* Always add to the last slot available since additions are
> >> +      * only done one at a time.
> >> +      */
> >> +     set_seccomp_filter(filters, syscall_nr, filters->count - 1, filter);
> >> +out:
> >> +     return ret;
> >> +}
> >> +
> >> +/* Wrap optional ftrace syscall support. Returns 1 on match or 0 otherwise. */
> >> +static int filter_match_current(struct event_filter *event_filter)
> >> +{
> >> +     int err = 0;
> >> +#ifdef CONFIG_FTRACE_SYSCALLS
> >> +     uint8_t syscall_state[64];
> >> +
> >> +     memset(syscall_state, 0, sizeof(syscall_state));
> >> +
> >> +     /* The generic tracing entry can remain zeroed. */
> >> +     err = ftrace_syscall_enter_state(syscall_state, sizeof(syscall_state),
> >> +                                      NULL);
> >> +     if (err)
> >> +             return 0;
> >> +
> >> +     err = filter_match_preds(event_filter, syscall_state);
> >> +#endif
> >> +     return err;
> >> +}
> >> +
> >> +static const char *syscall_nr_to_name(int syscall)
> >> +{
> >> +     const char *syscall_name = "unknown";
> >> +     struct syscall_metadata *data = syscall_nr_to_meta(syscall);
> >> +     if (data)
> >> +             syscall_name = data->name;
> >> +     return syscall_name;
> >> +}
> >> +
> >> +static void filters_set_compat(struct seccomp_filters *filters)
> >> +{
> >> +#ifdef CONFIG_COMPAT
> >> +     if (is_compat_task())
> >> +             filters->flags.compat = 1;
> >> +#endif
> >> +}
> >> +
> >> +static inline int filters_compat_mismatch(struct seccomp_filters *filters)
> >> +{
> >> +     int ret = 0;
> >> +     if (!filters)
> >> +             return 0;
> >> +#ifdef CONFIG_COMPAT
> >> +     if (!!(is_compat_task()) != filters->flags.compat)
> >> +             ret = 1;
> >> +#endif
> >> +     return ret;
> >> +}
> >> +
> >> +static inline int syscall_is_execve(int syscall)
> >> +{
> >> +     int nr = __NR_execve;
> >> +#ifdef CONFIG_COMPAT
> >> +     if (is_compat_task())
> >> +             nr = __NR_seccomp_execve_32;
> >> +#endif
> >> +     return syscall == nr;
> >> +}
> >> +
> >> +#ifndef KSTK_EIP
> >> +#define KSTK_EIP(x) 0L
> >> +#endif
> >> +
> >> +void seccomp_filter_log_failure(int syscall)
> >> +{
> >> +     pr_info("%s[%d]: system call %d (%s) blocked at 0x%lx\n",
> >> +             current->comm, task_pid_nr(current), syscall,
> >> +             syscall_nr_to_name(syscall), KSTK_EIP(current));
> >> +}
> >> +
> >> +/* put_seccomp_state - decrements the reference count of @orig and may free. */
> >> +void put_seccomp_filters(struct seccomp_filters *orig)
> >> +{
> >> +     if (!orig)
> >> +             return;
> >> +
> >> +     if (atomic_dec_and_test(&orig->usage))
> >> +             __put_seccomp_filters(orig);
> >> +}
> >> +
> >> +/* get_seccomp_state - increments the reference count of @orig */
> >> +struct seccomp_filters *get_seccomp_filters(struct seccomp_filters *orig)
> >
> > Nit: the name does not match the comment.
> 
> Will fix it here and above. Thanks!
> 
> >> +{
> >> +     if (!orig)
> >> +             return NULL;
> >> +     atomic_inc(&orig->usage);
> >> +     return orig;
> >
> > This is called in an RCU read-side critical section.  What exactly is
> > RCU protecting?  I would expect an rcu_dereference() or one of the
> > RCU list-traversal primitives somewhere, either here or at the caller.
> 
> Ah, I spaced on rcu_dereference().  The goal was to make the
> assignment and replacement of the seccomp_filters pointer
> RCU-protected (in seccomp_state) so there's no concern over it being
> partially replaced on platforms where pointer assignments are non-atomic
> - such as via /proc/<pid>/seccomp_filters access or a call via the
> exported symbols.  Object lifetime is managed by reference counting so
> that I don't have to worry about extending the RCU read-side critical
> section by much or deal with pre-allocations.
> 
> I'll add rcu_dereference() to all the get_seccomp_filters() uses where
> it makes sense, so that it is called safely.  Just to make sure, does
> it make sense to continue to rcu protect the specific pointer?

It might.  The usual other option is to use a lock outside of the element
containing the reference count to protect reference-count manipulation.
If there is some convenient lock, especially if it is already held where
needed, then locking is more straightforward.  Otherwise, RCU is usually
a reasonable option.

> >> +}
> >> +
> >> +/**
> >> + * seccomp_test_filters - tests 'current' against the given syscall
> >> + * @state: seccomp_state of current to use.
> >> + * @syscall: number of the system call to test
> >> + *
> >> + * Returns 0 on ok and non-zero on error/failure.
> >> + */
> >> +int seccomp_test_filters(int syscall)
> >> +{
> >> +     uint16_t id;
> >> +     struct event_filter *filter;
> >> +     struct seccomp_filters *filters;
> >> +     int ret = -EACCES;
> >> +
> >> +     rcu_read_lock();
> >> +     filters = get_seccomp_filters(current->seccomp.filters);
> >> +     rcu_read_unlock();
> >> +
> >> +     if (!filters)
> >> +             goto out;
> >> +
> >> +     if (filters_compat_mismatch(filters)) {
> >> +             pr_info("%s[%d]: seccomp_filter compat() mismatch.\n",
> >> +                     current->comm, task_pid_nr(current));
> >> +             goto out;
> >> +     }
> >> +
> >> +     /* execve is never allowed. */
> >> +     if (syscall_is_execve(syscall))
> >> +             goto out;
> >> +
> >> +     ret = 0;
> >> +     id = seccomp_filter_id(filters, syscall);
> >> +     if (seccomp_filter_allow(id))
> >> +             goto out;
> >> +
> >> +     ret = -EACCES;
> >> +     if (!seccomp_filter_dynamic(id))
> >> +             goto out;
> >> +
> >> +     filter = seccomp_dynamic_filter(filters, id);
> >> +     if (filter && filter_match_current(filter))
> >> +             ret = 0;
> >> +out:
> >> +     put_seccomp_filters(filters);
> >> +     return ret;
> >> +}
> >> +
> >> +/**
> >> + * seccomp_show_filters - prints the current filter state to a seq_file
> >> + * @filters: properly get()'d filters object
> >> + * @m: the prepared seq_file to receive the data
> >> + *
> >> + * Returns 0 on a successful write.
> >> + */
> >> +int seccomp_show_filters(struct seccomp_filters *filters, struct seq_file *m)
> >> +{
> >> +     int syscall;
> >> +     seq_printf(m, "Mode: %d\n", current->seccomp.mode);
> >> +     if (!filters)
> >> +             goto out;
> >> +
> >> +     for (syscall = 0; syscall < NR_syscalls; ++syscall) {
> >> +             uint16_t id = seccomp_filter_id(filters, syscall);
> >> +             const char *filter_string = SECCOMP_FILTER_ALLOW;
> >> +             if (seccomp_filter_deny(id))
> >> +                     continue;
> >> +             seq_printf(m, "%d (%s): ",
> >> +                           syscall,
> >> +                           syscall_nr_to_name(syscall));
> >> +             if (seccomp_filter_dynamic(id))
> >> +                     filter_string = get_filter_string(
> >> +                                       seccomp_dynamic_filter(filters, id));
> >> +             seq_printf(m, "%s\n", filter_string);
> >> +     }
> >> +out:
> >> +     return 0;
> >> +}
> >> +EXPORT_SYMBOL_GPL(seccomp_show_filters);
> >> +
> >> +/**
> >> + * seccomp_get_filter - copies the filter_string into "buf"
> >> + * @syscall_nr: system call number to look up
> >> + * @buf: destination buffer
> >> + * @bufsize: available space in the buffer.
> >> + *
> >> + * Context: User context only. This function may sleep on allocation and
> >> + *          operates on current. current must be attempting a system call
> >> + *          when this is called.
> >> + *
> >> + * Looks up the filter for the given system call number on current.  If found,
> >> + * the string length of the NUL-terminated buffer is returned and < 0 is
> >> + * returned on error. The NUL byte is not included in the length.
> >> + */
> >> +long seccomp_get_filter(int syscall_nr, char *buf, unsigned long bufsize)
> >> +{
> >> +     struct seccomp_filters *filters;
> >> +     struct event_filter *filter;
> >> +     long ret = -EINVAL;
> >> +     uint16_t id;
> >> +
> >> +     if (bufsize > SECCOMP_MAX_FILTER_LENGTH)
> >> +             bufsize = SECCOMP_MAX_FILTER_LENGTH;
> >> +
> >> +     rcu_read_lock();
> >> +     filters = get_seccomp_filters(current->seccomp.filters);
> >> +     rcu_read_unlock();
> >> +
> >> +     if (!filters)
> >> +             goto out;
> >> +
> >> +     ret = -ENOENT;
> >> +     id = seccomp_filter_id(filters, syscall_nr);
> >> +     if (seccomp_filter_deny(id))
> >> +             goto out;
> >> +
> >> +     if (seccomp_filter_allow(id)) {
> >> +             ret = strlcpy(buf, SECCOMP_FILTER_ALLOW, bufsize);
> >> +             goto copied;
> >> +     }
> >> +
> >> +     filter = seccomp_dynamic_filter(filters, id);
> >> +     if (!filter)
> >> +             goto out;
> >> +     ret = strlcpy(buf, get_filter_string(filter), bufsize);
> >> +
> >> +copied:
> >> +     if (ret >= bufsize) {
> >> +             ret = -ENOSPC;
> >> +             goto out;
> >> +     }
> >> +     /* Zero out any remaining buffer, just in case. */
> >> +     memset(buf + ret, 0, bufsize - ret);
> >> +out:
> >> +     put_seccomp_filters(filters);
> >> +     return ret;
> >> +}
> >> +EXPORT_SYMBOL_GPL(seccomp_get_filter);
> >> +
> >> +/**
> >> + * seccomp_clear_filter: clears the seccomp filter for a syscall.
> >> + * @syscall_nr: the system call number to clear filters for.
> >> + *
> >> + * Context: User context only. This function may sleep on allocation and
> >> + *          operates on current. current must be attempting a system call
> >> + *          when this is called.
> >> + *
> >> + * Returns 0 on success.
> >> + */
> >> +long seccomp_clear_filter(int syscall_nr)
> >> +{
> >> +     struct seccomp_filters *filters = NULL, *orig_filters;
> >> +     uint16_t id;
> >> +     int ret = -EINVAL;
> >> +
> >> +     rcu_read_lock();
> >> +     orig_filters = get_seccomp_filters(current->seccomp.filters);
> >> +     rcu_read_unlock();
> >> +
> >> +     if (!orig_filters)
> >> +             goto out;
> >> +
> >> +     if (filters_compat_mismatch(orig_filters))
> >> +             goto out;
> >> +
> >> +     id = seccomp_filter_id(orig_filters, syscall_nr);
> >> +     if (seccomp_filter_deny(id))
> >> +             goto out;
> >> +
> >> +     /* Create a new filters object for the task */
> >> +     if (seccomp_filter_dynamic(id))
> >> +             filters = seccomp_filters_new(orig_filters->count - 1);
> >> +     else
> >> +             filters = seccomp_filters_new(orig_filters->count);
> >> +
> >> +     if (IS_ERR(filters)) {
> >> +             ret = PTR_ERR(filters);
> >> +             goto out;
> >> +     }
> >> +
> >> +     /* Copy, but drop the requested entry. */
> >> +     ret = seccomp_filters_copy(filters, orig_filters, syscall_nr);
> >> +     if (ret)
> >> +             goto out;
> >> +     get_seccomp_filters(filters);  /* simplify the out: path */
> >> +
> >> +     rcu_assign_pointer(current->seccomp.filters, filters);
> >
> > What prevents two copies of seccomp_clear_filter() from running
> > concurrently?
> 
> Nothing - the last one wins assignment, but the objects themselves
> should be internally consistent to the parallel calls.  If that's a
> concern, a per-task writer mutex could be used just to ensure
> simultaneous calls to clear and set are performed serially.  Would
> that make more sense?

Here is the sequence of events that I am concerned about:

o	CPU 0 sets orig_filters to point to the current filters.

o	CPU 1 sets its local orig_filters to point to the current
	set of filters.

o	Both CPUs allocate new filters and use rcu_assign_pointer()
	to do the update.  As you say, the last one wins, but it appears
	to me that the first one leaks memory.

o	Both CPUs free the object referenced by their orig_filters,
	which might or might not result in a double free, depending
	on exactly what happens below.  (You might actually be OK,
	I didn't check -- leaking memory was enough for me to call
	attention to this.)

So yes, please use some kind of mutual exclusion.  Not sure what you
mean by "per-task mutex", but whatever it is must prevent two different
tasks from acting on the same set of filters at the same time.  The
thing that I call "per-task mutex" would -not- do that.

> >> +     synchronize_rcu();
> >> +     put_seccomp_filters(orig_filters);  /* for the task */
> >> +out:
> >> +     put_seccomp_filters(orig_filters);  /* for the get */
> >> +     put_seccomp_filters(filters);  /* for the extra get */
> >> +     return ret;
> >> +}
> >> +EXPORT_SYMBOL_GPL(seccomp_clear_filter);
> >> +
> >> +/**
> >> + * seccomp_set_filter: - Adds/extends a seccomp filter for a syscall.
> >> + * @syscall_nr: system call number to apply the filter to.
> >> + * @filter: ftrace filter string to apply.
> >> + *
> >> + * Context: User context only. This function may sleep on allocation and
> >> + *          operates on current. current must be attempting a system call
> >> + *          when this is called.
> >> + *
> >> + * New filters may be added for system calls when the current task is
> >> + * not in a secure computing mode (seccomp).  Otherwise, existing filters may
> >> + * be extended.
> >> + *
> >> + * Returns 0 on success or an errno on failure.
> >> + */
> >> +long seccomp_set_filter(int syscall_nr, char *filter)
> >> +{
> >> +     struct seccomp_filters *filters = NULL, *orig_filters = NULL;
> >> +     uint16_t id;
> >> +     long ret = -EINVAL;
> >> +     uint16_t filters_needed;
> >> +
> >> +     if (!filter)
> >> +             goto out;
> >> +
> >> +     filter = strstrip(filter);
> >> +     /* Disallow empty strings. */
> >> +     if (filter[0] == 0)
> >> +             goto out;
> >> +
> >> +     rcu_read_lock();
> >> +     orig_filters = get_seccomp_filters(current->seccomp.filters);
> >> +     rcu_read_unlock();
> >> +
> >> +     /* After the first call, compatibility mode is selected permanently. */
> >> +     ret = -EACCES;
> >> +     if (filters_compat_mismatch(orig_filters))
> >> +             goto out;
> >> +
> >> +     filters_needed = orig_filters ? orig_filters->count : 0;
> >> +     id = seccomp_filter_id(orig_filters, syscall_nr);
> >> +     if (seccomp_filter_deny(id)) {
> >> +             /* Don't allow DENYs to be changed when in a seccomp mode */
> >> +             ret = -EACCES;
> >> +             if (current->seccomp.mode)
> >> +                     goto out;
> >> +             filters_needed++;
> >> +     }
> >> +
> >> +     filters = seccomp_filters_new(filters_needed);
> >> +     if (IS_ERR(filters)) {
> >> +             ret = PTR_ERR(filters);
> >> +             goto out;
> >> +     }
> >> +
> >> +     filters_set_compat(filters);
> >> +     if (orig_filters) {
> >> +             ret = seccomp_filters_copy(filters, orig_filters, -1);
> >> +             if (ret)
> >> +                     goto out;
> >> +     }
> >> +
> >> +     if (seccomp_filter_deny(id))
> >> +             ret = seccomp_add_filter(filters, syscall_nr, filter);
> >> +     else
> >> +             ret = seccomp_extend_filter(filters, syscall_nr, filter);
> >> +     if (ret)
> >> +             goto out;
> >> +     get_seccomp_filters(filters);  /* simplify the error paths */
> >> +
> >> +     rcu_assign_pointer(current->seccomp.filters, filters);
> >
> > Again, what prevents two copies of seccomp_set_filter() from running
> > concurrently?
> 
> Same deal - nothing, but I'd be happy to add a guard if it makes sense.
> 
> Thanks!
> 
> >> +     synchronize_rcu();
> >> +     put_seccomp_filters(orig_filters);  /* for the task */
> >> +out:
> >> +     put_seccomp_filters(orig_filters);  /* for the get */
> >> +     put_seccomp_filters(filters);  /* for get or task, on err */
> >> +     return ret;
> >> +}
> >> +EXPORT_SYMBOL_GPL(seccomp_set_filter);
> >> +
> >> +long prctl_set_seccomp_filter(unsigned long syscall_nr,
> >> +                           char __user *user_filter)
> >> +{
> >> +     int nr;
> >> +     long ret;
> >> +     char *filter = NULL;
> >> +
> >> +     ret = -EINVAL;
> >> +     if (syscall_nr >= NR_syscalls)
> >> +             goto out;
> >> +
> >> +     ret = -EFAULT;
> >> +     if (!user_filter)
> >> +             goto out;
> >> +
> >> +     filter = kzalloc(SECCOMP_MAX_FILTER_LENGTH + 1, GFP_KERNEL);
> >> +     ret = -ENOMEM;
> >> +     if (!filter)
> >> +             goto out;
> >> +
> >> +     ret = -EFAULT;
> >> +     if (strncpy_from_user(filter, user_filter,
> >> +                           SECCOMP_MAX_FILTER_LENGTH - 1) < 0)
> >> +             goto out;
> >> +
> >> +     nr = (int) syscall_nr;
> >> +     ret = seccomp_set_filter(nr, filter);
> >> +
> >> +out:
> >> +     kfree(filter);
> >> +     return ret;
> >> +}
> >> +
> >> +long prctl_clear_seccomp_filter(unsigned long syscall_nr)
> >> +{
> >> +     int nr = -1;
> >> +     long ret;
> >> +
> >> +     ret = -EINVAL;
> >> +     if (syscall_nr >= NR_syscalls)
> >> +             goto out;
> >> +
> >> +     nr = (int) syscall_nr;
> >> +     ret = seccomp_clear_filter(nr);
> >> +
> >> +out:
> >> +     return ret;
> >> +}
> >> +
> >> +long prctl_get_seccomp_filter(unsigned long syscall_nr, char __user *dst,
> >> +                           unsigned long available)
> >> +{
> >> +     int ret, nr;
> >> +     unsigned long copied;
> >> +     char *buf = NULL;
> >> +     ret = -EINVAL;
> >> +     if (!available)
> >> +             goto out;
> >> +     /* Ignore extra buffer space. */
> >> +     if (available > SECCOMP_MAX_FILTER_LENGTH)
> >> +             available = SECCOMP_MAX_FILTER_LENGTH;
> >> +
> >> +     ret = -EINVAL;
> >> +     if (syscall_nr >= NR_syscalls)
> >> +             goto out;
> >> +     nr = (int) syscall_nr;
> >> +
> >> +     ret = -ENOMEM;
> >> +     buf = kmalloc(available, GFP_KERNEL);
> >> +     if (!buf)
> >> +             goto out;
> >> +
> >> +     ret = seccomp_get_filter(nr, buf, available);
> >> +     if (ret < 0)
> >> +             goto out;
> >> +
> >> +     /* Include the NUL byte in the copy. */
> >> +     copied = copy_to_user(dst, buf, ret + 1);
> >> +     ret = -ENOSPC;
> >> +     if (copied)
> >> +             goto out;
> >> +     ret = 0;
> >> +out:
> >> +     kfree(buf);
> >> +     return ret;
> >> +}
> >> diff --git a/kernel/sys.c b/kernel/sys.c
> >> index af468ed..ed60d06 100644
> >> --- a/kernel/sys.c
> >> +++ b/kernel/sys.c
> >> @@ -1698,13 +1698,24 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
> >>               case PR_SET_ENDIAN:
> >>                       error = SET_ENDIAN(me, arg2);
> >>                       break;
> >> -
> >>               case PR_GET_SECCOMP:
> >>                       error = prctl_get_seccomp();
> >>                       break;
> >>               case PR_SET_SECCOMP:
> >>                       error = prctl_set_seccomp(arg2);
> >>                       break;
> >> +             case PR_SET_SECCOMP_FILTER:
> >> +                     error = prctl_set_seccomp_filter(arg2,
> >> +                                                      (char __user *) arg3);
> >> +                     break;
> >> +             case PR_CLEAR_SECCOMP_FILTER:
> >> +                     error = prctl_clear_seccomp_filter(arg2);
> >> +                     break;
> >> +             case PR_GET_SECCOMP_FILTER:
> >> +                     error = prctl_get_seccomp_filter(arg2,
> >> +                                                      (char __user *) arg3,
> >> +                                                      arg4);
> >> +                     break;
> >>               case PR_GET_TSC:
> >>                       error = GET_TSC_CTL(arg2);
> >>                       break;
> >> diff --git a/security/Kconfig b/security/Kconfig
> >> index 95accd4..c76adf2 100644
> >> --- a/security/Kconfig
> >> +++ b/security/Kconfig
> >> @@ -2,6 +2,10 @@
> >>  # Security configuration
> >>  #
> >>
> >> +# Make seccomp filter Kconfig switch below available
> >> +config HAVE_SECCOMP_FILTER
> >> +       bool
> >> +
> >>  menu "Security options"
> >>
> >>  config KEYS
> >> @@ -82,6 +86,19 @@ config SECURITY_DMESG_RESTRICT
> >>
> >>         If you are unsure how to answer this question, answer N.
> >>
> >> +config SECCOMP_FILTER
> >> +     bool "Enable seccomp-based system call filtering"
> >> +     select SECCOMP
> >> +     depends on HAVE_SECCOMP_FILTER && EXPERIMENTAL
> >> +     help
> >> +       This kernel feature expands CONFIG_SECCOMP to allow computing
> >> +       in environments with reduced kernel access dictated by the
> >> +       application itself through prctl calls.  If
> >> +       CONFIG_FTRACE_SYSCALLS is available, then system call
> >> +       argument-based filtering predicates may be used.
> >> +
> >> +       See Documentation/prctl/seccomp_filter.txt for more detail.
> >> +
> >>  config SECURITY
> >>       bool "Enable different security models"
> >>       depends on SYSFS
> >> --
> >> 1.7.0.4
> >>
> >> --
> >> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> >> the body of a message to majordomo@vger.kernel.org
> >> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> >> Please read the FAQ at  http://www.tux.org/lkml/
> >
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH v3 03/13] seccomp_filters: new mode with configurable syscall filters
  2011-06-02 19:42                                                                                                               ` Paul E. McKenney
@ 2011-06-02 20:28                                                                                                                 ` Will Drewry
  2011-06-02 20:46                                                                                                                   ` Steven Rostedt
  0 siblings, 1 reply; 406+ messages in thread
From: Will Drewry @ 2011-06-02 20:28 UTC (permalink / raw)
  To: paulmck
  Cc: linux-kernel, kees.cook, torvalds, tglx, mingo, rostedt, jmorris,
	Peter Zijlstra, Frederic Weisbecker, linux-security-module

On Thu, Jun 2, 2011 at 2:42 PM, Paul E. McKenney
<paulmck@linux.vnet.ibm.com> wrote:
> On Thu, Jun 02, 2011 at 01:14:54PM -0500, Will Drewry wrote:
>> On Thu, Jun 2, 2011 at 12:36 PM, Paul E. McKenney
>> <paulmck@linux.vnet.ibm.com> wrote:
>> > On Tue, May 31, 2011 at 10:10:35PM -0500, Will Drewry wrote:
>> >> This change adds a new seccomp mode which specifies the allowed system
>> >> calls dynamically.  When in the new mode (2), all system calls are
>> >> checked against process-defined filters - first by system call number,
>> >> then by a filter string.  If an entry exists for a given system call and
>> >> all filter predicates evaluate to true, then the task may proceed.
>> >> Otherwise, the task is killed.
>> >
>> > A few questions below -- I can't say that I understand the RCU usage.
>> >
>> >                                                        Thanx, Paul
>> >
>> >> Filter string parsing and evaluation is handled by the ftrace filter
>> >> engine.  Related patches tweak the perf filter trace and free calls,
>> >> allowing them to be shared. Filters inherit their understanding of
>> >> types and arguments for each system call from the CONFIG_FTRACE_SYSCALLS
>> >> subsystem which already populates this information in syscall_metadata
>> >> associated enter_event (and exit_event) structures. If
>> >> CONFIG_FTRACE_SYSCALLS is not compiled in, only filter strings of "1"
>> >> will be allowed.
>> >>
>> >> The net result is a process may have its system calls filtered using the
>> >> ftrace filter engine's inherent understanding of system calls.  The set
>> >> of filters is specified through the PR_SET_SECCOMP_FILTER argument in
>> >> prctl(). For example, a filterset for a process, like pdftotext, that
>> >> should only process read-only input could (roughly) look like:
>> >>   sprintf(rdonly, "flags == %u", O_RDONLY|O_LARGEFILE);
>> >>   prctl(PR_SET_SECCOMP_FILTER, __NR_open, rdonly);
>> >>   prctl(PR_SET_SECCOMP_FILTER, __NR__llseek, "1");
>> >>   prctl(PR_SET_SECCOMP_FILTER, __NR_brk, "1");
>> >>   prctl(PR_SET_SECCOMP_FILTER, __NR_close, "1");
>> >>   prctl(PR_SET_SECCOMP_FILTER, __NR_exit_group, "1");
>> >>   prctl(PR_SET_SECCOMP_FILTER, __NR_fstat64, "1");
>> >>   prctl(PR_SET_SECCOMP_FILTER, __NR_mmap2, "1");
>> >>   prctl(PR_SET_SECCOMP_FILTER, __NR_munmap, "1");
>> >>   prctl(PR_SET_SECCOMP_FILTER, __NR_read, "1");
>> >>   prctl(PR_SET_SECCOMP_FILTER, __NR_write, "(fd == 1 || fd == 2)");
>> >>   prctl(PR_SET_SECCOMP, 2);
>> >>
>> >> Subsequent calls to PR_SET_SECCOMP_FILTER for the same system call will
>> >> be &&'d together to ensure that attack surface may only be reduced:
>> >>   prctl(PR_SET_SECCOMP_FILTER, __NR_write, "fd != 2");
>> >>
>> >> With the earlier example, the active filter becomes:
>> >>   "(fd == 1 || fd == 2) && fd != 2"
>> >>
>> >> The patch also adds PR_CLEAR_SECCOMP_FILTER and PR_GET_SECCOMP_FILTER.
>> >> The latter returns the current filter for a system call to userspace:
>> >>
>> >>   prctl(PR_GET_SECCOMP_FILTER, __NR_write, buf, bufsize);
>> >>
>> >> while the former clears any filters for a given system call changing it
>> >> back to a default deny:
>> >>
>> >>   prctl(PR_CLEAR_SECCOMP_FILTER, __NR_write);
>> >>
>> >> v3: - always block execve calls (as per linus torvalds)
>> >>     - add __NR_seccomp_execve(_32) to seccomp-supporting arches
>> >>     - ensure compat tasks can't reach ftrace:syscalls
>> >>     - dropped new defines for seccomp modes.
>> >>     - two level array instead of hlists (sugg. by olof johansson)
>> >>     - added generic Kconfig entry that is not connected.
>> >>     - dropped internal seccomp.h
>> >>     - move prctl helpers to seccomp_filter
>> >>     - killed seccomp_t typedef (as per checkpatch)
>> >> v2: - changed to use the existing syscall number ABI.
>> >>     - prctl changes to minimize parsing in the kernel:
>> >>       prctl(PR_SET_SECCOMP, {0 | 1 | 2 }, { 0 | ON_EXEC });
>> >>       prctl(PR_SET_SECCOMP_FILTER, __NR_read, "fd == 5");
>> >>       prctl(PR_CLEAR_SECCOMP_FILTER, __NR_read);
>> >>       prctl(PR_GET_SECCOMP_FILTER, __NR_read, buf, bufsize);
>> >>     - defined PR_SECCOMP_MODE_STRICT and ..._FILTER
>> >>     - added flags
>> >>     - provide a default fail syscall_nr_to_meta in ftrace
>> >>     - provides fallback for unhooked system calls
>> >>     - use -ENOSYS and ERR_PTR(-ENOSYS) for stubbed functionality
>> >>     - added kernel/seccomp.h to share seccomp.c/seccomp_filter.c
>> >>     - moved to a hlist and 4 bit hash of linked lists
>> >>     - added support to operate without CONFIG_FTRACE_SYSCALLS
>> >>     - moved Kconfig support next to SECCOMP
>> >>     - made Kconfig entries dependent on EXPERIMENTAL
>> >>     - added macros to avoid ifdefs from kernel/fork.c
>> >>     - added compat task/filter matching
>> >>     - drop seccomp.h inclusion in sched.h and drop seccomp_t
>> >>     - added Filtering to "show" output
>> >>     - added on_exec state dup'ing when enabling after a fast-path accept.
>> >>
>> >> Signed-off-by: Will Drewry <wad@chromium.org>
>> >> ---
>> >>  include/linux/prctl.h   |    5 +
>> >>  include/linux/sched.h   |    2 +-
>> >>  include/linux/seccomp.h |   98 ++++++-
>> >>  include/trace/syscall.h |    7 +
>> >>  kernel/Makefile         |    3 +
>> >>  kernel/fork.c           |    3 +
>> >>  kernel/seccomp.c        |   38 ++-
>> >>  kernel/seccomp_filter.c |  784 +++++++++++++++++++++++++++++++++++++++++++++++
>> >>  kernel/sys.c            |   13 +-
>> >>  security/Kconfig        |   17 +
>> >>  10 files changed, 954 insertions(+), 16 deletions(-)
>> >>  create mode 100644 kernel/seccomp_filter.c
>> >>
>> >> diff --git a/include/linux/prctl.h b/include/linux/prctl.h
>> >> index a3baeb2..44723ce 100644
>> >> --- a/include/linux/prctl.h
>> >> +++ b/include/linux/prctl.h
>> >> @@ -64,6 +64,11 @@
>> >>  #define PR_GET_SECCOMP       21
>> >>  #define PR_SET_SECCOMP       22
>> >>
>> >> +/* Get/set process seccomp filters */
>> >> +#define PR_GET_SECCOMP_FILTER        35
>> >> +#define PR_SET_SECCOMP_FILTER        36
>> >> +#define PR_CLEAR_SECCOMP_FILTER      37
>> >> +
>> >>  /* Get/set the capability bounding set (as per security/commoncap.c) */
>> >>  #define PR_CAPBSET_READ 23
>> >>  #define PR_CAPBSET_DROP 24
>> >> diff --git a/include/linux/sched.h b/include/linux/sched.h
>> >> index 18d63ce..3f0bc8d 100644
>> >> --- a/include/linux/sched.h
>> >> +++ b/include/linux/sched.h
>> >> @@ -1374,7 +1374,7 @@ struct task_struct {
>> >>       uid_t loginuid;
>> >>       unsigned int sessionid;
>> >>  #endif
>> >> -     seccomp_t seccomp;
>> >> +     struct seccomp_struct seccomp;
>> >>
>> >>  /* Thread group tracking */
>> >>       u32 parent_exec_id;
>> >> diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
>> >> index 167c333..f4434ca 100644
>> >> --- a/include/linux/seccomp.h
>> >> +++ b/include/linux/seccomp.h
>> >> @@ -1,13 +1,33 @@
>> >>  #ifndef _LINUX_SECCOMP_H
>> >>  #define _LINUX_SECCOMP_H
>> >>
>> >> +struct seq_file;
>> >>
>> >>  #ifdef CONFIG_SECCOMP
>> >>
>> >> +#include <linux/errno.h>
>> >>  #include <linux/thread_info.h>
>> >> +#include <linux/types.h>
>> >>  #include <asm/seccomp.h>
>> >>
>> >> -typedef struct { int mode; } seccomp_t;
>> >> +struct seccomp_filters;
>> >> +/**
>> >> + * struct seccomp_struct - the state of a seccomp'ed process
>> >> + *
>> >> + * @mode:
>> >> + *     if this is 1, the process is under standard seccomp rules
>> >> + *             is 2, the process is only allowed to make system calls where
>> >> + *                   associated filters evaluate successfully.
>> >> + * @filters: Metadata for filters if using CONFIG_SECCOMP_FILTER.
>> >> + *           filters assignment/use should be RCU-protected and its contents
>> >> + *           should never be modified when attached to a seccomp_struct.
>> >> + */
>> >> +struct seccomp_struct {
>> >> +     uint16_t mode;
>> >> +#ifdef CONFIG_SECCOMP_FILTER
>> >> +     struct seccomp_filters *filters;
>> >> +#endif
>> >> +};
>> >>
>> >>  extern void __secure_computing(int);
>> >>  static inline void secure_computing(int this_syscall)
>> >> @@ -16,15 +36,14 @@ static inline void secure_computing(int this_syscall)
>> >>               __secure_computing(this_syscall);
>> >>  }
>> >>
>> >> -extern long prctl_get_seccomp(void);
>> >>  extern long prctl_set_seccomp(unsigned long);
>> >> +extern long prctl_get_seccomp(void);
>> >>
>> >>  #else /* CONFIG_SECCOMP */
>> >>
>> >>  #include <linux/errno.h>
>> >>
>> >> -typedef struct { } seccomp_t;
>> >> -
>> >> +struct seccomp_struct { };
>> >>  #define secure_computing(x) do { } while (0)
>> >>
>> >>  static inline long prctl_get_seccomp(void)
>> >> @@ -32,11 +51,80 @@ static inline long prctl_get_seccomp(void)
>> >>       return -EINVAL;
>> >>  }
>> >>
>> >> -static inline long prctl_set_seccomp(unsigned long arg2)
>> >> +static inline long prctl_set_seccomp(unsigned long a2);
>> >>  {
>> >>       return -EINVAL;
>> >>  }
>> >>
>> >>  #endif /* CONFIG_SECCOMP */
>> >>
>> >> +#ifdef CONFIG_SECCOMP_FILTER
>> >> +
>> >> +#define inherit_tsk_seccomp(_child, _orig) do { \
>> >> +     _child->seccomp.mode = _orig->seccomp.mode; \
>> >> +     _child->seccomp.filters = get_seccomp_filters(_orig->seccomp.filters); \
>> >> +     } while (0)
>> >> +#define put_tsk_seccomp(_tsk) put_seccomp_filters(_tsk->seccomp.filters)
>> >> +
>> >> +extern int seccomp_show_filters(struct seccomp_filters *filters,
>> >> +                             struct seq_file *);
>> >> +extern long seccomp_set_filter(int, char *);
>> >> +extern long seccomp_clear_filter(int);
>> >> +extern long seccomp_get_filter(int, char *, unsigned long);
>> >> +
>> >> +extern long prctl_set_seccomp_filter(unsigned long, char __user *);
>> >> +extern long prctl_get_seccomp_filter(unsigned long, char __user *,
>> >> +                                  unsigned long);
>> >> +extern long prctl_clear_seccomp_filter(unsigned long);
>> >> +
>> >> +extern struct seccomp_filters *get_seccomp_filters(struct seccomp_filters *);
>> >> +extern void put_seccomp_filters(struct seccomp_filters *);
>> >> +
>> >> +extern int seccomp_test_filters(int);
>> >> +extern void seccomp_filter_log_failure(int);
>> >> +
>> >> +#else  /* CONFIG_SECCOMP_FILTER */
>> >> +
>> >> +struct seccomp_filters { };
>> >> +#define inherit_tsk_seccomp(_child, _orig) do { } while (0)
>> >> +#define put_tsk_seccomp(_tsk) do { } while (0)
>> >> +
>> >> +static inline int seccomp_show_filters(struct seccomp_filters *filters,
>> >> +                                    struct seq_file *m)
>> >> +{
>> >> +     return -ENOSYS;
>> >> +}
>> >> +
>> >> +static inline long seccomp_set_filter(int syscall_nr, char *filter)
>> >> +{
>> >> +     return -ENOSYS;
>> >> +}
>> >> +
>> >> +static inline long seccomp_clear_filter(int syscall_nr)
>> >> +{
>> >> +     return -ENOSYS;
>> >> +}
>> >> +
>> >> +static inline long seccomp_get_filter(int syscall_nr,
>> >> +                                   char *buf, unsigned long available)
>> >> +{
>> >> +     return -ENOSYS;
>> >> +}
>> >> +
>> >> +static inline long prctl_set_seccomp_filter(unsigned long a2, char __user *a3)
>> >> +{
>> >> +     return -ENOSYS;
>> >> +}
>> >> +
>> >> +static inline long prctl_clear_seccomp_filter(unsigned long a2)
>> >> +{
>> >> +     return -ENOSYS;
>> >> +}
>> >> +
>> >> +static inline long prctl_get_seccomp_filter(unsigned long a2, char __user *a3,
>> >> +                                         unsigned long a4)
>> >> +{
>> >> +     return -ENOSYS;
>> >> +}
>> >> +#endif  /* CONFIG_SECCOMP_FILTER */
>> >>  #endif /* _LINUX_SECCOMP_H */
>> >> diff --git a/include/trace/syscall.h b/include/trace/syscall.h
>> >> index 242ae04..e061ad0 100644
>> >> --- a/include/trace/syscall.h
>> >> +++ b/include/trace/syscall.h
>> >> @@ -35,6 +35,8 @@ struct syscall_metadata {
>> >>  extern unsigned long arch_syscall_addr(int nr);
>> >>  extern int init_syscall_trace(struct ftrace_event_call *call);
>> >>
>> >> +extern struct syscall_metadata *syscall_nr_to_meta(int);
>> >> +
>> >>  extern int reg_event_syscall_enter(struct ftrace_event_call *call);
>> >>  extern void unreg_event_syscall_enter(struct ftrace_event_call *call);
>> >>  extern int reg_event_syscall_exit(struct ftrace_event_call *call);
>> >> @@ -49,6 +51,11 @@ enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags,
>> >>                                     struct trace_event *event);
>> >>  enum print_line_t print_syscall_exit(struct trace_iterator *iter, int flags,
>> >>                                    struct trace_event *event);
>> >> +#else
>> >> +static inline struct syscall_metadata *syscall_nr_to_meta(int nr)
>> >> +{
>> >> +     return NULL;
>> >> +}
>> >>  #endif
>> >>
>> >>  #ifdef CONFIG_PERF_EVENTS
>> >> diff --git a/kernel/Makefile b/kernel/Makefile
>> >> index 85cbfb3..84e7dfb 100644
>> >> --- a/kernel/Makefile
>> >> +++ b/kernel/Makefile
>> >> @@ -81,6 +81,9 @@ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
>> >>  obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
>> >>  obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
>> >>  obj-$(CONFIG_SECCOMP) += seccomp.o
>> >> +ifeq ($(CONFIG_SECCOMP_FILTER),y)
>> >> +obj-$(CONFIG_SECCOMP) += seccomp_filter.o
>> >> +endif
>> >>  obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
>> >>  obj-$(CONFIG_TREE_RCU) += rcutree.o
>> >>  obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
>> >> diff --git a/kernel/fork.c b/kernel/fork.c
>> >> index e7548de..6f835e0 100644
>> >> --- a/kernel/fork.c
>> >> +++ b/kernel/fork.c
>> >> @@ -34,6 +34,7 @@
>> >>  #include <linux/cgroup.h>
>> >>  #include <linux/security.h>
>> >>  #include <linux/hugetlb.h>
>> >> +#include <linux/seccomp.h>
>> >>  #include <linux/swap.h>
>> >>  #include <linux/syscalls.h>
>> >>  #include <linux/jiffies.h>
>> >> @@ -169,6 +170,7 @@ void free_task(struct task_struct *tsk)
>> >>       free_thread_info(tsk->stack);
>> >>       rt_mutex_debug_task_free(tsk);
>> >>       ftrace_graph_exit_task(tsk);
>> >> +     put_tsk_seccomp(tsk);
>> >>       free_task_struct(tsk);
>> >>  }
>> >>  EXPORT_SYMBOL(free_task);
>> >> @@ -280,6 +282,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
>> >>       if (err)
>> >>               goto out;
>> >>
>> >> +     inherit_tsk_seccomp(tsk, orig);
>> >>       setup_thread_stack(tsk, orig);
>> >>       clear_user_return_notifier(tsk);
>> >>       clear_tsk_need_resched(tsk);
>> >> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
>> >> index 57d4b13..0a942be 100644
>> >> --- a/kernel/seccomp.c
>> >> +++ b/kernel/seccomp.c
>> >> @@ -2,16 +2,20 @@
>> >>   * linux/kernel/seccomp.c
>> >>   *
>> >>   * Copyright 2004-2005  Andrea Arcangeli <andrea@cpushare.com>
>> >> + * Copyright (C) 2011 The Chromium OS Authors <chromium-os-dev@chromium.org>
>> >>   *
>> >>   * This defines a simple but solid secure-computing mode.
>> >>   */
>> >>
>> >>  #include <linux/seccomp.h>
>> >>  #include <linux/sched.h>
>> >> +#include <linux/slab.h>
>> >>  #include <linux/compat.h>
>> >> +#include <linux/unistd.h>
>> >> +#include <linux/ftrace_event.h>
>> >>
>> >> +#define SECCOMP_MAX_FILTER_LENGTH MAX_FILTER_STR_VAL
>> >>  /* #define SECCOMP_DEBUG 1 */
>> >> -#define NR_SECCOMP_MODES 1
>> >>
>> >>  /*
>> >>   * Secure computing mode 1 allows only read/write/exit/sigreturn.
>> >> @@ -32,10 +36,9 @@ static int mode1_syscalls_32[] = {
>> >>
>> >>  void __secure_computing(int this_syscall)
>> >>  {
>> >> -     int mode = current->seccomp.mode;
>> >>       int * syscall;
>> >>
>> >> -     switch (mode) {
>> >> +     switch (current->seccomp.mode) {
>> >>       case 1:
>> >>               syscall = mode1_syscalls;
>> >>  #ifdef CONFIG_COMPAT
>> >> @@ -47,6 +50,17 @@ void __secure_computing(int this_syscall)
>> >>                               return;
>> >>               } while (*++syscall);
>> >>               break;
>> >> +#ifdef CONFIG_SECCOMP_FILTER
>> >> +     case 2:
>> >> +             if (this_syscall >= NR_syscalls || this_syscall < 0)
>> >> +                     break;
>> >> +
>> >> +             if (!seccomp_test_filters(this_syscall))
>> >> +                     return;
>> >> +
>> >> +             seccomp_filter_log_failure(this_syscall);
>> >> +             break;
>> >> +#endif
>> >>       default:
>> >>               BUG();
>> >>       }
>> >> @@ -71,16 +85,22 @@ long prctl_set_seccomp(unsigned long seccomp_mode)
>> >>       if (unlikely(current->seccomp.mode))
>> >>               goto out;
>> >>
>> >> -     ret = -EINVAL;
>> >> -     if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) {
>> >> -             current->seccomp.mode = seccomp_mode;
>> >> -             set_thread_flag(TIF_SECCOMP);
>> >> +     ret = 0;
>> >> +     switch (seccomp_mode) {
>> >> +     case 1:
>> >>  #ifdef TIF_NOTSC
>> >>               disable_TSC();
>> >>  #endif
>> >> -             ret = 0;
>> >> +#ifdef CONFIG_SECCOMP_FILTER
>> >> +     case 2:
>> >> +#endif
>> >> +             current->seccomp.mode = seccomp_mode;
>> >> +             set_thread_flag(TIF_SECCOMP);
>> >> +             break;
>> >> +     default:
>> >> +             ret = -EINVAL;
>> >>       }
>> >>
>> >> - out:
>> >> +out:
>> >>       return ret;
>> >>  }
>> >> diff --git a/kernel/seccomp_filter.c b/kernel/seccomp_filter.c
>> >> new file mode 100644
>> >> index 0000000..9782f25
>> >> --- /dev/null
>> >> +++ b/kernel/seccomp_filter.c
>> >> @@ -0,0 +1,784 @@
>> >> +/* filter engine-based seccomp system call filtering
>> >> + *
>> >> + * This program is free software; you can redistribute it and/or modify
>> >> + * it under the terms of the GNU General Public License as published by
>> >> + * the Free Software Foundation; either version 2 of the License, or
>> >> + * (at your option) any later version.
>> >> + *
>> >> + * This program is distributed in the hope that it will be useful,
>> >> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> >> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>> >> + * GNU General Public License for more details.
>> >> + *
>> >> + * You should have received a copy of the GNU General Public License
>> >> + * along with this program; if not, write to the Free Software
>> >> + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
>> >> + *
>> >> + * Copyright (C) 2011 The Chromium OS Authors <chromium-os-dev@chromium.org>
>> >> + */
>> >> +
>> >> +#include <linux/compat.h>
>> >> +#include <linux/err.h>
>> >> +#include <linux/errno.h>
>> >> +#include <linux/ftrace_event.h>
>> >> +#include <linux/seccomp.h>
>> >> +#include <linux/seq_file.h>
>> >> +#include <linux/sched.h>
>> >> +#include <linux/slab.h>
>> >> +#include <linux/uaccess.h>
>> >> +
>> >> +#include <asm/syscall.h>
>> >> +#include <trace/syscall.h>
>> >> +
>> >> +
>> >> +#define SECCOMP_MAX_FILTER_LENGTH MAX_FILTER_STR_VAL
>> >> +
>> >> +#define SECCOMP_FILTER_ALLOW "1"
>> >> +#define SECCOMP_ACTION_DENY 0xffff
>> >> +#define SECCOMP_ACTION_ALLOW 0xfffe
>> >> +
>> >> +/**
>> >> + * struct seccomp_filters - container for seccomp filterset
>> >> + *
>> >> + * @syscalls: array of 16-bit indices into @event_filters by syscall_nr
>> >> + *            May also be SECCOMP_ACTION_DENY or SECCOMP_ACTION_ALLOW
>> >> + * @event_filters: array of pointers to ftrace event objects
>> >> + * @count: size of @event_filters
>> >> + * @flags: anonymous struct to wrap filters-specific flags
>> >> + * @usage: reference count to simplify use.
>> >> + */
>> >> +struct seccomp_filters {
>> >> +     uint16_t syscalls[NR_syscalls];
>> >> +     struct event_filter **event_filters;
>> >> +     uint16_t count;
>> >> +     struct {
>> >> +             uint32_t compat:1,
>> >> +                      __reserved:31;
>> >> +     } flags;
>> >> +     atomic_t usage;
>> >> +};
>> >> +
>> >> +/* Handle ftrace symbol non-existence */
>> >> +#ifdef CONFIG_FTRACE_SYSCALLS
>> >> +#define create_event_filter(_ef_pptr, _event_type, _str) \
>> >> +     ftrace_parse_filter(_ef_pptr, _event_type, _str)
>> >> +#define get_filter_string(_ef) ftrace_get_filter_string(_ef)
>> >> +#define free_event_filter(_f) ftrace_free_filter(_f)
>> >> +
>> >> +#else
>> >> +
>> >> +#define create_event_filter(_ef_pptr, _event_type, _str) (-ENOSYS)
>> >> +#define get_filter_string(_ef) (NULL)
>> >> +#define free_event_filter(_f) do { } while (0)
>> >> +#endif
>> >> +
>> >> +/**
>> >> + * seccomp_filters_new - allocates a new filters object
>> >> + * @count: count to allocate for the event_filters array
>> >> + *
>> >> + * Returns ERR_PTR on error or an allocated object.
>> >> + */
>> >> +static struct seccomp_filters *seccomp_filters_new(uint16_t count)
>> >> +{
>> >> +     struct seccomp_filters *f;
>> >> +
>> >> +     if (count >= SECCOMP_ACTION_ALLOW)
>> >> +             return ERR_PTR(-EINVAL);
>> >> +
>> >> +     f = kzalloc(sizeof(struct seccomp_filters), GFP_KERNEL);
>> >> +     if (!f)
>> >> +             return ERR_PTR(-ENOMEM);
>> >> +
>> >> +     /* Lazy SECCOMP_ACTION_DENY assignment. */
>> >> +     memset(f->syscalls, 0xff, sizeof(f->syscalls));
>> >> +     atomic_set(&f->usage, 1);
>> >> +
>> >> +     f->event_filters = NULL;
>> >> +     f->count = count;
>> >> +     if (!count)
>> >> +             return f;
>> >> +
>> >> +     f->event_filters = kzalloc(count * sizeof(struct event_filter *),
>> >> +                                GFP_KERNEL);
>> >> +     if (!f->event_filters) {
>> >> +             kfree(f);
>> >> +             f = ERR_PTR(-ENOMEM);
>> >> +     }
>> >> +     return f;
>> >> +}
>> >> +
>> >> +/**
>> >> + * seccomp_filters_free - cleans up the filter list and frees the table
>> >> + * @filters: NULL or live object to be completely destructed.
>> >> + */
>> >> +static void seccomp_filters_free(struct seccomp_filters *filters)
>> >> +{
>> >> +     uint16_t count = 0;
>> >> +     if (!filters)
>> >> +             return;
>> >> +     while (count < filters->count) {
>> >> +             struct event_filter *f = filters->event_filters[count];
>> >> +             free_event_filter(f);
>> >> +             count++;
>> >> +     }
>> >> +     kfree(filters->event_filters);
>> >> +     kfree(filters);
>> >> +}
>> >> +
>> >> +static void __put_seccomp_filters(struct seccomp_filters *orig)
>> >> +{
>> >> +     WARN_ON(atomic_read(&orig->usage));
>> >> +     seccomp_filters_free(orig);
>> >> +}
>> >> +
>> >> +#define seccomp_filter_allow(_id) ((_id) == SECCOMP_ACTION_ALLOW)
>> >> +#define seccomp_filter_deny(_id) ((_id) == SECCOMP_ACTION_DENY)
>> >> +#define seccomp_filter_dynamic(_id) \
>> >> +     (!seccomp_filter_allow(_id) && !seccomp_filter_deny(_id))
>> >> +static inline uint16_t seccomp_filter_id(const struct seccomp_filters *f,
>> >> +                                      int syscall_nr)
>> >> +{
>> >> +     if (!f)
>> >> +             return SECCOMP_ACTION_DENY;
>> >> +     return f->syscalls[syscall_nr];
>> >> +}
>> >> +
>> >> +static inline struct event_filter *seccomp_dynamic_filter(
>> >> +             const struct seccomp_filters *filters, uint16_t id)
>> >> +{
>> >> +     if (!seccomp_filter_dynamic(id))
>> >> +             return NULL;
>> >> +     return filters->event_filters[id];
>> >> +}
>> >> +
>> >> +static inline void set_seccomp_filter_id(struct seccomp_filters *filters,
>> >> +                                      int syscall_nr, uint16_t id)
>> >> +{
>> >> +     filters->syscalls[syscall_nr] = id;
>> >> +}
>> >> +
>> >> +static inline void set_seccomp_filter(struct seccomp_filters *filters,
>> >> +                                   int syscall_nr, uint16_t id,
>> >> +                                   struct event_filter *dynamic_filter)
>> >> +{
>> >> +     filters->syscalls[syscall_nr] = id;
>> >> +     if (seccomp_filter_dynamic(id))
>> >> +             filters->event_filters[id] = dynamic_filter;
>> >> +}
>> >> +
>> >> +static struct event_filter *alloc_event_filter(int syscall_nr,
>> >> +                                            const char *filter_string)
>> >> +{
>> >> +     struct syscall_metadata *data;
>> >> +     struct event_filter *filter = NULL;
>> >> +     int err;
>> >> +
>> >> +     data = syscall_nr_to_meta(syscall_nr);
>> >> +     /* Argument-based filtering only works on ftrace-hooked syscalls. */
>> >> +     err = -ENOSYS;
>> >> +     if (!data)
>> >> +             goto fail;
>> >> +     err = create_event_filter(&filter,
>> >> +                               data->enter_event->event.type,
>> >> +                               filter_string);
>> >> +     if (err)
>> >> +             goto fail;
>> >> +
>> >> +     return filter;
>> >> +fail:
>> >> +     kfree(filter);
>> >> +     return ERR_PTR(err);
>> >> +}
>> >> +
>> >> +/**
>> >> + * seccomp_filters_copy - copies filters from src to dst.
>> >> + *
>> >> + * @dst: seccomp_filters to populate.
>> >> + * @src: table to read from.
>> >> + * @skip: specifies an entry, by system call, to skip.
>> >> + *
>> >> + * Returns non-zero on failure.
>> >> + * Both the source and the destination should have no simultaneous
>> >> + * writers, and dst should be exclusive to the caller.
>> >> + * If @skip is < 0, it is ignored.
>> >> + */
>> >> +static int seccomp_filters_copy(struct seccomp_filters *dst,
>> >> +                             const struct seccomp_filters *src,
>> >> +                             int skip)
>> >> +{
>> >> +     int id = 0, ret = 0, nr;
>> >> +     memcpy(&dst->flags, &src->flags, sizeof(src->flags));
>> >> +     memcpy(dst->syscalls, src->syscalls, sizeof(dst->syscalls));
>> >> +     if (!src->count)
>> >> +             goto done;
>> >> +     for (nr = 0; nr < NR_syscalls; ++nr) {
>> >> +             struct event_filter *filter;
>> >> +             const char *str;
>> >> +             uint16_t src_id = seccomp_filter_id(src, nr);
>> >> +             if (nr == skip) {
>> >> +                     set_seccomp_filter(dst, nr, SECCOMP_ACTION_DENY,
>> >> +                                        NULL);
>> >> +                     continue;
>> >> +             }
>> >> +             if (!seccomp_filter_dynamic(src_id))
>> >> +                     continue;
>> >> +             if (id >= dst->count) {
>> >> +                     ret = -EINVAL;
>> >> +                     goto done;
>> >> +             }
>> >> +             str = get_filter_string(seccomp_dynamic_filter(src, src_id));
>> >> +             filter = alloc_event_filter(nr, str);
>> >> +             if (IS_ERR(filter)) {
>> >> +                     ret = PTR_ERR(filter);
>> >> +                     goto done;
>> >> +             }
>> >> +             set_seccomp_filter(dst, nr, id, filter);
>> >> +             id++;
>> >> +     }
>> >> +
>> >> +done:
>> >> +     return ret;
>> >> +}
>> >> +
>> >> +/**
>> >> + * seccomp_extend_filter - appends more text to a syscall_nr's filter
>> >> + * @filters: unattached filter object to operate on
>> >> + * @syscall_nr: syscall number to update filters for
>> >> + * @filter_string: string to append to the existing filter
>> >> + *
>> >> + * The new string will be &&'d to the original filter string to ensure that it
>> >> + * always matches the existing predicates or less:
>> >> + *   (old_filter) && @filter_string
>> >> + * Returns 0 on success and a negative errno value on
>> >> + * failure.
>> >> + */
>> >> +static int seccomp_extend_filter(struct seccomp_filters *filters,
>> >> +                              int syscall_nr, char *filter_string)
>> >> +{
>> >> +     struct event_filter *filter;
>> >> +     uint16_t id = seccomp_filter_id(filters, syscall_nr);
>> >> +     char *merged = NULL;
>> >> +     int ret = -EINVAL, expected;
>> >> +
>> >> +     /* No extending with a "1". */
>> >> +     if (!strcmp(SECCOMP_FILTER_ALLOW, filter_string))
>> >> +             goto out;
>> >> +
>> >> +     filter = seccomp_dynamic_filter(filters, id);
>> >> +     ret = -ENOENT;
>> >> +     if (!filter)
>> >> +             goto out;
>> >> +
>> >> +     merged = kzalloc(SECCOMP_MAX_FILTER_LENGTH + 1, GFP_KERNEL);
>> >> +     ret = -ENOMEM;
>> >> +     if (!merged)
>> >> +             goto out;
>> >> +
>> >> +     expected = snprintf(merged, SECCOMP_MAX_FILTER_LENGTH, "(%s) && %s",
>> >> +                         get_filter_string(filter), filter_string);
>> >> +     ret = -E2BIG;
>> >> +     if (expected >= SECCOMP_MAX_FILTER_LENGTH || expected < 0)
>> >> +             goto out;
>> >> +
>> >> +     /* Free the old filter */
>> >> +     free_event_filter(filter);
>> >> +     set_seccomp_filter(filters, syscall_nr, id, NULL);
>> >> +
>> >> +     /* Replace it */
>> >> +     filter = alloc_event_filter(syscall_nr, merged);
>> >> +     if (IS_ERR(filter)) {
>> >> +             ret = PTR_ERR(filter);
>> >> +             goto out;
>> >> +     }
>> >> +     set_seccomp_filter(filters, syscall_nr, id, filter);
>> >> +     ret = 0;
>> >> +
>> >> +out:
>> >> +     kfree(merged);
>> >> +     return ret;
>> >> +}
>> >> +
>> >> +/**
>> >> + * seccomp_add_filter - adds a filter for an unfiltered syscall
>> >> + * @filters: filters object to add a filter/action to
>> >> + * @syscall_nr: system call number to add a filter for
>> >> + * @filter_string: the filter string to apply
>> >> + *
>> >> + * Returns 0 on success and non-zero otherwise.
>> >> + */
>> >> +static int seccomp_add_filter(struct seccomp_filters *filters, int syscall_nr,
>> >> +                           char *filter_string)
>> >> +{
>> >> +     struct event_filter *filter;
>> >> +     int ret = 0;
>> >> +
>> >> +     if (!strcmp(SECCOMP_FILTER_ALLOW, filter_string)) {
>> >> +             set_seccomp_filter(filters, syscall_nr,
>> >> +                                SECCOMP_ACTION_ALLOW, NULL);
>> >> +             goto out;
>> >> +     }
>> >> +
>> >> +     filter = alloc_event_filter(syscall_nr, filter_string);
>> >> +     if (IS_ERR(filter)) {
>> >> +             ret = PTR_ERR(filter);
>> >> +             goto out;
>> >> +     }
>> >> +     /* Always add to the last slot available since additions are
>> >> +      * only done one at a time.
>> >> +      */
>> >> +     set_seccomp_filter(filters, syscall_nr, filters->count - 1, filter);
>> >> +out:
>> >> +     return ret;
>> >> +}
>> >> +
>> >> +/* Wrap optional ftrace syscall support. Returns 1 on match or 0 otherwise. */
>> >> +static int filter_match_current(struct event_filter *event_filter)
>> >> +{
>> >> +     int err = 0;
>> >> +#ifdef CONFIG_FTRACE_SYSCALLS
>> >> +     uint8_t syscall_state[64];
>> >> +
>> >> +     memset(syscall_state, 0, sizeof(syscall_state));
>> >> +
>> >> +     /* The generic tracing entry can remain zeroed. */
>> >> +     err = ftrace_syscall_enter_state(syscall_state, sizeof(syscall_state),
>> >> +                                      NULL);
>> >> +     if (err)
>> >> +             return 0;
>> >> +
>> >> +     err = filter_match_preds(event_filter, syscall_state);
>> >> +#endif
>> >> +     return err;
>> >> +}
>> >> +
>> >> +static const char *syscall_nr_to_name(int syscall)
>> >> +{
>> >> +     const char *syscall_name = "unknown";
>> >> +     struct syscall_metadata *data = syscall_nr_to_meta(syscall);
>> >> +     if (data)
>> >> +             syscall_name = data->name;
>> >> +     return syscall_name;
>> >> +}
>> >> +
>> >> +static void filters_set_compat(struct seccomp_filters *filters)
>> >> +{
>> >> +#ifdef CONFIG_COMPAT
>> >> +     if (is_compat_task())
>> >> +             filters->flags.compat = 1;
>> >> +#endif
>> >> +}
>> >> +
>> >> +static inline int filters_compat_mismatch(struct seccomp_filters *filters)
>> >> +{
>> >> +     int ret = 0;
>> >> +     if (!filters)
>> >> +             return 0;
>> >> +#ifdef CONFIG_COMPAT
>> >> +     if (!!(is_compat_task()) == filters->flags.compat)
>> >> +             ret = 1;
>> >> +#endif
>> >> +     return ret;
>> >> +}
>> >> +
>> >> +static inline int syscall_is_execve(int syscall)
>> >> +{
>> >> +     int nr = __NR_execve;
>> >> +#ifdef CONFIG_COMPAT
>> >> +     if (is_compat_task())
>> >> +             nr = __NR_seccomp_execve_32;
>> >> +#endif
>> >> +     return syscall == nr;
>> >> +}
>> >> +
>> >> +#ifndef KSTK_EIP
>> >> +#define KSTK_EIP(x) 0L
>> >> +#endif
>> >> +
>> >> +void seccomp_filter_log_failure(int syscall)
>> >> +{
>> >> +     pr_info("%s[%d]: system call %d (%s) blocked at 0x%lx\n",
>> >> +             current->comm, task_pid_nr(current), syscall,
>> >> +             syscall_nr_to_name(syscall), KSTK_EIP(current));
>> >> +}
>> >> +
>> >> +/* put_seccomp_state - decrements the reference count of @orig and may free. */
>> >> +void put_seccomp_filters(struct seccomp_filters *orig)
>> >> +{
>> >> +     if (!orig)
>> >> +             return;
>> >> +
>> >> +     if (atomic_dec_and_test(&orig->usage))
>> >> +             __put_seccomp_filters(orig);
>> >> +}
>> >> +
>> >> +/* get_seccomp_state - increments the reference count of @orig */
>> >> +struct seccomp_filters *get_seccomp_filters(struct seccomp_filters *orig)
>> >
>> > Nit: the name does not match the comment.
>>
>> Will fix it here and above. Thanks!
>>
>> >> +{
>> >> +     if (!orig)
>> >> +             return NULL;
>> >> +     atomic_inc(&orig->usage);
>> >> +     return orig;
>> >
>> > This is called in an RCU read-side critical section.  What exactly is
>> > RCU protecting?  I would expect an rcu_dereference() or one of the
>> > RCU list-traversal primitives somewhere, either here or at the caller.
>>
>> Ah, I spaced on rcu_dereference().  The goal was to make the
>> assignment and replacement of the seccomp_filters pointer
>> RCU-protected (in seccomp_state) so there's no concern over it being
>> partially replaced on platforms where pointer assignments are non-atomic
>> - such as via /proc/<pid>/seccomp_filters access or a call via the
>> exported symbols.  Object lifetime is managed by reference counting so
>> that I don't have to worry about extending the RCU read-side critical
>> section by much or deal with pre-allocations.
>>
>> I'll add rcu_dereference() to all the get_seccomp_filters() uses where
>> it makes sense, so that it is called safely.  Just to make sure, does
>> it make sense to continue to rcu protect the specific pointer?
>
> It might.  The usual other option is to use a lock outside of the element
> containing the reference count to protect reference-count manipulation.
> If there is some convenient lock, especially if it is already held where
> needed, then locking is more straightforward.  Otherwise, RCU is usually
> a reasonable option.

I was concerned about the overhead a lock would have at each system
call entry, but I didn't benchmark it to see.  I'll add the
rcu_dereference right away, then look into seeing whether there's a
cleaner approach.  I was trying to be overly protective of mutating
any data internal to the filters through complete replacement on any
change.  I'll take a step back and see if that is actually necessary.

>> >> +}
>> >> +
>> >> +/**
>> >> + * seccomp_test_filters - tests 'current' against the given syscall
>> >> + * @state: seccomp_state of current to use.
>> >> + * @syscall: number of the system call to test
>> >> + *
>> >> + * Returns 0 on ok and non-zero on error/failure.
>> >> + */
>> >> +int seccomp_test_filters(int syscall)
>> >> +{
>> >> +     uint16_t id;
>> >> +     struct event_filter *filter;
>> >> +     struct seccomp_filters *filters;
>> >> +     int ret = -EACCES;
>> >> +
>> >> +     rcu_read_lock();
>> >> +     filters = get_seccomp_filters(current->seccomp.filters);
>> >> +     rcu_read_unlock();
>> >> +
>> >> +     if (!filters)
>> >> +             goto out;
>> >> +
>> >> +     if (filters_compat_mismatch(filters)) {
>> >> +             pr_info("%s[%d]: seccomp_filter compat() mismatch.\n",
>> >> +                     current->comm, task_pid_nr(current));
>> >> +             goto out;
>> >> +     }
>> >> +
>> >> +     /* execve is never allowed. */
>> >> +     if (syscall_is_execve(syscall))
>> >> +             goto out;
>> >> +
>> >> +     ret = 0;
>> >> +     id = seccomp_filter_id(filters, syscall);
>> >> +     if (seccomp_filter_allow(id))
>> >> +             goto out;
>> >> +
>> >> +     ret = -EACCES;
>> >> +     if (!seccomp_filter_dynamic(id))
>> >> +             goto out;
>> >> +
>> >> +     filter = seccomp_dynamic_filter(filters, id);
>> >> +     if (filter && filter_match_current(filter))
>> >> +             ret = 0;
>> >> +out:
>> >> +     put_seccomp_filters(filters);
>> >> +     return ret;
>> >> +}
>> >> +
>> >> +/**
>> >> + * seccomp_show_filters - prints the current filter state to a seq_file
>> >> + * @filters: properly get()'d filters object
>> >> + * @m: the prepared seq_file to receive the data
>> >> + *
>> >> + * Returns 0 on a successful write.
>> >> + */
>> >> +int seccomp_show_filters(struct seccomp_filters *filters, struct seq_file *m)
>> >> +{
>> >> +     int syscall;
>> >> +     seq_printf(m, "Mode: %d\n", current->seccomp.mode);
>> >> +     if (!filters)
>> >> +             goto out;
>> >> +
>> >> +     for (syscall = 0; syscall < NR_syscalls; ++syscall) {
>> >> +             uint16_t id = seccomp_filter_id(filters, syscall);
>> >> +             const char *filter_string = SECCOMP_FILTER_ALLOW;
>> >> +             if (seccomp_filter_deny(id))
>> >> +                     continue;
>> >> +             seq_printf(m, "%d (%s): ",
>> >> +                           syscall,
>> >> +                           syscall_nr_to_name(syscall));
>> >> +             if (seccomp_filter_dynamic(id))
>> >> +                     filter_string = get_filter_string(
>> >> +                                       seccomp_dynamic_filter(filters, id));
>> >> +             seq_printf(m, "%s\n", filter_string);
>> >> +     }
>> >> +out:
>> >> +     return 0;
>> >> +}
>> >> +EXPORT_SYMBOL_GPL(seccomp_show_filters);
>> >> +
>> >> +/**
>> >> + * seccomp_get_filter - copies the filter_string into "buf"
>> >> + * @syscall_nr: system call number to look up
>> >> + * @buf: destination buffer
>> >> + * @bufsize: available space in the buffer.
>> >> + *
>> >> + * Context: User context only. This function may sleep on allocation and
>> >> + *          operates on current. current must be attempting a system call
>> >> + *          when this is called.
>> >> + *
>> >> + * Looks up the filter for the given system call number on current.  If found,
>> >> + * the string length of the NUL-terminated buffer is returned and < 0 is
>> >> + * returned on error. The NUL byte is not included in the length.
>> >> + */
>> >> +long seccomp_get_filter(int syscall_nr, char *buf, unsigned long bufsize)
>> >> +{
>> >> +     struct seccomp_filters *filters;
>> >> +     struct event_filter *filter;
>> >> +     long ret = -EINVAL;
>> >> +     uint16_t id;
>> >> +
>> >> +     if (bufsize > SECCOMP_MAX_FILTER_LENGTH)
>> >> +             bufsize = SECCOMP_MAX_FILTER_LENGTH;
>> >> +
>> >> +     rcu_read_lock();
>> >> +     filters = get_seccomp_filters(current->seccomp.filters);
>> >> +     rcu_read_unlock();
>> >> +
>> >> +     if (!filters)
>> >> +             goto out;
>> >> +
>> >> +     ret = -ENOENT;
>> >> +     id = seccomp_filter_id(filters, syscall_nr);
>> >> +     if (seccomp_filter_deny(id))
>> >> +             goto out;
>> >> +
>> >> +     if (seccomp_filter_allow(id)) {
>> >> +             ret = strlcpy(buf, SECCOMP_FILTER_ALLOW, bufsize);
>> >> +             goto copied;
>> >> +     }
>> >> +
>> >> +     filter = seccomp_dynamic_filter(filters, id);
>> >> +     if (!filter)
>> >> +             goto out;
>> >> +     ret = strlcpy(buf, get_filter_string(filter), bufsize);
>> >> +
>> >> +copied:
>> >> +     if (ret >= bufsize) {
>> >> +             ret = -ENOSPC;
>> >> +             goto out;
>> >> +     }
>> >> +     /* Zero out any remaining buffer, just in case. */
>> >> +     memset(buf + ret, 0, bufsize - ret);
>> >> +out:
>> >> +     put_seccomp_filters(filters);
>> >> +     return ret;
>> >> +}
>> >> +EXPORT_SYMBOL_GPL(seccomp_get_filter);
>> >> +
>> >> +/**
>> >> + * seccomp_clear_filter: clears the seccomp filter for a syscall.
>> >> + * @syscall_nr: the system call number to clear filters for.
>> >> + *
>> >> + * Context: User context only. This function may sleep on allocation and
>> >> + *          operates on current. current must be attempting a system call
>> >> + *          when this is called.
>> >> + *
>> >> + * Returns 0 on success.
>> >> + */
>> >> +long seccomp_clear_filter(int syscall_nr)
>> >> +{
>> >> +     struct seccomp_filters *filters = NULL, *orig_filters;
>> >> +     uint16_t id;
>> >> +     int ret = -EINVAL;
>> >> +
>> >> +     rcu_read_lock();
>> >> +     orig_filters = get_seccomp_filters(current->seccomp.filters);
>> >> +     rcu_read_unlock();
>> >> +
>> >> +     if (!orig_filters)
>> >> +             goto out;
>> >> +
>> >> +     if (filters_compat_mismatch(orig_filters))
>> >> +             goto out;
>> >> +
>> >> +     id = seccomp_filter_id(orig_filters, syscall_nr);
>> >> +     if (seccomp_filter_deny(id))
>> >> +             goto out;
>> >> +
>> >> +     /* Create a new filters object for the task */
>> >> +     if (seccomp_filter_dynamic(id))
>> >> +             filters = seccomp_filters_new(orig_filters->count - 1);
>> >> +     else
>> >> +             filters = seccomp_filters_new(orig_filters->count);
>> >> +
>> >> +     if (IS_ERR(filters)) {
>> >> +             ret = PTR_ERR(filters);
>> >> +             goto out;
>> >> +     }
>> >> +
>> >> +     /* Copy, but drop the requested entry. */
>> >> +     ret = seccomp_filters_copy(filters, orig_filters, syscall_nr);
>> >> +     if (ret)
>> >> +             goto out;
>> >> +     get_seccomp_filters(filters);  /* simplify the out: path */
>> >> +
>> >> +     rcu_assign_pointer(current->seccomp.filters, filters);
>> >
>> > What prevents two copies of seccomp_clear_filter() from running
>> > concurrently?
>>
>> Nothing - the last one wins assignment, but the objects themselves
>> should be internally consistent to the parallel calls.  If that's a
>> concern, a per-task writer mutex could be used just to ensure
>> simultaneous calls to clear and set are performed serially.  Would
>> that make more sense?
>
> Here is the sequence of events that I am concerned about:
>
> o       CPU 0 sets orig_filters to point to the current filters.
>
> o       CPU 1 sets its local orig_filters to point to the current
>        set of filters.
>
> o       Both CPUs allocate new filters and use rcu_assign_pointer()
>        to do the update.  As you say, the last one wins, but it appears
>        to me that the first one leaks memory.
>
> o       Both CPUs free the object referenced by their orig_filters,
>        which might or might not result in a double free, depending
>        on exactly what happens below.  (You might actually be OK,
>        I didn't check -- leaking memory was enough for me to call
>        attention to this.)
>
> So yes, please use some kind of mutual exclusion.  Not sure what you
> mean by "per-task mutex", but whatever it is must prevent two different
> tasks from acting on the same set of filters at the same time.  The
> thing that I call "per-task mutex" would -not- do that.

Ah nice.  Yeah that would most definitely leak as there would be a
remaining increment for the task that isn't dropped.

Since those interfaces acquire the filter itself from
current->seccomp.filters, I was thinking a mutex in current, e.g.,
current->seccomp.write_mutex, would fit the bill to ensure that the
pointer isn't replaced concurrently.  I'll look at this and the rcu
usage to see if I'm taking the most logical approach.  I pretty much
always get locking wrong in some way and perhaps I can simplify
further and get the correct guarantees.  I really appreciate the keen
observation and explanation!

>> >> +     synchronize_rcu();
>> >> +     put_seccomp_filters(orig_filters);  /* for the task */
>> >> +out:
>> >> +     put_seccomp_filters(orig_filters);  /* for the get */
>> >> +     put_seccomp_filters(filters);  /* for the extra get */
>> >> +     return ret;
>> >> +}
>> >> +EXPORT_SYMBOL_GPL(seccomp_clear_filter);
>> >> +
>> >> +/**
>> >> + * seccomp_set_filter: - Adds/extends a seccomp filter for a syscall.
>> >> + * @syscall_nr: system call number to apply the filter to.
>> >> + * @filter: ftrace filter string to apply.
>> >> + *
>> >> + * Context: User context only. This function may sleep on allocation and
>> >> + *          operates on current. current must be attempting a system call
>> >> + *          when this is called.
>> >> + *
>> >> + * New filters may be added for system calls when the current task is
>> >> + * not in a secure computing mode (seccomp).  Otherwise, existing filters may
>> >> + * be extended.
>> >> + *
>> >> + * Returns 0 on success or an errno on failure.
>> >> + */
>> >> +long seccomp_set_filter(int syscall_nr, char *filter)
>> >> +{
>> >> +     struct seccomp_filters *filters = NULL, *orig_filters = NULL;
>> >> +     uint16_t id;
>> >> +     long ret = -EINVAL;
>> >> +     uint16_t filters_needed;
>> >> +
>> >> +     if (!filter)
>> >> +             goto out;
>> >> +
>> >> +     filter = strstrip(filter);
>> >> +     /* Disallow empty strings. */
>> >> +     if (filter[0] == 0)
>> >> +             goto out;
>> >> +
>> >> +     rcu_read_lock();
>> >> +     orig_filters = get_seccomp_filters(current->seccomp.filters);
>> >> +     rcu_read_unlock();
>> >> +
>> >> +     /* After the first call, compatibility mode is selected permanently. */
>> >> +     ret = -EACCES;
>> >> +     if (filters_compat_mismatch(orig_filters))
>> >> +             goto out;
>> >> +
>> >> +     filters_needed = orig_filters ? orig_filters->count : 0;
>> >> +     id = seccomp_filter_id(orig_filters, syscall_nr);
>> >> +     if (seccomp_filter_deny(id)) {
>> >> +             /* Don't allow DENYs to be changed when in a seccomp mode */
>> >> +             ret = -EACCES;
>> >> +             if (current->seccomp.mode)
>> >> +                     goto out;
>> >> +             filters_needed++;
>> >> +     }
>> >> +
>> >> +     filters = seccomp_filters_new(filters_needed);
>> >> +     if (IS_ERR(filters)) {
>> >> +             ret = PTR_ERR(filters);
>> >> +             goto out;
>> >> +     }
>> >> +
>> >> +     filters_set_compat(filters);
>> >> +     if (orig_filters) {
>> >> +             ret = seccomp_filters_copy(filters, orig_filters, -1);
>> >> +             if (ret)
>> >> +                     goto out;
>> >> +     }
>> >> +
>> >> +     if (seccomp_filter_deny(id))
>> >> +             ret = seccomp_add_filter(filters, syscall_nr, filter);
>> >> +     else
>> >> +             ret = seccomp_extend_filter(filters, syscall_nr, filter);
>> >> +     if (ret)
>> >> +             goto out;
>> >> +     get_seccomp_filters(filters);  /* simplify the error paths */
>> >> +
>> >> +     rcu_assign_pointer(current->seccomp.filters, filters);
>> >
>> > Again, what prevents two copies of seccomp_set_filter() from running
>> > concurrently?
>>
>> Same deal - nothing, but I'd be happy to add a guard if it makes sense.
>>
>> Thanks!
>>
>> >> +     synchronize_rcu();
>> >> +     put_seccomp_filters(orig_filters);  /* for the task */
>> >> +out:
>> >> +     put_seccomp_filters(orig_filters);  /* for the get */
>> >> +     put_seccomp_filters(filters);  /* for get or task, on err */
>> >> +     return ret;
>> >> +}
>> >> +EXPORT_SYMBOL_GPL(seccomp_set_filter);
>> >> +
>> >> +long prctl_set_seccomp_filter(unsigned long syscall_nr,
>> >> +                           char __user *user_filter)
>> >> +{
>> >> +     int nr;
>> >> +     long ret;
>> >> +     char *filter = NULL;
>> >> +
>> >> +     ret = -EINVAL;
>> >> +     if (syscall_nr >= NR_syscalls)
>> >> +             goto out;
>> >> +
>> >> +     ret = -EFAULT;
>> >> +     if (!user_filter)
>> >> +             goto out;
>> >> +
>> >> +     filter = kzalloc(SECCOMP_MAX_FILTER_LENGTH + 1, GFP_KERNEL);
>> >> +     ret = -ENOMEM;
>> >> +     if (!filter)
>> >> +             goto out;
>> >> +
>> >> +     ret = -EFAULT;
>> >> +     if (strncpy_from_user(filter, user_filter,
>> >> +                           SECCOMP_MAX_FILTER_LENGTH - 1) < 0)
>> >> +             goto out;
>> >> +
>> >> +     nr = (int) syscall_nr;
>> >> +     ret = seccomp_set_filter(nr, filter);
>> >> +
>> >> +out:
>> >> +     kfree(filter);
>> >> +     return ret;
>> >> +}
>> >> +
>> >> +long prctl_clear_seccomp_filter(unsigned long syscall_nr)
>> >> +{
>> >> +     int nr = -1;
>> >> +     long ret;
>> >> +
>> >> +     ret = -EINVAL;
>> >> +     if (syscall_nr >= NR_syscalls)
>> >> +             goto out;
>> >> +
>> >> +     nr = (int) syscall_nr;
>> >> +     ret = seccomp_clear_filter(nr);
>> >> +
>> >> +out:
>> >> +     return ret;
>> >> +}
>> >> +
>> >> +long prctl_get_seccomp_filter(unsigned long syscall_nr, char __user *dst,
>> >> +                           unsigned long available)
>> >> +{
>> >> +     int ret, nr;
>> >> +     unsigned long copied;
>> >> +     char *buf = NULL;
>> >> +     ret = -EINVAL;
>> >> +     if (!available)
>> >> +             goto out;
>> >> +     /* Ignore extra buffer space. */
>> >> +     if (available > SECCOMP_MAX_FILTER_LENGTH)
>> >> +             available = SECCOMP_MAX_FILTER_LENGTH;
>> >> +
>> >> +     ret = -EINVAL;
>> >> +     if (syscall_nr >= NR_syscalls)
>> >> +             goto out;
>> >> +     nr = (int) syscall_nr;
>> >> +
>> >> +     ret = -ENOMEM;
>> >> +     buf = kmalloc(available, GFP_KERNEL);
>> >> +     if (!buf)
>> >> +             goto out;
>> >> +
>> >> +     ret = seccomp_get_filter(nr, buf, available);
>> >> +     if (ret < 0)
>> >> +             goto out;
>> >> +
>> >> +     /* Include the NUL byte in the copy. */
>> >> +     copied = copy_to_user(dst, buf, ret + 1);
>> >> +     ret = -ENOSPC;
>> >> +     if (copied)
>> >> +             goto out;
>> >> +     ret = 0;
>> >> +out:
>> >> +     kfree(buf);
>> >> +     return ret;
>> >> +}
>> >> diff --git a/kernel/sys.c b/kernel/sys.c
>> >> index af468ed..ed60d06 100644
>> >> --- a/kernel/sys.c
>> >> +++ b/kernel/sys.c
>> >> @@ -1698,13 +1698,24 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
>> >>               case PR_SET_ENDIAN:
>> >>                       error = SET_ENDIAN(me, arg2);
>> >>                       break;
>> >> -
>> >>               case PR_GET_SECCOMP:
>> >>                       error = prctl_get_seccomp();
>> >>                       break;
>> >>               case PR_SET_SECCOMP:
>> >>                       error = prctl_set_seccomp(arg2);
>> >>                       break;
>> >> +             case PR_SET_SECCOMP_FILTER:
>> >> +                     error = prctl_set_seccomp_filter(arg2,
>> >> +                                                      (char __user *) arg3);
>> >> +                     break;
>> >> +             case PR_CLEAR_SECCOMP_FILTER:
>> >> +                     error = prctl_clear_seccomp_filter(arg2);
>> >> +                     break;
>> >> +             case PR_GET_SECCOMP_FILTER:
>> >> +                     error = prctl_get_seccomp_filter(arg2,
>> >> +                                                      (char __user *) arg3,
>> >> +                                                      arg4);
>> >> +                     break;
>> >>               case PR_GET_TSC:
>> >>                       error = GET_TSC_CTL(arg2);
>> >>                       break;
>> >> diff --git a/security/Kconfig b/security/Kconfig
>> >> index 95accd4..c76adf2 100644
>> >> --- a/security/Kconfig
>> >> +++ b/security/Kconfig
>> >> @@ -2,6 +2,10 @@
>> >>  # Security configuration
>> >>  #
>> >>
>> >> +# Make seccomp filter Kconfig switch below available
>> >> +config HAVE_SECCOMP_FILTER
>> >> +       bool
>> >> +
>> >>  menu "Security options"
>> >>
>> >>  config KEYS
>> >> @@ -82,6 +86,19 @@ config SECURITY_DMESG_RESTRICT
>> >>
>> >>         If you are unsure how to answer this question, answer N.
>> >>
>> >> +config SECCOMP_FILTER
>> >> +     bool "Enable seccomp-based system call filtering"
>> >> +     select SECCOMP
>> >> +     depends on HAVE_SECCOMP_FILTER && EXPERIMENTAL
>> >> +     help
>> >> +       This kernel feature expands CONFIG_SECCOMP to allow computing
>> >> +       in environments with reduced kernel access dictated by the
>> >> +       application itself through prctl calls.  If
>> >> +       CONFIG_FTRACE_SYSCALLS is available, then system call
>> >> +       argument-based filtering predicates may be used.
>> >> +
>> >> +       See Documentation/prctl/seccomp_filter.txt for more detail.
>> >> +
>> >>  config SECURITY
>> >>       bool "Enable different security models"
>> >>       depends on SYSFS
>> >> --
>> >> 1.7.0.4
>> >>
>> >> --
>> >> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>> >> the body of a message to majordomo@vger.kernel.org
>> >> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>> >> Please read the FAQ at  http://www.tux.org/lkml/
>> >
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>> Please read the FAQ at  http://www.tux.org/lkml/
>

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH v3 03/13] seccomp_filters: new mode with configurable syscall filters
  2011-06-02 20:28                                                                                                                 ` Will Drewry
@ 2011-06-02 20:46                                                                                                                   ` Steven Rostedt
  2011-06-02 21:12                                                                                                                     ` Paul E. McKenney
  0 siblings, 1 reply; 406+ messages in thread
From: Steven Rostedt @ 2011-06-02 20:46 UTC (permalink / raw)
  To: Will Drewry
  Cc: paulmck, linux-kernel, kees.cook, torvalds, tglx, mingo, jmorris,
	Peter Zijlstra, Frederic Weisbecker, linux-security-module

On Thu, 2011-06-02 at 15:28 -0500, Will Drewry wrote:

[ Snipped 860 lines of non relevant text ]

Seriously guys, Please trim your replies. These last few messages were
ridiculous. I spent more than 30 seconds searching for what the email was
about. That's too much wasted time.

-- Steve


> >> Ah, I spaced on rcu_dereference().  The goal was to make the
> >> assignment and replacement of the seccomp_filters pointer
> >> RCU-protected (in seccomp_state) so there's no concern over it being
> >> replaced partially on platforms where pointer assignments are non-atomic
> >> - such as via /proc/<pid>/seccomp_filters access or a call via the
> >> exported symbols.  Object lifetime is managed by reference counting so
> >> that I don't have to worry about extending the RCU read-side critical
> >> section by much or deal with pre-allocations.
> >>
> >> I'll add rcu_dereference() to all the get_seccomp_filters() uses where
> >> it makes sense, so that it is called safely.  Just to make sure, does
> >> it make sense to continue to rcu protect the specific pointer?
> >
> > It might.  The usual other option is to use a lock outside of the element
> > containing the reference count to protect reference-count manipulation.
> > If there is some convenient lock, especially if it is already held where
> > needed, then locking is more straightforward.  Otherwise, RCU is usually
> > a reasonable option.
> 
> I was concerned about the overhead a lock would have at each system
> call entry, but I didn't benchmark it to see.  I'll add the
> rcu_dereference right away, then look into seeing whether there's a
> cleaner approach.  I was trying to be overly protective of mutating
> any data internal to the filters through complete replacement on any
> change.  I'll take a step back and see if
> 



^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH v3 03/13] seccomp_filters: new mode with configurable syscall filters
  2011-06-02 20:46                                                                                                                   ` Steven Rostedt
@ 2011-06-02 21:12                                                                                                                     ` Paul E. McKenney
  0 siblings, 0 replies; 406+ messages in thread
From: Paul E. McKenney @ 2011-06-02 21:12 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Will Drewry, linux-kernel, kees.cook, torvalds, tglx, mingo,
	jmorris, Peter Zijlstra, Frederic Weisbecker,
	linux-security-module

On Thu, Jun 02, 2011 at 04:46:07PM -0400, Steven Rostedt wrote:
> On Thu, 2011-06-02 at 15:28 -0500, Will Drewry wrote:
> 
> [ Snipped 860 lines of non relevant text ]
> 
> Seriously guys, Please trim your replies. These last few messages were
> ridiculous. I spent more than 30 seconds searching for what the email was
> about. That's too much wasted time.

Because every time I do trim the messages, I get a response from the
reviewee of the form "Oh, I take care of that in function foo()."
And of course function foo() will be in the part I trimmed.  So I then
have to find the earlier message, copy the function back in, and by
that time something else has distracted me.

							Thanx, Paul

> -- Steve
> 
> 
> > >> Ah, I spaced on rcu_dereference().  The goal was to make the
> > >> assignment and replacement of the seccomp_filters pointer
> > >> RCU-protected (in seccomp_state) so there's no concern over it being
> > >> replaced partially on platforms where pointer assignments are non-atomic
> > >> - such as via /proc/<pid>/seccomp_filters access or a call via the
> > >> exported symbols.  Object lifetime is managed by reference counting so
> > >> that I don't have to worry about extending the RCU read-side critical
> > >> section by much or deal with pre-allocations.
> > >>
> > >> I'll add rcu_dereference() to all the get_seccomp_filters() uses where
> > >> it makes sense, so that it is called safely.  Just to make sure, does
> > >> it make sense to continue to rcu protect the specific pointer?
> > >
> > > It might.  The usual other option is to use a lock outside of the element
> > > containing the reference count to protect reference-count manipulation.
> > > If there is some convenient lock, especially if it is already held where
> > > needed, then locking is more straightforward.  Otherwise, RCU is usually
> > > a reasonable option.
> > 
> > I was concerned about the overhead a lock would have at each system
> > call entry, but I didn't benchmark it to see.  I'll add the
> > rcu_dereference right away, then look into seeing whether there's a
> > cleaner approach.  I was trying to be overly protective of mutating
> > any data internal to the filters through complete replacement on any
> > change.  I'll take a step back and see if
> > 
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

^ permalink raw reply	[flat|nested] 406+ messages in thread

* Re: [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering
  2011-05-25 20:19                                                                                       ` Ingo Molnar
@ 2011-06-09  9:00                                                                                         ` Sven Anders
  0 siblings, 0 replies; 406+ messages in thread
From: Sven Anders @ 2011-06-09  9:00 UTC (permalink / raw)
  To: linux-kernel

Ingo Molnar <mingo <at> elte.hu> writes:

> <flame>
> 
> IMHO the key design mistake of LSM is that it detaches security 
> policy from applications: you need to be admin to load policies, you 
> need to be root to use/configure an LSM. Dammit, you need to be root 
> to add labels to files!
> 
> This not only makes the LSM policies distro specific (and needlessly 
> forked and detached from real security), but also gives the message 
> that:
> 
>  'to ensure your security you need to be privileged'
> 
> which is the anti-concept of good security IMO.
> 
> [....]
> </flame>
> 
> Thanks,
> 	Ingo


Hello!
An incomplete idea I had some time ago:

Couldn't the security information (like the selinux profiles) be
part of the binaries?

Each source package should deliver its own security information
and this should be better than adding it later, because the
developer of the program knows what his program should be allowed
to do. Moreover, if the developer changes something, he can/must
add the security information altogether.

Of course this information has to be signed in some way to
avoid tampering.

Just an idea...



^ permalink raw reply	[flat|nested] 406+ messages in thread

end of thread, other threads:[~2011-06-09  9:05 UTC | newest]

Thread overview: 406+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-04-28  3:08 [PATCH 2/7] tracing: split out syscall_trace_enter construction Will Drewry
2011-04-28  3:08 ` [PATCH 3/7] seccomp_filter: Enable ftrace-based system call filtering Will Drewry
2011-04-28 13:50   ` Steven Rostedt
2011-04-28 15:30     ` Will Drewry
2011-04-28 16:20       ` Serge E. Hallyn
2011-04-28 16:56       ` Steven Rostedt
2011-04-28 18:02         ` Will Drewry
2011-04-28 14:29   ` Frederic Weisbecker
2011-04-28 15:15     ` Will Drewry
2011-04-28 15:57       ` Frederic Weisbecker
2011-04-28 16:05         ` Will Drewry
2011-04-28 15:12   ` Frederic Weisbecker
2011-04-28 15:20     ` Frederic Weisbecker
2011-04-28 15:29     ` Will Drewry
2011-04-28 16:13       ` Frederic Weisbecker
2011-04-28 16:48         ` Will Drewry
2011-04-28 17:36           ` Frederic Weisbecker
2011-04-28 18:21             ` Will Drewry
2011-04-28 16:28   ` Steven Rostedt
2011-04-28 16:53     ` Will Drewry
2011-04-28 16:55   ` Serge E. Hallyn
2011-04-28 17:16     ` Steven Rostedt
2011-04-28 17:39       ` Serge E. Hallyn
2011-04-28 18:01         ` Will Drewry
2011-04-28 18:21           ` Steven Rostedt
2011-04-28 18:34             ` Will Drewry
2011-04-28 18:54               ` Serge E. Hallyn
2011-04-28 19:07                 ` Steven Rostedt
2011-05-12  3:02                   ` [PATCH 3/5] v2 seccomp_filters: " Will Drewry
2011-05-12  3:02                     ` Will Drewry
2011-05-12  3:02                     ` Will Drewry
2011-05-12  7:48                     ` Ingo Molnar
2011-05-12  7:48                       ` Ingo Molnar
2011-05-12  7:48                       ` Ingo Molnar
2011-05-12  9:24                       ` Kees Cook
2011-05-12  9:24                         ` Kees Cook
2011-05-12  9:24                         ` Kees Cook
2011-05-12 10:49                         ` Ingo Molnar
2011-05-12 10:49                           ` Ingo Molnar
2011-05-12 10:49                           ` Ingo Molnar
2011-05-12 11:44                       ` James Morris
2011-05-12 11:44                         ` James Morris
2011-05-12 11:44                         ` James Morris
2011-05-12 13:01                         ` Ingo Molnar
2011-05-12 13:01                           ` Ingo Molnar
2011-05-12 13:01                           ` Ingo Molnar
2011-05-12 16:26                           ` Will Drewry
2011-05-12 16:26                             ` Will Drewry
2011-05-12 16:26                             ` Will Drewry
2011-05-16 12:55                             ` Ingo Molnar
2011-05-16 12:55                               ` Ingo Molnar
2011-05-16 12:55                               ` Ingo Molnar
2011-05-16 14:42                               ` Will Drewry
2011-05-16 14:42                                 ` Will Drewry
2011-05-16 14:42                                 ` Will Drewry
2011-05-13  0:18                           ` James Morris
2011-05-13  0:18                             ` James Morris
2011-05-13  0:18                             ` James Morris
2011-05-13 12:10                             ` Ingo Molnar
2011-05-13 12:10                               ` Ingo Molnar
2011-05-13 12:10                               ` Ingo Molnar
2011-05-13 12:19                               ` Peter Zijlstra
2011-05-13 12:19                                 ` Peter Zijlstra
2011-05-13 12:19                                 ` Peter Zijlstra
2011-05-13 12:26                                 ` Ingo Molnar
2011-05-13 12:26                                   ` Ingo Molnar
2011-05-13 12:26                                   ` Ingo Molnar
2011-05-13 12:39                                   ` Peter Zijlstra
2011-05-13 12:39                                     ` Peter Zijlstra
2011-05-13 12:39                                     ` Peter Zijlstra
2011-05-13 12:43                                     ` Peter Zijlstra
2011-05-13 12:43                                       ` Peter Zijlstra
2011-05-13 12:43                                       ` Peter Zijlstra
2011-05-13 12:54                                       ` Ingo Molnar
2011-05-13 12:54                                         ` Ingo Molnar
2011-05-13 12:54                                         ` Ingo Molnar
2011-05-13 13:08                                         ` Peter Zijlstra
2011-05-13 13:08                                           ` Peter Zijlstra
2011-05-13 13:08                                           ` Peter Zijlstra
2011-05-13 13:18                                           ` Ingo Molnar
2011-05-13 13:18                                             ` Ingo Molnar
2011-05-13 13:18                                             ` Ingo Molnar
2011-05-13 13:55                                             ` Peter Zijlstra
2011-05-13 13:55                                               ` Peter Zijlstra
2011-05-13 13:55                                               ` Peter Zijlstra
2011-05-13 14:57                                               ` Ingo Molnar
2011-05-13 14:57                                                 ` Ingo Molnar
2011-05-13 14:57                                                 ` Ingo Molnar
2011-05-13 15:27                                                 ` Peter Zijlstra
2011-05-13 15:27                                                   ` Peter Zijlstra
2011-05-13 15:27                                                   ` Peter Zijlstra
2011-05-14  7:05                                                   ` Ingo Molnar
2011-05-14  7:05                                                     ` Ingo Molnar
2011-05-14  7:05                                                     ` Ingo Molnar
2011-05-16 16:23                                                 ` Steven Rostedt
2011-05-16 16:23                                                   ` Steven Rostedt
2011-05-16 16:23                                                   ` Steven Rostedt
2011-05-16 16:52                                                   ` Ingo Molnar
2011-05-16 16:52                                                     ` Ingo Molnar
2011-05-16 16:52                                                     ` Ingo Molnar
2011-05-16 17:03                                                     ` Steven Rostedt
2011-05-16 17:03                                                       ` Steven Rostedt
2011-05-16 17:03                                                       ` Steven Rostedt
2011-05-17 12:42                                                       ` Ingo Molnar
2011-05-17 12:42                                                         ` Ingo Molnar
2011-05-17 12:42                                                         ` Ingo Molnar
2011-05-17 13:05                                                         ` Steven Rostedt
2011-05-17 13:05                                                           ` Steven Rostedt
2011-05-17 13:05                                                           ` Steven Rostedt
2011-05-17 13:19                                                           ` Ingo Molnar
2011-05-17 13:19                                                             ` Ingo Molnar
2011-05-17 13:19                                                             ` Ingo Molnar
2011-05-19  4:07                                                             ` Will Drewry
2011-05-19  4:07                                                               ` Will Drewry
2011-05-19  4:07                                                               ` Will Drewry
2011-05-19 12:22                                                               ` Steven Rostedt
2011-05-19 12:22                                                                 ` Steven Rostedt
2011-05-19 12:22                                                                 ` Steven Rostedt
2011-05-19 21:05                                                                 ` Will Drewry
2011-05-19 21:05                                                                   ` Will Drewry
2011-05-19 21:05                                                                   ` Will Drewry
2011-05-24 15:59                                                                   ` Will Drewry
2011-05-24 15:59                                                                     ` Will Drewry
2011-05-24 15:59                                                                     ` Will Drewry
2011-05-24 16:20                                                                     ` Peter Zijlstra
2011-05-24 16:20                                                                       ` Peter Zijlstra
2011-05-24 16:20                                                                       ` Peter Zijlstra
2011-05-24 16:25                                                                       ` Thomas Gleixner
2011-05-24 16:25                                                                         ` Thomas Gleixner
2011-05-24 16:25                                                                         ` Thomas Gleixner
2011-05-24 19:00                                                                         ` Will Drewry
2011-05-24 19:00                                                                           ` Will Drewry
2011-05-24 19:00                                                                           ` Will Drewry
2011-05-24 19:54                                                                       ` Ingo Molnar
2011-05-24 19:54                                                                         ` Ingo Molnar
2011-05-24 19:54                                                                         ` Ingo Molnar
2011-05-24 20:10                                                                         ` Ingo Molnar
2011-05-24 20:10                                                                           ` Ingo Molnar
2011-05-24 20:10                                                                           ` Ingo Molnar
2011-05-25 10:35                                                                         ` Thomas Gleixner
2011-05-25 10:35                                                                           ` Thomas Gleixner
2011-05-25 10:35                                                                           ` Thomas Gleixner
2011-05-25 15:01                                                                           ` Ingo Molnar
2011-05-25 15:01                                                                             ` Ingo Molnar
2011-05-25 15:01                                                                             ` Ingo Molnar
2011-05-25 17:43                                                                             ` Peter Zijlstra
2011-05-25 17:43                                                                               ` Peter Zijlstra
2011-05-25 17:43                                                                               ` Peter Zijlstra
2011-05-29 20:17                                                                               ` Ingo Molnar
2011-05-29 20:17                                                                                 ` Ingo Molnar
2011-05-29 20:17                                                                                 ` Ingo Molnar
2011-05-25 17:48                                                                             ` Thomas Gleixner
2011-05-25 17:48                                                                               ` Thomas Gleixner
2011-05-25 17:48                                                                               ` Thomas Gleixner
2011-05-25 18:01                                                                               ` Kees Cook
2011-05-25 18:42                                                                                 ` Linus Torvalds
2011-05-25 19:06                                                                                   ` Ingo Molnar
2011-05-25 19:54                                                                                     ` Will Drewry
2011-05-25 19:11                                                                                   ` Kees Cook
2011-05-25 20:01                                                                                     ` Linus Torvalds
2011-05-25 20:19                                                                                       ` Ingo Molnar
2011-06-09  9:00                                                                                         ` Sven Anders
2011-05-26 14:37                                                                                       ` Colin Walters
2011-05-26 15:03                                                                                         ` Linus Torvalds
2011-05-26 15:28                                                                                           ` Colin Walters
2011-05-26 16:33                                                                                           ` Will Drewry
2011-05-26 16:46                                                                                             ` Linus Torvalds
2011-05-26 17:02                                                                                               ` Will Drewry
2011-05-26 17:04                                                                                                 ` Will Drewry
2011-05-26 17:17                                                                                                 ` Linus Torvalds
2011-05-26 17:38                                                                                                   ` Will Drewry
2011-05-26 18:33                                                                                                     ` Linus Torvalds
2011-05-26 18:47                                                                                                       ` Ingo Molnar
2011-05-26 19:05                                                                                                         ` david
2011-05-26 19:09                                                                                                           ` Eric Paris
2011-05-26 19:46                                                                                                           ` Ingo Molnar
2011-05-26 19:49                                                                                                             ` david
2011-05-26 18:49                                                                                                       ` Will Drewry
2011-06-01  3:10                                                                                                         ` [PATCH v3 01/13] tracing: split out filter initialization and clean up Will Drewry
2011-06-01  3:10                                                                                                         ` [PATCH v3 02/13] tracing: split out syscall_trace_enter construction Will Drewry
2011-06-01  7:00                                                                                                           ` Ingo Molnar
2011-06-01 17:15                                                                                                             ` Will Drewry
2011-06-02 14:29                                                                                                               ` Ingo Molnar
2011-06-02 15:18                                                                                                                 ` Will Drewry
2011-06-01  3:10                                                                                                         ` [PATCH v3 03/13] seccomp_filters: new mode with configurable syscall filters Will Drewry
2011-06-02 17:36                                                                                                           ` Paul E. McKenney
2011-06-02 18:14                                                                                                             ` Will Drewry
2011-06-02 19:42                                                                                                               ` Paul E. McKenney
2011-06-02 20:28                                                                                                                 ` Will Drewry
2011-06-02 20:46                                                                                                                   ` Steven Rostedt
2011-06-02 21:12                                                                                                                     ` Paul E. McKenney
2011-06-01  3:10                                                                                                         ` [PATCH v3 04/13] seccomp_filter: add process state reporting Will Drewry
2011-06-01  3:10                                                                                                         ` [PATCH v3 05/13] seccomp_filter: Document what seccomp_filter is and how it works Will Drewry
2011-06-01 21:23                                                                                                           ` Kees Cook
2011-06-01 23:03                                                                                                             ` Will Drewry
2011-06-01  3:10                                                                                                         ` [PATCH v3 06/13] x86: add HAVE_SECCOMP_FILTER and seccomp_execve Will Drewry
2011-06-01  3:10                                                                                                         ` [PATCH v3 07/13] arm: select HAVE_SECCOMP_FILTER Will Drewry
2011-06-01  3:10                                                                                                           ` Will Drewry
2011-06-01  3:10                                                                                                         ` [PATCH v3 08/13] microblaze: select HAVE_SECCOMP_FILTER and provide seccomp_execve Will Drewry
2011-06-01  5:37                                                                                                           ` Michal Simek
2011-06-01  3:10                                                                                                         ` [PATCH v3 09/13] mips: " Will Drewry
2011-06-01  3:10                                                                                                         ` [PATCH v3 10/13] s390: " Will Drewry
2011-06-01  3:10                                                                                                         ` [PATCH v3 11/13] powerpc: " Will Drewry
2011-06-01  3:10                                                                                                           ` Will Drewry
2011-06-01  3:10                                                                                                         ` [PATCH v3 12/13] sparc: " Will Drewry
2011-06-01  3:10                                                                                                           ` Will Drewry
2011-06-01  3:35                                                                                                           ` David Miller
2011-06-01  3:35                                                                                                             ` [PATCH v3 12/13] sparc: select HAVE_SECCOMP_FILTER and provide David Miller
2011-06-01  3:10                                                                                                         ` [PATCH v3 13/13] sh: select HAVE_SECCOMP_FILTER Will Drewry
2011-06-01  3:10                                                                                                           ` Will Drewry
2011-06-02  5:27                                                                                                           ` Paul Mundt
2011-06-02  5:27                                                                                                             ` Paul Mundt
2011-05-26 17:38                                                                                                 ` [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering Valdis.Kletnieks
2011-05-26 18:08                                                                                                   ` Will Drewry
2011-05-26 18:22                                                                                                     ` Valdis.Kletnieks
2011-05-26 17:07                                                                                               ` Steven Rostedt
2011-05-26 18:43                                                                                                 ` Casey Schaufler
2011-05-26 18:54                                                                                                   ` Steven Rostedt
2011-05-26 18:34                                                                                               ` david
2011-05-26 18:54                                                                                               ` Ingo Molnar
2011-05-26  1:19                                                                                   ` James Morris
2011-05-26  6:08                                                                                     ` Avi Kivity
2011-05-26  8:24                                                                                     ` Ingo Molnar
2011-05-26  8:35                                                                                       ` Pekka Enberg
2011-05-26  8:49                                                                                       ` Avi Kivity
2011-05-26  8:57                                                                                         ` Pekka Enberg
     [not found]                                                                                           ` <20110526085939.GG29458@redhat.com>
2011-05-26 10:38                                                                                             ` Ingo Molnar
2011-05-26 10:46                                                                                               ` Avi Kivity
2011-05-26 10:46                                                                                               ` Gleb Natapov
2011-05-26 11:11                                                                                                 ` Ingo Molnar
2011-05-26  9:30                                                                                         ` Ingo Molnar
2011-05-26  9:48                                                                                           ` Ingo Molnar
2011-05-26 11:02                                                                                             ` Avi Kivity
2011-05-26 11:16                                                                                               ` Ingo Molnar
2011-05-26 10:56                                                                                           ` Avi Kivity
2011-05-26 11:38                                                                                             ` Ingo Molnar
2011-05-26 18:06                                                                                               ` Avi Kivity
2011-05-26 18:15                                                                                                 ` Ingo Molnar
2011-05-26 18:20                                                                                                   ` Avi Kivity
2011-05-26 18:36                                                                                                     ` Ingo Molnar
2011-05-26 18:43                                                                                                       ` Valdis.Kletnieks
2011-05-26 18:50                                                                                                         ` Ingo Molnar
2011-05-26 18:22                                                                                                   ` Peter Zijlstra
2011-05-26 18:38                                                                                                     ` Ingo Molnar
2011-05-27  0:12                                                                                                       ` James Morris
2011-05-29 16:51                                                                                   ` Aneesh Kumar K.V
2011-05-29 17:02                                                                                     ` Linus Torvalds
2011-05-29 18:23                                                                                       ` Al Viro
2011-05-26  8:43                                                                               ` Ingo Molnar
2011-05-26  8:43                                                                                 ` Ingo Molnar
2011-05-26  8:43                                                                                 ` Ingo Molnar
2011-05-26  9:15                                                                               ` Ingo Molnar
2011-05-26  9:15                                                                                 ` Ingo Molnar
2011-05-26  9:15                                                                                 ` Ingo Molnar
2011-05-24 20:08                                                                     ` Ingo Molnar
2011-05-24 20:08                                                                       ` Ingo Molnar
2011-05-24 20:08                                                                       ` Ingo Molnar
2011-05-24 20:14                                                                       ` Steven Rostedt
2011-05-24 20:14                                                                         ` Steven Rostedt
2011-05-24 20:14                                                                         ` Steven Rostedt
2011-05-24 20:25                                                                       ` Kees Cook
2011-05-25 19:09                                                                         ` Ingo Molnar
2011-05-25 16:40                                                                       ` Will Drewry
2011-05-13 15:17                                             ` Eric Paris
2011-05-13 15:17                                               ` Eric Paris
2011-05-13 15:17                                               ` Eric Paris
2011-05-13 15:29                                               ` [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system callfiltering David Laight
2011-05-13 15:29                                                 ` David Laight
2011-05-13 15:29                                                 ` David Laight
2011-05-13 15:29                                                 ` David Laight
2011-05-16 12:03                                                 ` Ingo Molnar
2011-05-16 12:03                                                   ` Ingo Molnar
2011-05-16 12:03                                                   ` Ingo Molnar
2011-05-13 12:49                                     ` [PATCH 3/5] v2 seccomp_filters: Enable ftrace-based system call filtering Ingo Molnar
2011-05-13 12:49                                       ` Ingo Molnar
2011-05-13 12:49                                       ` Ingo Molnar
2011-05-13 13:55                                       ` Peter Zijlstra
2011-05-13 13:55                                         ` Peter Zijlstra
2011-05-13 13:55                                         ` Peter Zijlstra
2011-05-13 15:02                                         ` Ingo Molnar
2011-05-13 15:02                                           ` Ingo Molnar
2011-05-13 15:02                                           ` Ingo Molnar
2011-05-13 15:10                               ` Eric Paris
2011-05-13 15:10                                 ` Eric Paris
2011-05-13 15:10                                 ` Eric Paris
2011-05-13 15:23                                 ` Peter Zijlstra
2011-05-13 15:23                                   ` Peter Zijlstra
2011-05-13 15:23                                   ` Peter Zijlstra
2011-05-13 15:55                                   ` Eric Paris
2011-05-13 15:55                                     ` Eric Paris
2011-05-13 15:55                                     ` Eric Paris
2011-05-13 16:29                                     ` Will Drewry
2011-05-13 16:29                                       ` Will Drewry
2011-05-13 16:29                                       ` Will Drewry
2011-05-14  7:30                                 ` Ingo Molnar
2011-05-14  7:30                                   ` Ingo Molnar
2011-05-14  7:30                                   ` Ingo Molnar
2011-05-14 20:57                                   ` Will Drewry
2011-05-14 20:57                                     ` Will Drewry
2011-05-14 20:57                                     ` Will Drewry
2011-05-14 20:57                                     ` Will Drewry
2011-05-16 12:43                                     ` Ingo Molnar
2011-05-16 12:43                                       ` Ingo Molnar
2011-05-16 12:43                                       ` Ingo Molnar
2011-05-16 15:29                                       ` Will Drewry
2011-05-16 15:29                                         ` Will Drewry
2011-05-16 15:29                                         ` Will Drewry
2011-05-17 12:57                                         ` Ingo Molnar
2011-05-17 12:57                                           ` Ingo Molnar
2011-05-17 12:57                                           ` Ingo Molnar
2011-05-16  0:36                               ` James Morris
2011-05-16  0:36                                 ` James Morris
2011-05-16  0:36                                 ` James Morris
2011-05-16 15:08                                 ` Ingo Molnar
2011-05-16 15:08                                   ` Ingo Molnar
2011-05-16 15:08                                   ` Ingo Molnar
2011-05-17  2:24                                   ` James Morris
2011-05-17  2:24                                     ` James Morris
2011-05-17  2:24                                     ` James Morris
2011-05-17 13:10                                     ` Ingo Molnar
2011-05-17 13:10                                       ` Ingo Molnar
2011-05-17 13:10                                       ` Ingo Molnar
2011-05-17 13:29                                       ` James Morris
2011-05-17 13:29                                         ` James Morris
2011-05-17 13:29                                         ` James Morris
2011-05-17 13:29                                         ` James Morris
2011-05-17 18:34                                         ` Ingo Molnar
2011-05-17 18:34                                           ` Ingo Molnar
2011-05-17 18:34                                           ` Ingo Molnar
2011-05-26  6:27                                 ` Pavel Machek
2011-05-26  6:27                                   ` Pavel Machek
2011-05-26  6:27                                   ` Pavel Machek
2011-05-26  8:35                                   ` Ingo Molnar
2011-05-26  8:35                                     ` Ingo Molnar
2011-05-26  8:35                                     ` Ingo Molnar
2011-05-12 12:15                       ` Frederic Weisbecker
2011-05-12 12:15                         ` Frederic Weisbecker
2011-05-12 12:15                         ` Frederic Weisbecker
2011-05-12 11:33                     ` James Morris
2011-05-12 11:33                       ` James Morris
2011-05-12 11:33                       ` James Morris
2011-05-13 19:35                     ` Arnd Bergmann
2011-05-13 19:35                       ` Arnd Bergmann
2011-05-13 19:35                       ` Arnd Bergmann
2011-05-14 20:58                       ` Will Drewry
2011-05-14 20:58                         ` Will Drewry
2011-05-14 20:58                         ` Will Drewry
2011-05-15  6:42                         ` Arnd Bergmann
2011-05-15  6:42                           ` Arnd Bergmann
2011-05-15  6:42                           ` Arnd Bergmann
2011-05-16 12:00                           ` Ingo Molnar
2011-05-16 12:00                             ` Ingo Molnar
2011-05-16 12:00                             ` Ingo Molnar
2011-05-16 15:26                     ` Steven Rostedt
2011-05-16 15:26                       ` Steven Rostedt
2011-05-16 15:26                       ` Steven Rostedt
2011-05-16 15:28                       ` Will Drewry
2011-05-16 15:28                         ` Will Drewry
2011-05-16 15:28                         ` Will Drewry
2011-04-28 19:06               ` [PATCH 3/7] seccomp_filter: " Steven Rostedt
2011-04-28 18:51           ` Serge E. Hallyn
2011-05-03  8:39   ` Avi Kivity
2011-04-28  3:08 ` [PATCH 4/7] seccomp_filter: add process state reporting Will Drewry
2011-04-28  3:21   ` KOSAKI Motohiro
2011-04-28  3:24     ` Will Drewry
2011-04-28  3:40       ` Al Viro
2011-04-28  3:43         ` Will Drewry
2011-04-28 22:54       ` James Morris
2011-05-02 10:08         ` Will Drewry
2011-05-12  3:04   ` [PATCH 4/5] v2 " Will Drewry
2011-04-28  3:08 ` [PATCH 5/7] seccomp_filter: Document what seccomp_filter is and how it works Will Drewry
2011-04-28  7:06   ` Ingo Molnar
2011-04-28 14:56     ` Eric Paris
2011-04-28 18:37       ` Will Drewry
2011-04-29 13:18         ` Frederic Weisbecker
2011-04-29 16:13           ` Will Drewry
2011-05-03  1:29             ` Frederic Weisbecker
2011-05-03  1:47               ` Frederic Weisbecker
2011-05-04  9:15                 ` Will Drewry
2011-05-04  9:29                   ` Will Drewry
2011-05-04 17:52                   ` Frederic Weisbecker
2011-05-04 18:23                     ` Steven Rostedt
2011-05-04 18:30                       ` Frederic Weisbecker
2011-05-04 18:46                         ` Steven Rostedt
2011-05-05  9:21                           ` Will Drewry
2011-05-05 13:14                             ` Serge E. Hallyn
2011-05-12  3:20                               ` Will Drewry
2011-05-06 11:53                             ` Steven Rostedt
2011-05-06 13:35                               ` Eric Paris
2011-05-07  1:58                               ` Will Drewry
2011-05-12  3:04                                 ` [PATCH 5/5] v2 " Will Drewry
2011-05-06 16:30                             ` [PATCH 5/7] " Eric Paris
2011-05-07  2:11                               ` Will Drewry
2011-05-04 12:16                 ` Steven Rostedt
2011-05-04 15:54                   ` Eric Paris
2011-05-04 16:06                     ` Steven Rostedt
2011-05-04 16:22                       ` Eric Paris
2011-05-04 16:39                         ` Steven Rostedt
2011-05-04 18:02                           ` Eric Paris
2011-05-04 17:03                         ` Frederic Weisbecker
2011-05-04 17:55                           ` Eric Paris
2011-04-28 17:43     ` Serge E. Hallyn
2011-04-28 15:46   ` Randy Dunlap
2011-04-28 18:23     ` Will Drewry
2011-04-28  3:08 ` [PATCH 6/7] include/linux/syscalls.h: add __ layer of macros with return types Will Drewry
2011-04-28  3:08 ` [PATCH 7/7] arch/x86: hook int returning system calls Will Drewry

This is an external index of several public inboxes;
see the mirroring instructions for how to clone and mirror
all data and code used by this external index.