All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v9 01/13] tracing: split out filter initialization and clean up uses
@ 2011-06-24  0:36 Will Drewry
  2011-06-24  0:36 ` [PATCH v9 02/13] tracing: split out syscall_trace_enter construction Will Drewry
                   ` (11 more replies)
  0 siblings, 12 replies; 51+ messages in thread
From: Will Drewry @ 2011-06-24  0:36 UTC (permalink / raw)
  To: linux-kernel
  Cc: torvalds, djm, segoon, kees.cook, mingo, rostedt, jmorris,
	fweisbec, tglx, scarybeasts, Will Drewry, Ingo Molnar,
	Peter Zijlstra, Namhyung Kim, Borislav Petkov, Ben Blum

Moves the perf-specific profile event allocation and freeing code into
kernel/events/core.c, where it is called from, and exports three symbols
via ftrace_event.h for instantiating struct event_filters without
requiring a change to the core tracing code.

The change allows globally registered ftrace events to be used in
event_filter structs.  perf is the current consumer, but a possible
future consumer is system call filtering using the secure computing
hooks (and the existing syscalls subsystem events).

v9: rebase on to bccaeafd7c117acee36e90d37c7e05c19be9e7bf

Signed-off-by: Will Drewry <wad@chromium.org>
---
 include/linux/ftrace_event.h       |    9 +++--
 kernel/events/core.c               |    7 +++-
 kernel/trace/trace_events_filter.c |   60 ++++++++++++++++++++++--------------
 3 files changed, 48 insertions(+), 28 deletions(-)

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 59d3ef1..1d8daca 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -229,6 +229,12 @@ extern int filter_current_check_discard(struct ring_buffer *buffer,
 					void *rec,
 					struct ring_buffer_event *event);
 
+extern void ftrace_free_filter(struct event_filter *filter);
+extern int ftrace_parse_filter(struct event_filter **filter,
+			       int event_id,
+			       const char *filter_str);
+extern const char *ftrace_get_filter_string(const struct event_filter *filter);
+
 enum {
 	FILTER_OTHER = 0,
 	FILTER_STATIC_STRING,
@@ -279,9 +285,6 @@ extern int  perf_trace_init(struct perf_event *event);
 extern void perf_trace_destroy(struct perf_event *event);
 extern int  perf_trace_add(struct perf_event *event, int flags);
 extern void perf_trace_del(struct perf_event *event, int flags);
-extern int  ftrace_profile_set_filter(struct perf_event *event, int event_id,
-				     char *filter_str);
-extern void ftrace_profile_free_filter(struct perf_event *event);
 extern void *perf_trace_buf_prepare(int size, unsigned short type,
 				    struct pt_regs *regs, int *rctxp);
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 9efe710..a18af25 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5580,7 +5580,8 @@ static int perf_event_set_filter(struct perf_event *event, void __user *arg)
 	if (IS_ERR(filter_str))
 		return PTR_ERR(filter_str);
 
-	ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
+	ret = ftrace_parse_filter(&event->filter, event->attr.config,
+				  filter_str);
 
 	kfree(filter_str);
 	return ret;
@@ -5588,7 +5589,9 @@ static int perf_event_set_filter(struct perf_event *event, void __user *arg)
 
 static void perf_event_free_filter(struct perf_event *event)
 {
-	ftrace_profile_free_filter(event);
+	struct event_filter *filter = event->filter;
+	event->filter = NULL;
+	ftrace_free_filter(filter);
 }
 
 #else
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 8008ddc..787b174 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -522,7 +522,7 @@ static void remove_filter_string(struct event_filter *filter)
 }
 
 static int replace_filter_string(struct event_filter *filter,
-				 char *filter_string)
+				 const char *filter_string)
 {
 	kfree(filter->filter_string);
 	filter->filter_string = kstrdup(filter_string, GFP_KERNEL);
@@ -1936,21 +1936,27 @@ out_unlock:
 	return err;
 }
 
-#ifdef CONFIG_PERF_EVENTS
-
-void ftrace_profile_free_filter(struct perf_event *event)
+/* ftrace_free_filter - frees a parsed filter and its internal structures.
+ *
+ * @filter: pointer to the event_filter to free.
+ */
+void ftrace_free_filter(struct event_filter *filter)
 {
-	struct event_filter *filter = event->filter;
-
-	event->filter = NULL;
-	__free_filter(filter);
+	if (filter)
+		__free_filter(filter);
 }
+EXPORT_SYMBOL_GPL(ftrace_free_filter);
 
-int ftrace_profile_set_filter(struct perf_event *event, int event_id,
-			      char *filter_str)
+/* ftrace_parse_filter - allocates and populates a new event_filter
+ *
+ * @event_id: may be something like syscalls::sys_event_tkill's id.
+ * @filter_str: filter string; it is copied, so ownership is NOT taken.
+ */
+int ftrace_parse_filter(struct event_filter **filter,
+			int event_id,
+			const char *filter_str)
 {
 	int err;
-	struct event_filter *filter;
 	struct filter_parse_state *ps;
 	struct ftrace_event_call *call = NULL;
 
@@ -1966,12 +1972,12 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
 		goto out_unlock;
 
 	err = -EEXIST;
-	if (event->filter)
+	if (*filter)
 		goto out_unlock;
 
-	filter = __alloc_filter();
-	if (!filter) {
-		err = PTR_ERR(filter);
+	*filter = __alloc_filter();
+	if (IS_ERR_OR_NULL(*filter)) {
+		err = PTR_ERR(*filter);
 		goto out_unlock;
 	}
 
@@ -1980,14 +1986,14 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
 	if (!ps)
 		goto free_filter;
 
-	parse_init(ps, filter_ops, filter_str);
+	replace_filter_string(*filter, filter_str);
+
+	parse_init(ps, filter_ops, (*filter)->filter_string);
 	err = filter_parse(ps);
 	if (err)
 		goto free_ps;
 
-	err = replace_preds(call, filter, ps, filter_str, false);
-	if (!err)
-		event->filter = filter;
+	err = replace_preds(call, *filter, ps, (*filter)->filter_string, false);
 
 free_ps:
 	filter_opstack_clear(ps);
@@ -1995,14 +2001,22 @@ free_ps:
 	kfree(ps);
 
 free_filter:
-	if (err)
-		__free_filter(filter);
+	if (err) {
+		__free_filter(*filter);
+		*filter = NULL;
+	}
 
 out_unlock:
 	mutex_unlock(&event_mutex);
 
 	return err;
 }
+EXPORT_SYMBOL_GPL(ftrace_parse_filter);
 
-#endif /* CONFIG_PERF_EVENTS */
-
+const char *ftrace_get_filter_string(const struct event_filter *filter)
+{
+	if (!filter)
+		return NULL;
+	return filter->filter_string;
+}
+EXPORT_SYMBOL_GPL(ftrace_get_filter_string);
-- 
1.7.0.4


^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH v9 02/13] tracing: split out syscall_trace_enter construction
  2011-06-24  0:36 [PATCH v9 01/13] tracing: split out filter initialization and clean up uses Will Drewry
@ 2011-06-24  0:36 ` Will Drewry
  2011-06-24  0:36 ` [PATCH v9 03/13] seccomp_filter: new mode with configurable syscall filters Will Drewry
                   ` (10 subsequent siblings)
  11 siblings, 0 replies; 51+ messages in thread
From: Will Drewry @ 2011-06-24  0:36 UTC (permalink / raw)
  To: linux-kernel
  Cc: torvalds, djm, segoon, kees.cook, mingo, rostedt, jmorris,
	fweisbec, tglx, scarybeasts, Will Drewry, Ingo Molnar

perf appears to be the primary consumer of the CONFIG_FTRACE_SYSCALLS
infrastructure.  As such, many of the helpers targeted at perf can be split
into a perf-focused helper and a generic CONFIG_FTRACE_SYSCALLS
consumer interface.

This change splits out syscall_trace_enter construction from
perf_syscall_enter for current into two helpers:
- ftrace_syscall_enter_state
- ftrace_syscall_enter_state_size

And adds another helper for completeness:
- ftrace_syscall_exit_state_size

These helpers allow for shared code between perf ftrace events and
any other consumers of CONFIG_FTRACE_SYSCALLS events.  The proposed
seccomp_filter patches use this code.

v9: rebase on to bccaeafd7c117acee36e90d37c7e05c19be9e7bf

Signed-off-by: Will Drewry <wad@chromium.org>
---
 include/trace/syscall.h       |    4 ++
 kernel/trace/trace_syscalls.c |   96 +++++++++++++++++++++++++++++++++++------
 2 files changed, 86 insertions(+), 14 deletions(-)

diff --git a/include/trace/syscall.h b/include/trace/syscall.h
index 31966a4..242ae04 100644
--- a/include/trace/syscall.h
+++ b/include/trace/syscall.h
@@ -41,6 +41,10 @@ extern int reg_event_syscall_exit(struct ftrace_event_call *call);
 extern void unreg_event_syscall_exit(struct ftrace_event_call *call);
 extern int
 ftrace_format_syscall(struct ftrace_event_call *call, struct trace_seq *s);
+extern int ftrace_syscall_enter_state(u8 *buf, size_t available,
+				      struct trace_entry **entry);
+extern size_t ftrace_syscall_enter_state_size(int nb_args);
+extern size_t ftrace_syscall_exit_state_size(void);
 enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags,
 				      struct trace_event *event);
 enum print_line_t print_syscall_exit(struct trace_iterator *iter, int flags,
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index ee7b5a0..f37f120 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -95,7 +95,7 @@ find_syscall_meta(unsigned long syscall)
 	return NULL;
 }
 
-static struct syscall_metadata *syscall_nr_to_meta(int nr)
+struct syscall_metadata *syscall_nr_to_meta(int nr)
 {
 	if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
 		return NULL;
@@ -498,7 +498,7 @@ static int sys_perf_refcount_exit;
 static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 {
 	struct syscall_metadata *sys_data;
-	struct syscall_trace_enter *rec;
+	void *buf;
 	struct hlist_head *head;
 	int syscall_nr;
 	int rctx;
@@ -513,25 +513,22 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 		return;
 
 	/* get the size after alignment with the u32 buffer size field */
-	size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
-	size = ALIGN(size + sizeof(u32), sizeof(u64));
-	size -= sizeof(u32);
+	size = ftrace_syscall_enter_state_size(sys_data->nb_args);
 
 	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
 		      "perf buffer not large enough"))
 		return;
 
-	rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
-				sys_data->enter_event->event.type, regs, &rctx);
-	if (!rec)
+	buf = perf_trace_buf_prepare(size, sys_data->enter_event->event.type,
+				     regs, &rctx);
+	if (!buf)
 		return;
 
-	rec->nr = syscall_nr;
-	syscall_get_arguments(current, regs, 0, sys_data->nb_args,
-			       (unsigned long *)&rec->args);
+	/* The only error conditions in this helper are handled above. */
+	ftrace_syscall_enter_state(buf, size, NULL);
 
 	head = this_cpu_ptr(sys_data->enter_event->perf_events);
-	perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
+	perf_trace_buf_submit(buf, size, rctx, 0, 1, regs, head);
 }
 
 int perf_sysenter_enable(struct ftrace_event_call *call)
@@ -587,8 +584,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
 		return;
 
 	/* We can probably do that at build time */
-	size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
-	size -= sizeof(u32);
+	size = ftrace_syscall_exit_state_size();
 
 	/*
 	 * Impossible, but be paranoid with the future
@@ -688,3 +684,75 @@ static int syscall_exit_register(struct ftrace_event_call *event,
 	}
 	return 0;
 }
+
+/* ftrace_syscall_enter_state_size - returns the state size required.
+ *
+ * @nb_args: number of system call args expected.
+ *           a negative value implies the maximum allowed.
+ */
+size_t ftrace_syscall_enter_state_size(int nb_args)
+{
+	/* syscall_get_arguments only supports up to 6 arguments. */
+	int arg_count = (nb_args >= 0 ? nb_args : 6);
+	size_t size = (sizeof(unsigned long) * arg_count) +
+		      sizeof(struct syscall_trace_enter);
+	size = ALIGN(size + sizeof(u32), sizeof(u64));
+	size -= sizeof(u32);
+	return size;
+}
+EXPORT_SYMBOL_GPL(ftrace_syscall_enter_state_size);
+
+size_t ftrace_syscall_exit_state_size(void)
+{
+	return ALIGN(sizeof(struct syscall_trace_exit) + sizeof(u32),
+		     sizeof(u64)) - sizeof(u32);
+}
+EXPORT_SYMBOL_GPL(ftrace_syscall_exit_state_size);
+
+/* ftrace_syscall_enter_state - build state for filter matching
+ *
+ * @buf: buffer to populate with current task state for matching
+ * @available: size available for use in the buffer.
+ * @entry: optional pointer to the trace_entry member of the state.
+ *
+ * Returns 0 on success and non-zero otherwise.
+ * If @entry is NULL, it will be ignored.
+ */
+int ftrace_syscall_enter_state(u8 *buf, size_t available,
+			       struct trace_entry **entry)
+{
+	struct syscall_trace_enter *sys_enter;
+	struct syscall_metadata *sys_data;
+	int size;
+	int syscall_nr;
+	struct pt_regs *regs = task_pt_regs(current);
+
+	syscall_nr = syscall_get_nr(current, regs);
+	if (syscall_nr < 0)
+		return -EINVAL;
+
+	sys_data = syscall_nr_to_meta(syscall_nr);
+	if (!sys_data)
+		return -EINVAL;
+
+	/* Determine the actual size needed. */
+	size = sizeof(unsigned long) * sys_data->nb_args +
+	       sizeof(struct syscall_trace_enter);
+	size = ALIGN(size + sizeof(u32), sizeof(u64));
+	size -= sizeof(u32);
+
+	BUG_ON(size > available);
+	sys_enter = (struct syscall_trace_enter *)buf;
+
+	/* Populating the struct trace_entry is left to the caller, but
+	 * a pointer is returned to encourage opacity.
+	 */
+	if (entry)
+		*entry = &sys_enter->ent;
+
+	sys_enter->nr = syscall_nr;
+	syscall_get_arguments(current, regs, 0, sys_data->nb_args,
+			      sys_enter->args);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ftrace_syscall_enter_state);
-- 
1.7.0.4


^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH v9 03/13] seccomp_filter: new mode with configurable syscall filters
  2011-06-24  0:36 [PATCH v9 01/13] tracing: split out filter initialization and clean up uses Will Drewry
  2011-06-24  0:36 ` [PATCH v9 02/13] tracing: split out syscall_trace_enter construction Will Drewry
@ 2011-06-24  0:36 ` Will Drewry
  2011-06-24  7:30   ` Damien Miller
  2011-06-24 20:20   ` Kees Cook
  2011-06-24  0:36 ` [PATCH v9 04/13] seccomp_filter: add process state reporting Will Drewry
                   ` (9 subsequent siblings)
  11 siblings, 2 replies; 51+ messages in thread
From: Will Drewry @ 2011-06-24  0:36 UTC (permalink / raw)
  To: linux-kernel
  Cc: torvalds, djm, segoon, kees.cook, mingo, rostedt, jmorris,
	fweisbec, tglx, scarybeasts, Will Drewry, Peter Zijlstra,
	Paul E. McKenney, Andrew Morton, David Howells, Eric Paris,
	Borislav Petkov, Michal Marek, Mike Galbraith, Serge E. Hallyn,
	Jiri Slaby, Greg Kroah-Hartman, linux-security-module

This change adds a new seccomp mode which specifies the allowed system
calls dynamically.  When in the new mode (2), all system calls are
checked against process-defined filters - first by system call number,
then by a filter string.  If an entry exists for a given system call and
all filter predicates evaluate to true, then the task may proceed.
Otherwise, the task is killed.

Filter string parsing and evaluation is handled by the ftrace filter
engine.  Related patches tweak the perf filter trace and free code,
allowing the calls to be shared. Filters inherit their understanding of
types and arguments for each system call from the CONFIG_FTRACE_SYSCALLS
subsystem which already populates this information in syscall_metadata
associated enter_event (and exit_event) structures. If
CONFIG_FTRACE_SYSCALLS is not compiled in, only filter strings of "1"
will be allowed.

The net result is a process may have its system calls filtered using the
ftrace filter engine's inherent understanding of system calls.  The set
of filters is specified through the PR_SET_SECCOMP_FILTER argument in
prctl(). For example, a filterset for a process, like pdftotext, that
should only process read-only input could (roughly) look like:
  sprintf(rdonly, "flags == %u", O_RDONLY|O_LARGEFILE);
  type = PR_SECCOMP_FILTER_SYSCALL;
  prctl(PR_SET_SECCOMP_FILTER, type, __NR_open, rdonly);
  prctl(PR_SET_SECCOMP_FILTER, type, __NR__llseek, "1");
  prctl(PR_SET_SECCOMP_FILTER, type, __NR_brk, "1");
  prctl(PR_SET_SECCOMP_FILTER, type, __NR_close, "1");
  prctl(PR_SET_SECCOMP_FILTER, type, __NR_exit_group, "1");
  prctl(PR_SET_SECCOMP_FILTER, type, __NR_fstat64, "1");
  prctl(PR_SET_SECCOMP_FILTER, type, __NR_mmap2, "1");
  prctl(PR_SET_SECCOMP_FILTER, type, __NR_munmap, "1");
  prctl(PR_SET_SECCOMP_FILTER, type, __NR_read, "1");
  prctl(PR_SET_SECCOMP_FILTER, type, __NR_write, "fd == 1 || fd == 2");
  prctl(PR_SET_SECCOMP, 2);

Subsequent calls to PR_SET_SECCOMP_FILTER for the same system call will
be &&'d together to ensure that attack surface may only be reduced:
  prctl(PR_SET_SECCOMP_FILTER, type, __NR_write, "fd != 2");

With the earlier example, the active filter becomes:
  "(fd == 1 || fd == 2) && (fd != 2)"

The patch also adds PR_CLEAR_SECCOMP_FILTER and PR_GET_SECCOMP_FILTER.
The latter returns the current filter for a system call to userspace:

  prctl(PR_GET_SECCOMP_FILTER, type, __NR_write, buf, bufsize);

while the former clears any filters for a given system call changing it
back to a default deny:

  prctl(PR_CLEAR_SECCOMP_FILTER, type, __NR_write);

Note, type may be either PR_SECCOMP_FILTER_EVENT or
PR_SECCOMP_FILTER_SYSCALL.  This allows for ftrace event ids to be used
in lieu of system call numbers.  At present, only syscalls:sys_enter_*
event ids are supported, but this allows for potential future extension
of the backend.

v9: - rebase on to bccaeafd7c117acee36e90d37c7e05c19be9e7bf
    - disallow PR_SECCOMP_FILTER_EVENT when a compat task is calling
      as ftrace has no compat_syscalls awareness yet.
    - return -ENOSYS when filter engine strings are used on a compat call
      as there are no compat_syscalls events to reference yet.
v8: - expand parenthetical use during SET_SECCOMP_FILTER to avoid operator
      precedence undermining attack surface reduction (caught by
      segoon@openwall.com).  Opted to waste bytes on () rather than reparse
      to avoid OP_OR precedence overriding extend_filter's intentions.
    - remove more lingering references to @state
    - fix incorrect compat mismatch check (anyone up for a Tested-By?)
v7: - disallow seccomp_filter inheritance across fork except when seccomp
      is active.  This avoids filters leaking across processes when they
      are not actively in use but ensures an allowed fork/clone doesn't drop
      filters.
    - remove the Mode: print from show as it reflected current and not the
      filters holder.
v6: - clean up minor unnecessary changes (empty lines, ordering, etc)
    - fix one overly long line
    - add refcount overflow BUG_ON
v5: - drop mutex usage when the task_struct is safe to access directly
v4: - move off of RCU to a read/write guarding mutex after
      paulmck@linux.vnet.ibm.com's feedback (mem leak, rcu fail)
    - stopped inc/dec refcounts in mutex guard sections
    - added required changes to init the mutex in INIT_TASK and safely
      lock around fork inheritance.
    - added id_type support to the prctl interface to support using
      ftrace event ids as an alternative to syscall numbers.  Behavior
      is identical otherwise (as per discussion with mingo@elte.hu)
v3: - always block execve calls (as per torvalds@linux-foundation.org)
    - add __NR_seccomp_execve(_32) to seccomp-supporting arches
    - ensure compat tasks can't reach ftrace:syscalls
    - dropped new defines for seccomp modes.
    - two level array instead of hlists (sugg. by olofj@chromium.org)
    - added generic Kconfig entry that is not connected.
    - dropped internal seccomp.h
    - move prctl helpers to seccomp_filter
    - killed seccomp_t typedef (as per checkpatch)
v2: - changed to use the existing syscall number ABI.
    - prctl changes to minimize parsing in the kernel:
      prctl(PR_SET_SECCOMP, {0 | 1 | 2 }, { 0 | ON_EXEC });
      prctl(PR_SET_SECCOMP_FILTER, __NR_read, "fd == 5");
      prctl(PR_CLEAR_SECCOMP_FILTER, __NR_read);
      prctl(PR_GET_SECCOMP_FILTER, __NR_read, buf, bufsize);
    - defined PR_SECCOMP_MODE_STRICT and ..._FILTER
    - added flags
    - provide a default fail syscall_nr_to_meta in ftrace
    - provides fallback for unhooked system calls
    - use -ENOSYS and ERR_PTR(-ENOSYS) for stubbed functionality
    - added kernel/seccomp.h to share seccomp.c/seccomp_filter.c
    - moved to a hlist and 4 bit hash of linked lists
    - added support to operate without CONFIG_FTRACE_SYSCALLS
    - moved Kconfig support next to SECCOMP
    - made Kconfig entries dependent on EXPERIMENTAL
    - added macros to avoid ifdefs from kernel/fork.c
    - added compat task/filter matching
    - drop seccomp.h inclusion in sched.h and drop seccomp_t
    - added Filtering to "show" output
    - added on_exec state dup'ing when enabling after a fast-path accept.

Signed-off-by: Will Drewry <wad@chromium.org>
---
 include/linux/init_task.h |   12 +
 include/linux/prctl.h     |    7 +
 include/linux/sched.h     |    2 +-
 include/linux/seccomp.h   |  119 ++++++-
 include/trace/syscall.h   |    7 +
 kernel/Makefile           |    3 +
 kernel/fork.c             |    4 +
 kernel/seccomp.c          |   38 ++-
 kernel/seccomp_filter.c   |  875 +++++++++++++++++++++++++++++++++++++++++++++
 kernel/sys.c              |   12 +
 security/Kconfig          |   17 +
 11 files changed, 1083 insertions(+), 13 deletions(-)
 create mode 100644 kernel/seccomp_filter.c

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 580f70c..56deaf2 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -126,6 +126,17 @@ extern struct cred init_cred;
 # define INIT_PERF_EVENTS(tsk)
 #endif
 
+#ifdef CONFIG_SECCOMP_FILTER
+# define INIT_SECCOMP_FILTER(tsk)					\
+	.seccomp = { \
+		.filters_guard = \
+			__MUTEX_INITIALIZER(tsk.seccomp.filters_guard), \
+	},
+#else
+# define INIT_SECCOMP_FILTER(tsk)
+#endif
+
+
 /*
  *  INIT_TASK is used to set up the first task table, touch at
  * your own risk!. Base=0, limit=0x1fffff (=2MB)
@@ -188,6 +199,7 @@ extern struct cred init_cred;
 	.dirties = INIT_PROP_LOCAL_SINGLE(dirties),			\
 	INIT_IDS							\
 	INIT_PERF_EVENTS(tsk)						\
+	INIT_SECCOMP_FILTER(tsk)					\
 	INIT_TRACE_IRQFLAGS						\
 	INIT_LOCKDEP							\
 	INIT_FTRACE_GRAPH						\
diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index a3baeb2..dac7413 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -64,6 +64,13 @@
 #define PR_GET_SECCOMP	21
 #define PR_SET_SECCOMP	22
 
+/* Get/set process seccomp filters */
+#define PR_GET_SECCOMP_FILTER	35
+#define PR_SET_SECCOMP_FILTER	36
+#define PR_CLEAR_SECCOMP_FILTER	37
+#  define PR_SECCOMP_FILTER_SYSCALL	0
+#  define PR_SECCOMP_FILTER_EVENT	1
+
 /* Get/set the capability bounding set (as per security/commoncap.c) */
 #define PR_CAPBSET_READ 23
 #define PR_CAPBSET_DROP 24
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a837b20..4cb3070 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1395,7 +1395,7 @@ struct task_struct {
 	uid_t loginuid;
 	unsigned int sessionid;
 #endif
-	seccomp_t seccomp;
+	struct seccomp_struct seccomp;
 
 /* Thread group tracking */
    	u32 parent_exec_id;
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index 167c333..320fc88 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -1,13 +1,35 @@
 #ifndef _LINUX_SECCOMP_H
 #define _LINUX_SECCOMP_H
 
+struct seq_file;
 
 #ifdef CONFIG_SECCOMP
 
+#include <linux/errno.h>
 #include <linux/thread_info.h>
+#include <linux/types.h>
+#include <linux/mutex.h>
 #include <asm/seccomp.h>
 
-typedef struct { int mode; } seccomp_t;
+struct seccomp_filters;
+/**
+ * struct seccomp_struct - the state of a seccomp'ed process
+ *
+ * @mode:
+ *     if this is 1, the process is under standard seccomp rules
+ *             is 2, the process is only allowed to make system calls where
+ *                   associated filters evaluate successfully.
+ * @filters: Metadata for filters if using CONFIG_SECCOMP_FILTER.
+ *           @filters assignment and use should always be guarded by
+ *           @filters_guard.
+ */
+struct seccomp_struct {
+	int mode;
+#ifdef CONFIG_SECCOMP_FILTER
+	struct mutex filters_guard;
+	struct seccomp_filters *filters;
+#endif
+};
 
 extern void __secure_computing(int);
 static inline void secure_computing(int this_syscall)
@@ -23,8 +45,7 @@ extern long prctl_set_seccomp(unsigned long);
 
 #include <linux/errno.h>
 
-typedef struct { } seccomp_t;
-
+struct seccomp_struct { };
 #define secure_computing(x) do { } while (0)
 
 static inline long prctl_get_seccomp(void)
@@ -39,4 +60,96 @@ static inline long prctl_set_seccomp(unsigned long arg2)
 
 #endif /* CONFIG_SECCOMP */
 
+#ifdef CONFIG_SECCOMP_FILTER
+
+#define seccomp_filter_init_task(_tsk) do { \
+	mutex_init(&(_tsk)->seccomp.filters_guard); \
+	(_tsk)->seccomp.filters = NULL; \
+} while (0);
+
+/* Do nothing unless seccomp filtering is active. If not, the execve boundary
+ * can not be cleanly enforced and preset filters may leak across execve calls.
+ */
+#define seccomp_filter_fork(_tsk, _orig) do { \
+	if ((_tsk)->seccomp.mode) { \
+		(_tsk)->seccomp.mode = (_orig)->seccomp.mode; \
+		mutex_lock(&(_orig)->seccomp.filters_guard); \
+		(_tsk)->seccomp.filters = \
+			get_seccomp_filters((_orig)->seccomp.filters); \
+		mutex_unlock(&(_orig)->seccomp.filters_guard); \
+	} \
+} while (0);
+
+/* No locking is needed here because the task_struct will
+ * have no parallel consumers.
+ */
+#define seccomp_filter_free_task(_tsk) do { \
+	put_seccomp_filters((_tsk)->seccomp.filters); \
+} while (0);
+
+extern int seccomp_show_filters(struct seccomp_filters *filters,
+				struct seq_file *);
+extern long seccomp_set_filter(int, char *);
+extern long seccomp_clear_filter(int);
+extern long seccomp_get_filter(int, char *, unsigned long);
+
+extern long prctl_set_seccomp_filter(unsigned long, unsigned long,
+				     char __user *);
+extern long prctl_get_seccomp_filter(unsigned long, unsigned long,
+				     char __user *, unsigned long);
+extern long prctl_clear_seccomp_filter(unsigned long, unsigned long);
+
+extern struct seccomp_filters *get_seccomp_filters(struct seccomp_filters *);
+extern void put_seccomp_filters(struct seccomp_filters *);
+
+extern int seccomp_test_filters(int);
+extern void seccomp_filter_log_failure(int);
+
+#else  /* CONFIG_SECCOMP_FILTER */
+
+struct seccomp_filters { };
+#define seccomp_filter_init_task(_tsk) do { } while (0);
+#define seccomp_filter_fork(_tsk, _orig) do { } while (0);
+#define seccomp_filter_free_task(_tsk) do { } while (0);
+
+static inline int seccomp_show_filters(struct seccomp_filters *filters,
+				       struct seq_file *m)
+{
+	return -ENOSYS;
+}
+
+static inline long seccomp_set_filter(int syscall_nr, char *filter)
+{
+	return -ENOSYS;
+}
+
+static inline long seccomp_clear_filter(int syscall_nr)
+{
+	return -ENOSYS;
+}
+
+static inline long seccomp_get_filter(int syscall_nr,
+				      char *buf, unsigned long available)
+{
+	return -ENOSYS;
+}
+
+static inline long prctl_set_seccomp_filter(unsigned long a2, unsigned long a3,
+					    char __user *a4)
+{
+	return -ENOSYS;
+}
+
+static inline long prctl_clear_seccomp_filter(unsigned long a2,
+					      unsigned long a3)
+{
+	return -ENOSYS;
+}
+
+static inline long prctl_get_seccomp_filter(unsigned long a2, unsigned long a3,
+					    char __user *a4, unsigned long a5)
+{
+	return -ENOSYS;
+}
+#endif  /* CONFIG_SECCOMP_FILTER */
 #endif /* _LINUX_SECCOMP_H */
diff --git a/include/trace/syscall.h b/include/trace/syscall.h
index 242ae04..e061ad0 100644
--- a/include/trace/syscall.h
+++ b/include/trace/syscall.h
@@ -35,6 +35,8 @@ struct syscall_metadata {
 extern unsigned long arch_syscall_addr(int nr);
 extern int init_syscall_trace(struct ftrace_event_call *call);
 
+extern struct syscall_metadata *syscall_nr_to_meta(int);
+
 extern int reg_event_syscall_enter(struct ftrace_event_call *call);
 extern void unreg_event_syscall_enter(struct ftrace_event_call *call);
 extern int reg_event_syscall_exit(struct ftrace_event_call *call);
@@ -49,6 +51,11 @@ enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags,
 				      struct trace_event *event);
 enum print_line_t print_syscall_exit(struct trace_iterator *iter, int flags,
 				     struct trace_event *event);
+#else
+static inline struct syscall_metadata *syscall_nr_to_meta(int nr)
+{
+	return NULL;
+}
 #endif
 
 #ifdef CONFIG_PERF_EVENTS
diff --git a/kernel/Makefile b/kernel/Makefile
index 2d64cfc..ffc77c8 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -79,6 +79,9 @@ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
 obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
+ifeq ($(CONFIG_SECCOMP_FILTER),y)
+obj-$(CONFIG_SECCOMP) += seccomp_filter.o
+endif
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
 obj-$(CONFIG_TREE_RCU) += rcutree.o
 obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
diff --git a/kernel/fork.c b/kernel/fork.c
index 0276c30..166cf10 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -34,6 +34,7 @@
 #include <linux/cgroup.h>
 #include <linux/security.h>
 #include <linux/hugetlb.h>
+#include <linux/seccomp.h>
 #include <linux/swap.h>
 #include <linux/syscalls.h>
 #include <linux/jiffies.h>
@@ -168,6 +169,7 @@ void free_task(struct task_struct *tsk)
 	free_thread_info(tsk->stack);
 	rt_mutex_debug_task_free(tsk);
 	ftrace_graph_exit_task(tsk);
+	seccomp_filter_free_task(tsk);
 	free_task_struct(tsk);
 }
 EXPORT_SYMBOL(free_task);
@@ -1211,6 +1213,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	/* Perform scheduler related setup. Assign this task to a CPU. */
 	sched_fork(p);
 
+	seccomp_filter_init_task(p);
 	retval = perf_event_init_task(p);
 	if (retval)
 		goto bad_fork_cleanup_policy;
@@ -1366,6 +1369,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	if (clone_flags & CLONE_THREAD)
 		threadgroup_fork_read_unlock(current);
 	perf_event_fork(p);
+	seccomp_filter_fork(p, current);
 	return p;
 
 bad_fork_free_pid:
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 57d4b13..0a942be 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -2,16 +2,20 @@
  * linux/kernel/seccomp.c
  *
  * Copyright 2004-2005  Andrea Arcangeli <andrea@cpushare.com>
+ * Copyright (C) 2011 The Chromium OS Authors <chromium-os-dev@chromium.org>
  *
  * This defines a simple but solid secure-computing mode.
  */
 
 #include <linux/seccomp.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/compat.h>
+#include <linux/unistd.h>
+#include <linux/ftrace_event.h>
 
+#define SECCOMP_MAX_FILTER_LENGTH MAX_FILTER_STR_VAL
 /* #define SECCOMP_DEBUG 1 */
-#define NR_SECCOMP_MODES 1
 
 /*
  * Secure computing mode 1 allows only read/write/exit/sigreturn.
@@ -32,10 +36,9 @@ static int mode1_syscalls_32[] = {
 
 void __secure_computing(int this_syscall)
 {
-	int mode = current->seccomp.mode;
 	int * syscall;
 
-	switch (mode) {
+	switch (current->seccomp.mode) {
 	case 1:
 		syscall = mode1_syscalls;
 #ifdef CONFIG_COMPAT
@@ -47,6 +50,17 @@ void __secure_computing(int this_syscall)
 				return;
 		} while (*++syscall);
 		break;
+#ifdef CONFIG_SECCOMP_FILTER
+	case 2:
+		if (this_syscall >= NR_syscalls || this_syscall < 0)
+			break;
+
+		if (!seccomp_test_filters(this_syscall))
+			return;
+
+		seccomp_filter_log_failure(this_syscall);
+		break;
+#endif
 	default:
 		BUG();
 	}
@@ -71,16 +85,22 @@ long prctl_set_seccomp(unsigned long seccomp_mode)
 	if (unlikely(current->seccomp.mode))
 		goto out;
 
-	ret = -EINVAL;
-	if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) {
-		current->seccomp.mode = seccomp_mode;
-		set_thread_flag(TIF_SECCOMP);
+	ret = 0;
+	switch (seccomp_mode) {
+	case 1:
 #ifdef TIF_NOTSC
 		disable_TSC();
 #endif
-		ret = 0;
+#ifdef CONFIG_SECCOMP_FILTER
+	case 2:
+#endif
+		current->seccomp.mode = seccomp_mode;
+		set_thread_flag(TIF_SECCOMP);
+		break;
+	default:
+		ret = -EINVAL;
 	}
 
- out:
+out:
 	return ret;
 }
diff --git a/kernel/seccomp_filter.c b/kernel/seccomp_filter.c
new file mode 100644
index 0000000..16a53e6
--- /dev/null
+++ b/kernel/seccomp_filter.c
@@ -0,0 +1,875 @@
+/* filter engine-based seccomp system call filtering
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) 2011 The Chromium OS Authors <chromium-os-dev@chromium.org>
+ */
+
+#include <linux/compat.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/ftrace_event.h>
+#include <linux/seccomp.h>
+#include <linux/seq_file.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/prctl.h>
+
+#include <asm/syscall.h>
+#include <trace/syscall.h>
+
+#define SECCOMP_MAX_FILTER_LENGTH MAX_FILTER_STR_VAL
+
+#define SECCOMP_FILTER_ALLOW "1"
+#define SECCOMP_ACTION_DENY 0xffff
+#define SECCOMP_ACTION_ALLOW 0xfffe
+
+/**
+ * struct seccomp_filters - container for seccomp filters
+ *
+ * @syscalls: array of 16-bit indices into @event_filters by syscall_nr
+ *            May also be SECCOMP_ACTION_DENY or SECCOMP_ACTION_ALLOW
+ * @event_filters: array of pointers to ftrace event filters
+ * @count: size of @event_filters
+ * @flags: anonymous struct to wrap filters-specific flags
+ * @usage: reference count to manage the object lifetime.
+ *         get/put helpers should be used when accessing an instance
+ *         outside of a lifetime-guarded section.  In general, this
+ *         is only needed for handling shared filters across tasks.
+ *
+ * seccomp_filters objects should never be modified after being attached
+ * to a task_struct.
+ */
+struct seccomp_filters {
+	uint16_t syscalls[NR_syscalls];
+	struct event_filter **event_filters;
+	uint16_t count;
+	struct {
+		uint32_t compat:1,
+			 __reserved:31;
+	} flags;
+	atomic_t usage;
+};
+
+/* Handle ftrace symbol non-existence */
+#ifdef CONFIG_FTRACE_SYSCALLS
+#define create_event_filter(_ef_pptr, _event_type, _str) \
+	ftrace_parse_filter(_ef_pptr, _event_type, _str)
+#define get_filter_string(_ef) ftrace_get_filter_string(_ef)
+#define free_event_filter(_f) ftrace_free_filter(_f)
+
+#else
+
+#define create_event_filter(_ef_pptr, _event_type, _str) (-ENOSYS)
+#define get_filter_string(_ef) (NULL)
+#define free_event_filter(_f) do { } while (0)
+#endif
+
+/**
+ * seccomp_filters_new - allocates a new filters object
+ * @count: count to allocate for the event_filters array
+ *
+ * Returns ERR_PTR on error or an allocated object.
+ */
+static struct seccomp_filters *seccomp_filters_new(uint16_t count)
+{
+	struct seccomp_filters *f;
+
+	if (count >= SECCOMP_ACTION_ALLOW)
+		return ERR_PTR(-EINVAL);
+
+	f = kzalloc(sizeof(struct seccomp_filters), GFP_KERNEL);
+	if (!f)
+		return ERR_PTR(-ENOMEM);
+
+	/* Lazy SECCOMP_ACTION_DENY assignment. */
+	memset(f->syscalls, 0xff, sizeof(f->syscalls));
+	atomic_set(&f->usage, 1);
+
+	f->event_filters = NULL;
+	f->count = count;
+	if (!count)
+		return f;
+
+	f->event_filters = kzalloc(count * sizeof(struct event_filter *),
+				   GFP_KERNEL);
+	if (!f->event_filters) {
+		kfree(f);
+		f = ERR_PTR(-ENOMEM);
+	}
+	return f;
+}
+
+/**
+ * seccomp_filters_free - cleans up the filter list and frees the table
+ * @filters: NULL or live object to be completely destructed.
+ */
+static void seccomp_filters_free(struct seccomp_filters *filters)
+{
+	uint16_t count = 0;
+	if (!filters)
+		return;
+	while (count < filters->count) {
+		struct event_filter *f = filters->event_filters[count];
+		free_event_filter(f);
+		count++;
+	}
+	kfree(filters->event_filters);
+	kfree(filters);
+}
+
+static void __put_seccomp_filters(struct seccomp_filters *orig)
+{
+	WARN_ON(atomic_read(&orig->usage));
+	seccomp_filters_free(orig);
+}
+
+#define seccomp_filter_allow(_id) ((_id) == SECCOMP_ACTION_ALLOW)
+#define seccomp_filter_deny(_id) ((_id) == SECCOMP_ACTION_DENY)
+#define seccomp_filter_dynamic(_id) \
+	(!seccomp_filter_allow(_id) && !seccomp_filter_deny(_id))
+static inline uint16_t seccomp_filter_id(const struct seccomp_filters *f,
+					 int syscall_nr)
+{
+	if (!f)
+		return SECCOMP_ACTION_DENY;
+	return f->syscalls[syscall_nr];
+}
+
+static inline struct event_filter *seccomp_dynamic_filter(
+		const struct seccomp_filters *filters, uint16_t id)
+{
+	if (!seccomp_filter_dynamic(id))
+		return NULL;
+	return filters->event_filters[id];
+}
+
+static inline void set_seccomp_filter_id(struct seccomp_filters *filters,
+					 int syscall_nr, uint16_t id)
+{
+	filters->syscalls[syscall_nr] = id;
+}
+
+static inline void set_seccomp_filter(struct seccomp_filters *filters,
+				      int syscall_nr, uint16_t id,
+				      struct event_filter *dynamic_filter)
+{
+	filters->syscalls[syscall_nr] = id;
+	if (seccomp_filter_dynamic(id))
+		filters->event_filters[id] = dynamic_filter;
+}
+
+static struct event_filter *alloc_event_filter(int syscall_nr,
+					       const char *filter_string)
+{
+	struct syscall_metadata *data;
+	struct event_filter *filter = NULL;
+	int err;
+
+	data = syscall_nr_to_meta(syscall_nr);
+	/* Argument-based filtering only works on ftrace-hooked syscalls. */
+	err = -ENOSYS;
+	if (!data)
+		goto fail;
+	err = create_event_filter(&filter,
+				  data->enter_event->event.type,
+				  filter_string);
+	if (err)
+		goto fail;
+
+	return filter;
+fail:
+	kfree(filter);
+	return ERR_PTR(err);
+}
+
+/**
+ * seccomp_filters_copy - copies filters from src to dst.
+ *
+ * @dst: seccomp_filters to populate.
+ * @src: table to read from.
+ * @skip: specifies an entry, by system call, to skip.
+ *
+ * Returns non-zero on failure.
+ * Both the source and the destination should have no simultaneous
+ * writers, and dst should be exclusive to the caller.
+ * If @skip is < 0, it is ignored.
+ */
+static int seccomp_filters_copy(struct seccomp_filters *dst,
+				const struct seccomp_filters *src,
+				int skip)
+{
+	int id = 0, ret = 0, nr;
+	memcpy(&dst->flags, &src->flags, sizeof(src->flags));
+	memcpy(dst->syscalls, src->syscalls, sizeof(dst->syscalls));
+	if (!src->count)
+		goto done;
+	for (nr = 0; nr < NR_syscalls; ++nr) {
+		struct event_filter *filter;
+		const char *str;
+		uint16_t src_id = seccomp_filter_id(src, nr);
+		if (nr == skip) {
+			set_seccomp_filter(dst, nr, SECCOMP_ACTION_DENY,
+					   NULL);
+			continue;
+		}
+		if (!seccomp_filter_dynamic(src_id))
+			continue;
+		if (id >= dst->count) {
+			ret = -EINVAL;
+			goto done;
+		}
+		str = get_filter_string(seccomp_dynamic_filter(src, src_id));
+		filter = alloc_event_filter(nr, str);
+		if (IS_ERR(filter)) {
+			ret = PTR_ERR(filter);
+			goto done;
+		}
+		set_seccomp_filter(dst, nr, id, filter);
+		id++;
+	}
+
+done:
+	return ret;
+}
+
+/**
+ * seccomp_extend_filter - appends more text to a syscall_nr's filter
+ * @filters: unattached filter object to operate on
+ * @syscall_nr: syscall number to update filters for
+ * @filter_string: string to append to the existing filter
+ *
+ * The new string will be &&'d to the original filter string to ensure that it
+ * always matches the existing predicates or less:
+ *   (old_filter) && @filter_string
+ * Returns 0 on success and a negative errno on failure; @filters is
+ * updated in place.
+ */
+static int seccomp_extend_filter(struct seccomp_filters *filters,
+				 int syscall_nr, char *filter_string)
+{
+	struct event_filter *filter;
+	uint16_t id = seccomp_filter_id(filters, syscall_nr);
+	char *merged = NULL;
+	int ret = -EINVAL, expected;
+
+	/* No extending with a "1". */
+	if (!strcmp(SECCOMP_FILTER_ALLOW, filter_string))
+		goto out;
+
+	/* ftrace events are not aware of CONFIG_COMPAT system calls and will
+	 * use the incorrect argument metadata if enabled.
+	 */
+	ret = -ENOSYS;
+	if (filters->flags.compat)
+		goto out;
+
+	filter = seccomp_dynamic_filter(filters, id);
+	ret = -ENOENT;
+	if (!filter)
+		goto out;
+
+	merged = kzalloc(SECCOMP_MAX_FILTER_LENGTH + 1, GFP_KERNEL);
+	ret = -ENOMEM;
+	if (!merged)
+		goto out;
+
+	/* Encapsulate the filter strings in parentheses to isolate operator
+	 * precedence behavior.
+	 */
+	expected = snprintf(merged, SECCOMP_MAX_FILTER_LENGTH, "(%s) && (%s)",
+			    get_filter_string(filter), filter_string);
+	ret = -E2BIG;
+	if (expected >= SECCOMP_MAX_FILTER_LENGTH || expected < 0)
+		goto out;
+
+	/* Free the old filter */
+	free_event_filter(filter);
+	set_seccomp_filter(filters, syscall_nr, id, NULL);
+
+	/* Replace it */
+	filter = alloc_event_filter(syscall_nr, merged);
+	if (IS_ERR(filter)) {
+		ret = PTR_ERR(filter);
+		goto out;
+	}
+	set_seccomp_filter(filters, syscall_nr, id, filter);
+	ret = 0;
+
+out:
+	kfree(merged);
+	return ret;
+}
+
+/**
+ * seccomp_add_filter - adds a filter for an unfiltered syscall
+ * @filters: filters object to add a filter/action to
+ * @syscall_nr: system call number to add a filter for
+ * @filter_string: the filter string to apply
+ *
+ * Returns 0 on success and non-zero otherwise.
+ */
+static int seccomp_add_filter(struct seccomp_filters *filters, int syscall_nr,
+			      char *filter_string)
+{
+	struct event_filter *filter;
+	int ret = 0;
+
+	/* A bare "1" is an unconditional allow and needs no event filter. */
+	if (!strcmp(SECCOMP_FILTER_ALLOW, filter_string)) {
+		set_seccomp_filter(filters, syscall_nr,
+				   SECCOMP_ACTION_ALLOW, NULL);
+		goto out;
+	}
+
+	/* ftrace events are not aware of CONFIG_COMPAT system calls and will
+	 * use the incorrect argument metadata if enabled.
+	 */
+	ret = -ENOSYS;
+	if (filters->flags.compat)
+		goto out;
+
+	/* A dynamic filter needs a slot.  With no slots, the index computed
+	 * below would wrap to SECCOMP_ACTION_DENY: the syscall would stay
+	 * denied, the new filter would be leaked and 0 would be returned.
+	 */
+	ret = -EINVAL;
+	if (!filters->count)
+		goto out;
+
+	ret = 0;
+	filter = alloc_event_filter(syscall_nr, filter_string);
+	if (IS_ERR(filter)) {
+		ret = PTR_ERR(filter);
+		goto out;
+	}
+	/* Always add to the last slot available since additions are
+	 * only done one at a time.
+	 */
+	set_seccomp_filter(filters, syscall_nr, filters->count - 1, filter);
+out:
+	return ret;
+}
+
+/* Wrap optional ftrace syscall support. Returns 1 on match or 0 otherwise. */
+static int filter_match_current(struct event_filter *event_filter)
+{
+	int err = 0;
+#ifdef CONFIG_FTRACE_SYSCALLS
+	uint8_t syscall_state[64];
+
+	memset(syscall_state, 0, sizeof(syscall_state));
+
+	/* The generic tracing entry can remain zeroed. */
+	err = ftrace_syscall_enter_state(syscall_state, sizeof(syscall_state),
+					 NULL);
+	if (err)
+		return 0;
+
+	err = filter_match_preds(event_filter, syscall_state);
+#endif
+	return err;
+}
+
+static const char *syscall_nr_to_name(int syscall)
+{
+	const char *syscall_name = "unknown";
+	struct syscall_metadata *data = syscall_nr_to_meta(syscall);
+	if (data)
+		syscall_name = data->name;
+	return syscall_name;
+}
+
+/* Encodes translation from sys_enter events to system call numbers.
+ * Returns -ENOSYS when the event doesn't match a system call or if
+ * current is_compat_task().  ftrace has no awareness of CONFIG_COMPAT
+ * yet.
+ */
+static int event_to_syscall_nr(int event_id)
+{
+	int nr, nosys = 1;
+#ifdef CONFIG_COMPAT
+	if (is_compat_task())
+		return -ENOSYS;
+#endif
+	for (nr = 0; nr < NR_syscalls; ++nr) {
+		struct syscall_metadata *data = syscall_nr_to_meta(nr);
+		if (!data)
+			continue;
+		nosys = 0;
+		if (data->enter_event->event.type == event_id)
+			return nr;
+	}
+	if (nosys)
+		return -ENOSYS;
+	return -EINVAL;
+}
+
+static void filters_set_compat(struct seccomp_filters *filters)
+{
+#ifdef CONFIG_COMPAT
+	if (is_compat_task())
+		filters->flags.compat = 1;
+#endif
+}
+
+static inline int filters_compat_mismatch(struct seccomp_filters *filters)
+{
+	int ret = 0;
+	if (!filters)
+		return 0;
+#ifdef CONFIG_COMPAT
+	if (!!(is_compat_task()) != filters->flags.compat)
+		ret = 1;
+#endif
+	return ret;
+}
+
+static inline int syscall_is_execve(int syscall)
+{
+	int nr = __NR_execve;
+#ifdef CONFIG_COMPAT
+	if (is_compat_task())
+		nr = __NR_seccomp_execve_32;
+#endif
+	return syscall == nr;
+}
+
+#ifndef KSTK_EIP
+#define KSTK_EIP(x) 0L
+#endif
+
+void seccomp_filter_log_failure(int syscall)
+{
+	pr_info("%s[%d]: system call %d (%s) blocked at 0x%lx\n",
+		current->comm, task_pid_nr(current), syscall,
+		syscall_nr_to_name(syscall), KSTK_EIP(current));
+}
+
+/* put_seccomp_filters - decrements the reference count of @orig and may free. */
+void put_seccomp_filters(struct seccomp_filters *orig)
+{
+	if (!orig)
+		return;
+
+	if (atomic_dec_and_test(&orig->usage))
+		__put_seccomp_filters(orig);
+}
+
+/* get_seccomp_filters - increments the reference count of @orig. */
+struct seccomp_filters *get_seccomp_filters(struct seccomp_filters *orig)
+{
+	int usage;
+	if (!orig)
+		return NULL;
+	usage = atomic_inc_return(&orig->usage);
+	/* Catch the pathological refcount overflow before it happens. */
+	BUG_ON(usage == INT_MAX);
+	return orig;
+}
+
+/**
+ * seccomp_test_filters - tests 'current' against the given syscall
+ * @syscall: number of the system call to test
+ *
+ * Returns 0 on ok and non-zero on error/failure.
+ */
+int seccomp_test_filters(int syscall)
+{
+	uint16_t id;
+	struct event_filter *filter;
+	struct seccomp_filters *filters;
+	int ret = -EACCES;
+
+	mutex_lock(&current->seccomp.filters_guard);
+	/* No reference counting is done. filters_guard should protect the
+	 * lifetime of any existing pointer below.
+	 */
+	filters = current->seccomp.filters;
+	if (!filters)
+		goto out;
+
+	if (filters_compat_mismatch(filters)) {
+		pr_info("%s[%d]: seccomp_filter compat() mismatch.\n",
+			current->comm, task_pid_nr(current));
+		goto out;
+	}
+
+	/* execve is never allowed. */
+	if (syscall_is_execve(syscall))
+		goto out;
+
+	ret = 0;
+	id = seccomp_filter_id(filters, syscall);
+	if (seccomp_filter_allow(id))
+		goto out;
+
+	ret = -EACCES;
+	if (!seccomp_filter_dynamic(id))
+		goto out;
+
+	filter = seccomp_dynamic_filter(filters, id);
+	if (filter && filter_match_current(filter))
+		ret = 0;
+out:
+	mutex_unlock(&current->seccomp.filters_guard);
+	return ret;
+}
+
+/**
+ * seccomp_show_filters - prints the current filter state to a seq_file
+ * @filters: properly get()'d filters object
+ * @m: the prepared seq_file to receive the data
+ *
+ * Returns 0 on a successful write.
+ */
+int seccomp_show_filters(struct seccomp_filters *filters, struct seq_file *m)
+{
+	int syscall;
+	if (!filters)
+		goto out;
+
+	for (syscall = 0; syscall < NR_syscalls; ++syscall) {
+		uint16_t id = seccomp_filter_id(filters, syscall);
+		const char *filter_string = SECCOMP_FILTER_ALLOW;
+		if (seccomp_filter_deny(id))
+			continue;
+		seq_printf(m, "%d (%s): ",
+			      syscall,
+			      syscall_nr_to_name(syscall));
+		if (seccomp_filter_dynamic(id))
+			filter_string = get_filter_string(
+					  seccomp_dynamic_filter(filters, id));
+		seq_printf(m, "%s\n", filter_string);
+	}
+out:
+	return 0;
+}
+EXPORT_SYMBOL_GPL(seccomp_show_filters);
+
+/**
+ * seccomp_get_filter - copies the filter_string into "buf"
+ * @syscall_nr: system call number to look up
+ * @buf: destination buffer
+ * @bufsize: available space in the buffer.
+ *
+ * Context: User context only. This function may sleep on allocation and
+ *          operates on current. current must be attempting a system call
+ *          when this is called.
+ *
+ * Looks up the filter for the given system call number on current.  If found,
+ * the string length of the NUL-terminated buffer is returned and < 0 is
+ * returned on error. The NUL byte is not included in the length.
+ */
+long seccomp_get_filter(int syscall_nr, char *buf, unsigned long bufsize)
+{
+	struct seccomp_filters *filters;
+	struct event_filter *filter;
+	long ret = -EINVAL;
+	uint16_t id;
+
+	if (bufsize > SECCOMP_MAX_FILTER_LENGTH)
+		bufsize = SECCOMP_MAX_FILTER_LENGTH;
+
+	mutex_lock(&current->seccomp.filters_guard);
+	filters = current->seccomp.filters;
+
+	if (!filters)
+		goto out;
+
+	ret = -ENOENT;
+	id = seccomp_filter_id(filters, syscall_nr);
+	if (seccomp_filter_deny(id))
+		goto out;
+
+	if (seccomp_filter_allow(id)) {
+		ret = strlcpy(buf, SECCOMP_FILTER_ALLOW, bufsize);
+		goto copied;
+	}
+
+	filter = seccomp_dynamic_filter(filters, id);
+	if (!filter)
+		goto out;
+	ret = strlcpy(buf, get_filter_string(filter), bufsize);
+
+copied:
+	if (ret >= bufsize) {
+		ret = -ENOSPC;
+		goto out;
+	}
+	/* Zero out any remaining buffer, just in case. */
+	memset(buf + ret, 0, bufsize - ret);
+out:
+	mutex_unlock(&current->seccomp.filters_guard);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(seccomp_get_filter);
+
+/**
+ * seccomp_clear_filter: clears the seccomp filter for a syscall.
+ * @syscall_nr: the system call number to clear filters for.
+ *
+ * Context: User context only. This function may sleep on allocation and
+ *          operates on current. current must be attempting a system call
+ *          when this is called.
+ *
+ * Returns 0 on success.
+ */
+long seccomp_clear_filter(int syscall_nr)
+{
+	struct seccomp_filters *filters = NULL, *orig_filters;
+	uint16_t id;
+	int ret = -EINVAL;
+
+	mutex_lock(&current->seccomp.filters_guard);
+	orig_filters = current->seccomp.filters;
+
+	if (!orig_filters)
+		goto out;
+
+	if (filters_compat_mismatch(orig_filters))
+		goto out;
+
+	/* An already-denied system call has nothing to clear. */
+	id = seccomp_filter_id(orig_filters, syscall_nr);
+	if (seccomp_filter_deny(id))
+		goto out;
+
+	/* Create a new filters object for the task.  Dropping a dynamic
+	 * filter frees up one event_filters slot; dropping an ALLOW does not.
+	 */
+	if (seccomp_filter_dynamic(id))
+		filters = seccomp_filters_new(orig_filters->count - 1);
+	else
+		filters = seccomp_filters_new(orig_filters->count);
+
+	if (IS_ERR(filters)) {
+		ret = PTR_ERR(filters);
+		/* put_seccomp_filters() only tolerates NULL, so never let
+		 * the out: path see an ERR_PTR.
+		 */
+		filters = NULL;
+		goto out;
+	}
+
+	/* Copy, but drop the requested entry. */
+	ret = seccomp_filters_copy(filters, orig_filters, syscall_nr);
+	if (ret)
+		goto out;
+	get_seccomp_filters(filters);  /* simplify the out: path */
+
+	current->seccomp.filters = filters;
+	put_seccomp_filters(orig_filters);  /* for the task */
+out:
+	put_seccomp_filters(filters);  /* for the extra get */
+	mutex_unlock(&current->seccomp.filters_guard);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(seccomp_clear_filter);
+
+/**
+ * seccomp_set_filter: - Adds/extends a seccomp filter for a syscall.
+ * @syscall_nr: system call number to apply the filter to.
+ * @filter: ftrace filter string to apply.
+ *
+ * Context: User context only. This function may sleep on allocation and
+ *          operates on current. current must be attempting a system call
+ *          when this is called.
+ *
+ * New filters may be added for system calls when the current task is
+ * not in a secure computing mode (seccomp).  Otherwise, existing filters may
+ * be extended.
+ *
+ * Returns 0 on success or an errno on failure.
+ */
+long seccomp_set_filter(int syscall_nr, char *filter)
+{
+	struct seccomp_filters *filters = NULL, *orig_filters = NULL;
+	uint16_t id;
+	long ret = -EINVAL;
+	uint16_t filters_needed;
+
+	mutex_lock(&current->seccomp.filters_guard);
+	if (!filter)
+		goto out;
+
+	filter = strstrip(filter);
+	/* Disallow empty strings. */
+	if (filter[0] == 0)
+		goto out;
+
+	orig_filters = current->seccomp.filters;
+
+	/* After the first call, compatibility mode is selected permanently. */
+	ret = -EACCES;
+	if (filters_compat_mismatch(orig_filters))
+		goto out;
+
+	/* A currently denied system call gets a brand new entry and needs
+	 * one more slot; anything else extends the entry it already has.
+	 */
+	filters_needed = orig_filters ? orig_filters->count : 0;
+	id = seccomp_filter_id(orig_filters, syscall_nr);
+	if (seccomp_filter_deny(id)) {
+		/* Don't allow DENYs to be changed when in a seccomp mode */
+		ret = -EACCES;
+		if (current->seccomp.mode)
+			goto out;
+		filters_needed++;
+	}
+
+	filters = seccomp_filters_new(filters_needed);
+	if (IS_ERR(filters)) {
+		ret = PTR_ERR(filters);
+		/* put_seccomp_filters() only tolerates NULL, so never let
+		 * the out: path see an ERR_PTR.
+		 */
+		filters = NULL;
+		goto out;
+	}
+
+	filters_set_compat(filters);
+	if (orig_filters) {
+		ret = seccomp_filters_copy(filters, orig_filters, -1);
+		if (ret)
+			goto out;
+	}
+
+	if (seccomp_filter_deny(id))
+		ret = seccomp_add_filter(filters, syscall_nr, filter);
+	else
+		ret = seccomp_extend_filter(filters, syscall_nr, filter);
+	if (ret)
+		goto out;
+	get_seccomp_filters(filters);  /* simplify the error paths */
+
+	current->seccomp.filters = filters;
+	put_seccomp_filters(orig_filters);  /* for the task */
+out:
+	put_seccomp_filters(filters);  /* for get or task, on err */
+	mutex_unlock(&current->seccomp.filters_guard);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(seccomp_set_filter);
+
+/* prctl_set_seccomp_filter - PR_SET_SECCOMP_FILTER entry point.
+ *
+ * Resolves @id (a system call number or an ftrace event id, depending on
+ * @id_type) to a system call number, copies the filter string in from
+ * userspace and hands it to seccomp_set_filter().
+ *
+ * Returns 0 on success or a negative errno: -EINVAL for a bad @id_type or
+ * @id, -EFAULT for a missing or unreadable @user_filter, -ENOMEM on
+ * allocation failure, -E2BIG when the string does not fit, or whatever
+ * event_to_syscall_nr() or seccomp_set_filter() returns.
+ */
+long prctl_set_seccomp_filter(unsigned long id_type,
+			      unsigned long id,
+			      char __user *user_filter)
+{
+	int nr;
+	long ret;
+	long len;
+	char *filter = NULL;
+
+	ret = -EINVAL;
+	if (id_type != PR_SECCOMP_FILTER_EVENT &&
+	    id_type != PR_SECCOMP_FILTER_SYSCALL)
+		goto out;
+
+	if (id > (unsigned long) INT_MAX)
+		goto out;
+	nr = (int) id;
+
+	if (id_type == PR_SECCOMP_FILTER_EVENT)
+		nr = event_to_syscall_nr(nr);
+
+	if (nr < 0) {
+		ret = nr;
+		goto out;
+	}
+
+	if (nr >= NR_syscalls)
+		goto out;
+
+	ret = -EFAULT;
+	if (!user_filter)
+		goto out;
+
+	filter = kzalloc(SECCOMP_MAX_FILTER_LENGTH + 1, GFP_KERNEL);
+	ret = -ENOMEM;
+	if (!filter)
+		goto out;
+
+	/* Ask for one byte more than the longest accepted string so that an
+	 * over-long filter is detected rather than silently truncated: a
+	 * truncated predicate could be a different policy than the caller
+	 * intended.  The zeroed buffer has room for this and still ends in
+	 * a NUL.
+	 */
+	len = strncpy_from_user(filter, user_filter,
+				SECCOMP_MAX_FILTER_LENGTH);
+	ret = -EFAULT;
+	if (len < 0)
+		goto out;
+	ret = -E2BIG;
+	if (len >= SECCOMP_MAX_FILTER_LENGTH)
+		goto out;
+
+	ret = seccomp_set_filter(nr, filter);
+
+out:
+	kfree(filter);
+	return ret;
+}
+
+long prctl_clear_seccomp_filter(unsigned long id_type, unsigned long id)
+{
+	int nr = -1;
+	long ret = -EINVAL;
+
+	if (id_type != PR_SECCOMP_FILTER_EVENT &&
+	    id_type != PR_SECCOMP_FILTER_SYSCALL)
+		goto out;
+
+	if (id > (unsigned long) INT_MAX)
+		goto out;
+
+	nr = (int) id;
+	if (id_type == PR_SECCOMP_FILTER_EVENT)
+		nr = event_to_syscall_nr(nr);
+
+	if (nr < 0) {
+		ret = nr;
+		goto out;
+	}
+
+	if (nr >= NR_syscalls)
+		goto out;
+
+	ret = seccomp_clear_filter(nr);
+
+out:
+	return ret;
+}
+
+/* prctl_get_seccomp_filter - PR_GET_SECCOMP_FILTER entry point.
+ *
+ * Resolves @id (a system call number or an ftrace event id, depending on
+ * @id_type) to a system call number and copies current's filter string for
+ * it, including the terminating NUL, to @dst.  At most @available bytes
+ * (capped at SECCOMP_MAX_FILTER_LENGTH) are used.
+ *
+ * Returns 0 on success or a negative errno: -EINVAL for bad arguments,
+ * -ENOMEM on allocation failure, -EFAULT if @dst cannot be written, or
+ * whatever event_to_syscall_nr() or seccomp_get_filter() returns.
+ */
+long prctl_get_seccomp_filter(unsigned long id_type, unsigned long id,
+			      char __user *dst, unsigned long available)
+{
+	unsigned long uncopied;
+	char *buf = NULL;
+	int nr, ret = -EINVAL;
+
+	if (!available)
+		goto out;
+
+	/* Ignore extra buffer space. */
+	if (available > SECCOMP_MAX_FILTER_LENGTH)
+		available = SECCOMP_MAX_FILTER_LENGTH;
+
+	if (id_type != PR_SECCOMP_FILTER_EVENT &&
+	    id_type != PR_SECCOMP_FILTER_SYSCALL)
+		goto out;
+
+	if (id > (unsigned long) INT_MAX)
+		goto out;
+	nr = (int) id;
+
+	if (id_type == PR_SECCOMP_FILTER_EVENT)
+		nr = event_to_syscall_nr(nr);
+
+	if (nr < 0) {
+		ret = nr;
+		goto out;
+	}
+
+	if (nr >= NR_syscalls)
+		goto out;
+
+	ret = -ENOMEM;
+	buf = kmalloc(available, GFP_KERNEL);
+	if (!buf)
+		goto out;
+
+	/* On success ret is the string length, strictly below available. */
+	ret = seccomp_get_filter(nr, buf, available);
+	if (ret < 0)
+		goto out;
+
+	/* Include the NUL byte in the copy.  copy_to_user() returns the
+	 * number of bytes it could NOT copy, so non-zero means @dst faulted.
+	 */
+	uncopied = copy_to_user(dst, buf, ret + 1);
+	ret = -EFAULT;
+	if (uncopied)
+		goto out;
+	ret = 0;
+out:
+	kfree(buf);
+	return ret;
+}
diff --git a/kernel/sys.c b/kernel/sys.c
index e4128b2..b9f9155 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1704,6 +1704,18 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 		case PR_SET_SECCOMP:
 			error = prctl_set_seccomp(arg2);
 			break;
+		case PR_SET_SECCOMP_FILTER:
+			error = prctl_set_seccomp_filter(arg2, arg3,
+							 (char __user *) arg4);
+			break;
+		case PR_CLEAR_SECCOMP_FILTER:
+			error = prctl_clear_seccomp_filter(arg2, arg3);
+			break;
+		case PR_GET_SECCOMP_FILTER:
+			error = prctl_get_seccomp_filter(arg2, arg3,
+							 (char __user *) arg4,
+							 arg5);
+			break;
 		case PR_GET_TSC:
 			error = GET_TSC_CTL(arg2);
 			break;
diff --git a/security/Kconfig b/security/Kconfig
index e0f08b5..2f29234 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -2,6 +2,10 @@
 # Security configuration
 #
 
+# Make seccomp filter Kconfig switch below available
+config HAVE_SECCOMP_FILTER
+       bool
+
 menu "Security options"
 
 config KEYS
@@ -82,6 +86,19 @@ config SECURITY_DMESG_RESTRICT
 
 	  If you are unsure how to answer this question, answer N.
 
+config SECCOMP_FILTER
+	bool "Enable seccomp-based system call filtering"
+	select SECCOMP
+	depends on HAVE_SECCOMP_FILTER && EXPERIMENTAL
+	help
+	  This kernel feature expands CONFIG_SECCOMP to allow computing
+	  in environments with reduced kernel access dictated by the
+	  application itself through prctl calls.  If
+	  CONFIG_FTRACE_SYSCALLS is available, then system call
+	  argument-based filtering predicates may be used.
+
+	  See Documentation/prctl/seccomp_filter.txt for more detail.
+
 config SECURITY
 	bool "Enable different security models"
 	depends on SYSFS
-- 
1.7.0.4


^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH v9 04/13] seccomp_filter: add process state reporting
  2011-06-24  0:36 [PATCH v9 01/13] tracing: split out filter initialization and clean up uses Will Drewry
  2011-06-24  0:36 ` [PATCH v9 02/13] tracing: split out syscall_trace_enter construction Will Drewry
  2011-06-24  0:36 ` [PATCH v9 03/13] seccomp_filter: new mode with configurable syscall filters Will Drewry
@ 2011-06-24  0:36 ` Will Drewry
  2011-06-24  0:36 ` [PATCH v9 05/13] seccomp_filter: Document what seccomp_filter is and how it works Will Drewry
                   ` (8 subsequent siblings)
  11 siblings, 0 replies; 51+ messages in thread
From: Will Drewry @ 2011-06-24  0:36 UTC (permalink / raw)
  To: linux-kernel
  Cc: torvalds, djm, segoon, kees.cook, mingo, rostedt, jmorris,
	fweisbec, tglx, scarybeasts, Will Drewry, Andrew Morton, Al Viro,
	David Rientjes, Stephen Wilson, KOSAKI Motohiro

Adds seccomp and seccomp_filter status reporting to proc.
/proc/<pid>/seccomp_filter provides the current seccomp mode
and the list of allowed or dynamically filtered system calls.

v9: rebase on to bccaeafd7c117acee36e90d37c7e05c19be9e7bf
v8: -
v7: emit seccomp mode directly
v6: -
v5: fix typos when mailing the wrong patch series
v4: move from rcu guard to mutex guard
v3: changed to using filters directly.
v2: removed status entry, added seccomp file.
    (requested by kosaki.motohiro@jp.fujitsu.com)
    allowed S_IRUGO reading of entries
    (requested by viro@zeniv.linux.org.uk)
    added flags
    got rid of the seccomp_t type
    dropped seccomp file

Signed-off-by: Will Drewry <wad@chromium.org>
---
 fs/proc/base.c |   31 +++++++++++++++++++++++++++++++
 1 files changed, 31 insertions(+), 0 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 8a84210..52e1314 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -73,6 +73,7 @@
 #include <linux/security.h>
 #include <linux/ptrace.h>
 #include <linux/tracehook.h>
+#include <linux/seccomp.h>
 #include <linux/cgroup.h>
 #include <linux/cpuset.h>
 #include <linux/audit.h>
@@ -582,6 +583,30 @@ static int proc_pid_syscall(struct task_struct *task, char *buffer)
 }
 #endif /* CONFIG_HAVE_ARCH_TRACEHOOK */
 
+/*
+ * Print out the current seccomp filter set for the task.
+ */
+#ifdef CONFIG_SECCOMP_FILTER
+int proc_pid_seccomp_filter_show(struct seq_file *m, struct pid_namespace *ns,
+				 struct pid *pid, struct task_struct *task)
+{
+	struct seccomp_filters *filters;
+	int err;
+
+	seq_printf(m, "Mode: %d\n", task->seccomp.mode);
+	/* Avoid allowing other processes to incur too much added contention by
+	 * only acquiring a reference under the task-wide mutex.
+	 */
+	err = mutex_lock_killable(&task->seccomp.filters_guard);
+	if (err)
+		return err;  /* report the real reason rather than a bare -1 */
+	filters = get_seccomp_filters(task->seccomp.filters);
+	mutex_unlock(&task->seccomp.filters_guard);
+
+	/* The reference taken above keeps the filters alive while printing. */
+	seccomp_show_filters(filters, m);
+	put_seccomp_filters(filters);
+	return 0;
+}
+#endif /* CONFIG_SECCOMP_FILTER */
+
 /************************************************************************/
 /*                       Here the fs part begins                        */
 /************************************************************************/
@@ -2785,6 +2810,9 @@ static const struct pid_entry tgid_base_stuff[] = {
 #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
 	INF("syscall",    S_IRUGO, proc_pid_syscall),
 #endif
+#ifdef CONFIG_SECCOMP_FILTER
+	ONE("seccomp_filter",     S_IRUGO, proc_pid_seccomp_filter_show),
+#endif
 	INF("cmdline",    S_IRUGO, proc_pid_cmdline),
 	ONE("stat",       S_IRUGO, proc_tgid_stat),
 	ONE("statm",      S_IRUGO, proc_pid_statm),
@@ -3131,6 +3159,9 @@ static const struct pid_entry tid_base_stuff[] = {
 #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
 	INF("syscall",   S_IRUGO, proc_pid_syscall),
 #endif
+#ifdef CONFIG_SECCOMP_FILTER
+	ONE("seccomp_filter",     S_IRUGO, proc_pid_seccomp_filter_show),
+#endif
 	INF("cmdline",   S_IRUGO, proc_pid_cmdline),
 	ONE("stat",      S_IRUGO, proc_tid_stat),
 	ONE("statm",     S_IRUGO, proc_pid_statm),
-- 
1.7.0.4


^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH v9 05/13] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-06-24  0:36 [PATCH v9 01/13] tracing: split out filter initialization and clean up uses Will Drewry
                   ` (2 preceding siblings ...)
  2011-06-24  0:36 ` [PATCH v9 04/13] seccomp_filter: add process state reporting Will Drewry
@ 2011-06-24  0:36 ` Will Drewry
  2011-06-24  7:24   ` Chris Evans
       [not found]   ` <BANLkTimtYUyXbZjWhjK61B_1WBXE4MoAeA@mail.gmail.com>
  2011-06-24  0:36 ` [PATCH v9 06/13] x86: add HAVE_SECCOMP_FILTER and seccomp_execve Will Drewry
                   ` (7 subsequent siblings)
  11 siblings, 2 replies; 51+ messages in thread
From: Will Drewry @ 2011-06-24  0:36 UTC (permalink / raw)
  To: linux-kernel
  Cc: torvalds, djm, segoon, kees.cook, mingo, rostedt, jmorris,
	fweisbec, tglx, scarybeasts, Will Drewry, Randy Dunlap,
	linux-doc

Adds a text file covering what CONFIG_SECCOMP_FILTER is, how it is
implemented presently, and what it may be used for.  In addition,
the limitations and caveats of the proposed implementation are
included.

v9: rebase on to bccaeafd7c117acee36e90d37c7e05c19be9e7bf
v8: -
v7: Add a caveat around fork behavior and execve
v6: -
v5: -
v4: rewording (courtesy kees.cook@canonical.com)
    reflect support for event ids
    add a small section on adding per-arch support
v3: a little more cleanup
v2: moved to prctl/
    updated for the v2 syntax.
    adds a note about compat behavior

Signed-off-by: Will Drewry <wad@chromium.org>
---
 Documentation/prctl/seccomp_filter.txt |  189 ++++++++++++++++++++++++++++++++
 1 files changed, 189 insertions(+), 0 deletions(-)
 create mode 100644 Documentation/prctl/seccomp_filter.txt

diff --git a/Documentation/prctl/seccomp_filter.txt b/Documentation/prctl/seccomp_filter.txt
new file mode 100644
index 0000000..a9cddc2
--- /dev/null
+++ b/Documentation/prctl/seccomp_filter.txt
@@ -0,0 +1,189 @@
+		Seccomp filtering
+		=================
+
+Introduction
+------------
+
+A large number of system calls are exposed to every userland process
+with many of them going unused for the entire lifetime of the process.
+As system calls change and mature, bugs are found and eradicated.  A
+certain subset of userland applications benefit by having a reduced set
+of available system calls.  The resulting set reduces the total kernel
+surface exposed to the application.  System call filtering is meant for
+use with those applications.
+
+The implementation currently leverages both the existing seccomp
+infrastructure and the kernel tracing infrastructure.  By centralizing
+hooks for attack surface reduction in seccomp, it is possible to assure
+attention to security that is less relevant in normal ftrace scenarios,
+such as time-of-check, time-of-use attacks.  However, ftrace provides a
+rich, human-friendly environment for interfacing with system call
+specific arguments.  (As such, this requires FTRACE_SYSCALLS for any
+introspective filtering support.)
+
+
+What it isn't
+-------------
+
+System call filtering isn't a sandbox.  It provides a clearly defined
+mechanism for minimizing the exposed kernel surface.  Beyond that,
+policy for logical behavior and information flow should be managed with
+a combination of other system hardening techniques and, potentially, a
+LSM of your choosing.  Expressive, dynamic filters based on the ftrace
+filter engine provide further options down this path (avoiding
+pathological sizes or selecting which of the multiplexed system calls in
+socketcall() is allowed, for instance) which could be construed,
+incorrectly, as a more complete sandboxing solution.
+
+
+Usage
+-----
+
+An additional seccomp mode is exposed through mode '2'.
+This mode depends on CONFIG_SECCOMP_FILTER.  By default, it provides
+only the most trivial filter support: "1" or cleared.  However, if
+CONFIG_FTRACE_SYSCALLS is enabled, the ftrace filter engine may be used
+for more expressive filters.
+
+A collection of filters may be supplied via prctl, and the current set
+of filters is exposed in /proc/<pid>/seccomp_filter.
+
+Interacting with seccomp filters can be done through three new prctl calls
+and one existing one.
+
+PR_SET_SECCOMP:
+	A pre-existing option for enabling strict seccomp mode (1) or
+	filtering seccomp (2).
+
+	Usage:
+		prctl(PR_SET_SECCOMP, 1);  /* strict */
+		prctl(PR_SET_SECCOMP, 2);  /* filters */
+
+PR_SET_SECCOMP_FILTER:
+	Allows the specification of a new filter for a given system
+	call, by number, and filter string.  By default, the filter
+	string may only be "1".  However, if CONFIG_FTRACE_SYSCALLS is
+	supported, the filter string may make use of the ftrace
+	filtering language's awareness of system call arguments.
+
+	In addition, the event id for the system call entry may be
+	specified in lieu of the system call number itself, as
+	determined by the 'type' argument.  This allows for the future
+	addition of seccomp-based filtering on other registered,
+	relevant ftrace events.
+
+	All calls to PR_SET_SECCOMP_FILTER for a given system
+	call will append the supplied string to any existing filters.
+	Filter construction looks as follows:
+		(Nothing) + "fd == 1 || fd == 2" => fd == 1 || fd == 2
+		... + "fd != 2" => (fd == 1 || fd == 2) && fd != 2
+		... + "size < 100" =>
+			((fd == 1 || fd == 2) && fd != 2) && size < 100
+	If there is no filter and the seccomp mode has already
+	transitioned to filtering, additions cannot be made.  Filters
+	may only be added that reduce the available kernel surface.
+
+	Usage (per the construction example above):
+		unsigned long type = PR_SECCOMP_FILTER_SYSCALL;
+		prctl(PR_SET_SECCOMP_FILTER, type, __NR_write,
+			"fd == 1 || fd == 2");
+		prctl(PR_SET_SECCOMP_FILTER, type, __NR_write,
+			"fd != 2");
+		prctl(PR_SET_SECCOMP_FILTER, type, __NR_write,
+			"size < 100");
+
+	The 'type' argument may be one of PR_SECCOMP_FILTER_SYSCALL or
+	PR_SECCOMP_FILTER_EVENT.
+
+PR_CLEAR_SECCOMP_FILTER:
+	Removes all filter entries for a given system call number or
+	event id.  When called prior to entering seccomp filtering mode,
+	it allows for new filters to be applied to the same system call.
+	After transition, however, it completely drops access to the
+	call.
+
+	Usage:
+		prctl(PR_CLEAR_SECCOMP_FILTER,
+			PR_SECCOMP_FILTER_SYSCALL, __NR_open);
+
+PR_GET_SECCOMP_FILTER:
+	Returns the aggregated filter string for a system call into a
+	user-supplied buffer of a given length.
+
+	Usage:
+		prctl(PR_GET_SECCOMP_FILTER,
+			PR_SECCOMP_FILTER_SYSCALL, __NR_write, buf,
+			sizeof(buf));
+
+All of the above calls return 0 on success and non-zero on error.  If
+CONFIG_FTRACE_SYSCALLS is not supported and a rich-filter was specified,
+the caller may check the errno for -ENOSYS.  The same is true if
+specifying a filter by the event id fails to discover any relevant
+event entries.
+
+
+Example
+-------
+
+Assume a process would like to cleanly read and write to stdin/out/err
+as well as access its filters after seccomp enforcement begins.  This
+may be done as follows:
+
+  int filter_syscall(int nr, char *buf) {
+    return prctl(PR_SET_SECCOMP_FILTER, PR_SECCOMP_FILTER_SYSCALL,
+                 nr, buf);
+  }
+
+  filter_syscall(__NR_read, "fd == 0");
+  filter_syscall(__NR_write, "fd == 1 || fd == 2");
+  filter_syscall(__NR_exit, "1");
+  filter_syscall(__NR_prctl, "1");
+  prctl(PR_SET_SECCOMP, 2);
+
+  /* Do stuff with fdset . . .*/
+
+  /* Drop read access and keep only write access to fd 1. */
+  prctl(PR_CLEAR_SECCOMP_FILTER, PR_SECCOMP_FILTER_SYSCALL, __NR_read);
+  filter_syscall(__NR_write, "fd != 2");
+
+  /* Perform any final processing . . . */
+  syscall(__NR_exit, 0);
+
+
+Caveats
+-------
+
+- Avoid using a filter of "0" to disable a filter.  Always favor calling
+  prctl(PR_CLEAR_SECCOMP_FILTER, ...).  Otherwise the behavior may vary
+  depending on if CONFIG_FTRACE_SYSCALLS support exists -- though an
+  error will be returned if the support is missing.
+
+- execve is always blocked.  seccomp filters may not cross that boundary.
+
+- Filters can be inherited across fork/clone, but only when they are
+  active (i.e., PR_SET_SECCOMP has been set to 2), not prior to use.
+  This stops the parent process from adding filters that may undermine
+  the child process security or create unexpected behavior after an
+  execve.
+
+- Some platforms support a 32-bit userspace with 64-bit kernels.  In
+  these cases (CONFIG_COMPAT), system call numbers may not match across
+  64-bit and 32-bit system calls. When the first PR_SET_SECCOMP_FILTER
+  is called, the in-memory filters state is annotated with whether the
+  call has been made via the compat interface.  All subsequent calls will
+  be checked for compat call mismatch.  In the long run, it may make sense
+  to store compat and non-compat filters separately, but that is not
+  supported at present. Once one type of system call interface has been
+  used, it must continue to be used.
+
+
+Adding architecture support
+---------------------------
+
+Any platform with seccomp support should be able to support the bare
+minimum of seccomp filter features.  However, since seccomp_filter
+requires that execve be blocked, it expects the architecture to expose a
+__NR_seccomp_execve define that maps to the execve system call number.
+On platforms where CONFIG_COMPAT applies, __NR_seccomp_execve_32 must
+also be provided.  Once those macros exist, "select HAVE_SECCOMP_FILTER"
+support may be added to the architecture's Kconfig.
-- 
1.7.0.4


^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH v9 06/13] x86: add HAVE_SECCOMP_FILTER and seccomp_execve
  2011-06-24  0:36 [PATCH v9 01/13] tracing: split out filter initialization and clean up uses Will Drewry
                   ` (3 preceding siblings ...)
  2011-06-24  0:36 ` [PATCH v9 05/13] seccomp_filter: Document what seccomp_filter is and how it works Will Drewry
@ 2011-06-24  0:36 ` Will Drewry
  2011-06-24  0:36   ` Will Drewry
                   ` (6 subsequent siblings)
  11 siblings, 0 replies; 51+ messages in thread
From: Will Drewry @ 2011-06-24  0:36 UTC (permalink / raw)
  To: linux-kernel
  Cc: torvalds, djm, segoon, kees.cook, mingo, rostedt, jmorris,
	fweisbec, tglx, scarybeasts, Will Drewry, Ingo Molnar,
	H. Peter Anvin, x86

Adds support to the x86 architecture by providing a compatibility
mode wrapper for sys_execve's number and selecting HAVE_SECCOMP_FILTER.

v9: rebase on to bccaeafd7c117acee36e90d37c7e05c19be9e7bf

Signed-off-by: Will Drewry <wad@chromium.org>
---
 arch/x86/Kconfig                   |    1 +
 arch/x86/include/asm/ia32_unistd.h |    1 +
 arch/x86/include/asm/seccomp_64.h  |    2 ++
 3 files changed, 4 insertions(+), 0 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index da34972..c3b7d3b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -63,6 +63,7 @@ config X86
 	select HAVE_TEXT_POKE_SMP
 	select HAVE_GENERIC_HARDIRQS
 	select HAVE_SPARSE_IRQ
+	select HAVE_SECCOMP_FILTER
 	select GENERIC_FIND_FIRST_BIT
 	select GENERIC_IRQ_PROBE
 	select GENERIC_PENDING_IRQ if SMP
diff --git a/arch/x86/include/asm/ia32_unistd.h b/arch/x86/include/asm/ia32_unistd.h
index 976f6ec..8ed2922 100644
--- a/arch/x86/include/asm/ia32_unistd.h
+++ b/arch/x86/include/asm/ia32_unistd.h
@@ -12,6 +12,7 @@
 #define __NR_ia32_exit		  1
 #define __NR_ia32_read		  3
 #define __NR_ia32_write		  4
+#define __NR_ia32_execve	 11
 #define __NR_ia32_sigreturn	119
 #define __NR_ia32_rt_sigreturn	173
 
diff --git a/arch/x86/include/asm/seccomp_64.h b/arch/x86/include/asm/seccomp_64.h
index 84ec1bd..85c4219 100644
--- a/arch/x86/include/asm/seccomp_64.h
+++ b/arch/x86/include/asm/seccomp_64.h
@@ -8,10 +8,12 @@
 #define __NR_seccomp_write __NR_write
 #define __NR_seccomp_exit __NR_exit
 #define __NR_seccomp_sigreturn __NR_rt_sigreturn
+#define __NR_seccomp_execve __NR_execve
 
 #define __NR_seccomp_read_32 __NR_ia32_read
 #define __NR_seccomp_write_32 __NR_ia32_write
 #define __NR_seccomp_exit_32 __NR_ia32_exit
 #define __NR_seccomp_sigreturn_32 __NR_ia32_sigreturn
+#define __NR_seccomp_execve_32 __NR_ia32_execve
 
 #endif /* _ASM_X86_SECCOMP_64_H */
-- 
1.7.0.4


^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH v9 07/13] arm: select HAVE_SECCOMP_FILTER
  2011-06-24  0:36 [PATCH v9 01/13] tracing: split out filter initialization and clean up uses Will Drewry
@ 2011-06-24  0:36   ` Will Drewry
  2011-06-24  0:36 ` [PATCH v9 03/13] seccomp_filter: new mode with configurable syscall filters Will Drewry
                     ` (10 subsequent siblings)
  11 siblings, 0 replies; 51+ messages in thread
From: Will Drewry @ 2011-06-24  0:36 UTC (permalink / raw)
  To: linux-kernel
  Cc: torvalds, djm, segoon, kees.cook, mingo, rostedt, jmorris,
	fweisbec, tglx, scarybeasts, Will Drewry, Russell King,
	linux-arm-kernel

Enable support for CONFIG_SECCOMP_FILTER by selecting HAVE_SECCOMP_FILTER by
default.

v9: rebase on to bccaeafd7c117acee36e90d37c7e05c19be9e7bf

Signed-off-by: Will Drewry <wad@chromium.org>
---
 arch/arm/Kconfig |    1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 9adc278..537ad16 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -16,6 +16,7 @@ config ARM
 	select HAVE_FTRACE_MCOUNT_RECORD if (!XIP_KERNEL)
 	select HAVE_DYNAMIC_FTRACE if (!XIP_KERNEL)
 	select HAVE_FUNCTION_GRAPH_TRACER if (!THUMB2_KERNEL)
+	select HAVE_SECCOMP_FILTER
 	select HAVE_GENERIC_DMA_COHERENT
 	select HAVE_KERNEL_GZIP
 	select HAVE_KERNEL_LZO
-- 
1.7.0.4


^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH v9 07/13] arm: select HAVE_SECCOMP_FILTER
@ 2011-06-24  0:36   ` Will Drewry
  0 siblings, 0 replies; 51+ messages in thread
From: Will Drewry @ 2011-06-24  0:36 UTC (permalink / raw)
  To: linux-arm-kernel

Enable support for CONFIG_SECCOMP_FILTER by selecting HAVE_SECCOMP_FILTER by
default.

v9: rebase on to bccaeafd7c117acee36e90d37c7e05c19be9e7bf

Signed-off-by: Will Drewry <wad@chromium.org>
---
 arch/arm/Kconfig |    1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 9adc278..537ad16 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -16,6 +16,7 @@ config ARM
 	select HAVE_FTRACE_MCOUNT_RECORD if (!XIP_KERNEL)
 	select HAVE_DYNAMIC_FTRACE if (!XIP_KERNEL)
 	select HAVE_FUNCTION_GRAPH_TRACER if (!THUMB2_KERNEL)
+	select HAVE_SECCOMP_FILTER
 	select HAVE_GENERIC_DMA_COHERENT
 	select HAVE_KERNEL_GZIP
 	select HAVE_KERNEL_LZO
-- 
1.7.0.4

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH v9 08/13] microblaze: select HAVE_SECCOMP_FILTER and provide seccomp_execve
  2011-06-24  0:36 [PATCH v9 01/13] tracing: split out filter initialization and clean up uses Will Drewry
                   ` (5 preceding siblings ...)
  2011-06-24  0:36   ` Will Drewry
@ 2011-06-24  0:36 ` Will Drewry
  2011-06-24  0:36 ` [PATCH v9 09/13] mips: " Will Drewry
                   ` (4 subsequent siblings)
  11 siblings, 0 replies; 51+ messages in thread
From: Will Drewry @ 2011-06-24  0:36 UTC (permalink / raw)
  To: linux-kernel
  Cc: torvalds, djm, segoon, kees.cook, mingo, rostedt, jmorris,
	fweisbec, tglx, scarybeasts, Will Drewry, Michal Simek,
	microblaze-uclinux

Facilitate the use of CONFIG_SECCOMP_FILTER by wrapping compatibility
system call numbering for execve and selecting HAVE_SECCOMP_FILTER.

v9: rebase on to bccaeafd7c117acee36e90d37c7e05c19be9e7bf

Signed-off-by: Will Drewry <wad@chromium.org>
---
 arch/microblaze/Kconfig               |    1 +
 arch/microblaze/include/asm/seccomp.h |    2 ++
 2 files changed, 3 insertions(+), 0 deletions(-)

diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig
index e446bab..dd570cc 100644
--- a/arch/microblaze/Kconfig
+++ b/arch/microblaze/Kconfig
@@ -1,6 +1,7 @@
 config MICROBLAZE
 	def_bool y
 	select HAVE_MEMBLOCK
+	select HAVE_SECCOMP_FILTER
 	select HAVE_FUNCTION_TRACER
 	select HAVE_FUNCTION_TRACE_MCOUNT_TEST
 	select HAVE_FUNCTION_GRAPH_TRACER
diff --git a/arch/microblaze/include/asm/seccomp.h b/arch/microblaze/include/asm/seccomp.h
index 0d91275..0e38eed 100644
--- a/arch/microblaze/include/asm/seccomp.h
+++ b/arch/microblaze/include/asm/seccomp.h
@@ -7,10 +7,12 @@
 #define __NR_seccomp_write		__NR_write
 #define __NR_seccomp_exit		__NR_exit
 #define __NR_seccomp_sigreturn		__NR_sigreturn
+#define __NR_seccomp_execve		__NR_execve
 
 #define __NR_seccomp_read_32		__NR_read
 #define __NR_seccomp_write_32		__NR_write
 #define __NR_seccomp_exit_32		__NR_exit
 #define __NR_seccomp_sigreturn_32	__NR_sigreturn
+#define __NR_seccomp_execve_32		__NR_execve
 
 #endif	/* _ASM_MICROBLAZE_SECCOMP_H */
-- 
1.7.0.4


^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH v9 09/13] mips: select HAVE_SECCOMP_FILTER and provide seccomp_execve
  2011-06-24  0:36 [PATCH v9 01/13] tracing: split out filter initialization and clean up uses Will Drewry
                   ` (6 preceding siblings ...)
  2011-06-24  0:36 ` [PATCH v9 08/13] microblaze: select HAVE_SECCOMP_FILTER and provide seccomp_execve Will Drewry
@ 2011-06-24  0:36 ` Will Drewry
  2011-06-24  0:36 ` [PATCH v9 10/13] s390: " Will Drewry
                   ` (3 subsequent siblings)
  11 siblings, 0 replies; 51+ messages in thread
From: Will Drewry @ 2011-06-24  0:36 UTC (permalink / raw)
  To: linux-kernel
  Cc: torvalds, djm, segoon, kees.cook, mingo, rostedt, jmorris,
	fweisbec, tglx, scarybeasts, Will Drewry, Ralf Baechle,
	linux-mips

Facilitate the use of CONFIG_SECCOMP_FILTER by wrapping compatibility
system call numbering for execve and selecting HAVE_SECCOMP_FILTER.

v9: rebase on to bccaeafd7c117acee36e90d37c7e05c19be9e7bf

Signed-off-by: Will Drewry <wad@chromium.org>
---
 arch/mips/Kconfig               |    1 +
 arch/mips/include/asm/seccomp.h |    3 +++
 2 files changed, 4 insertions(+), 0 deletions(-)

diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 653da62..29b0848 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -10,6 +10,7 @@ config MIPS
 	select HAVE_ARCH_KGDB
 	select HAVE_FUNCTION_TRACER
 	select HAVE_FUNCTION_TRACE_MCOUNT_TEST
+	select HAVE_SECCOMP_FILTER
 	select HAVE_DYNAMIC_FTRACE
 	select HAVE_FTRACE_MCOUNT_RECORD
 	select HAVE_C_RECORDMCOUNT
diff --git a/arch/mips/include/asm/seccomp.h b/arch/mips/include/asm/seccomp.h
index ae6306e..4014a3a 100644
--- a/arch/mips/include/asm/seccomp.h
+++ b/arch/mips/include/asm/seccomp.h
@@ -6,6 +6,7 @@
 #define __NR_seccomp_write __NR_write
 #define __NR_seccomp_exit __NR_exit
 #define __NR_seccomp_sigreturn __NR_rt_sigreturn
+#define __NR_seccomp_execve __NR_execve
 
 /*
  * Kludge alert:
@@ -19,6 +20,7 @@
 #define __NR_seccomp_write_32		4004
 #define __NR_seccomp_exit_32		4001
 #define __NR_seccomp_sigreturn_32	4193	/* rt_sigreturn */
+#define __NR_seccomp_execve_32		4011
 
 #elif defined(CONFIG_MIPS32_N32)
 
@@ -26,6 +28,7 @@
 #define __NR_seccomp_write_32		6001
 #define __NR_seccomp_exit_32		6058
 #define __NR_seccomp_sigreturn_32	6211	/* rt_sigreturn */
+#define __NR_seccomp_execve_32		6057
 
 #endif /* CONFIG_MIPS32_O32 */
 
-- 
1.7.0.4


^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH v9 10/13] s390: select HAVE_SECCOMP_FILTER and provide seccomp_execve
  2011-06-24  0:36 [PATCH v9 01/13] tracing: split out filter initialization and clean up uses Will Drewry
                   ` (7 preceding siblings ...)
  2011-06-24  0:36 ` [PATCH v9 09/13] mips: " Will Drewry
@ 2011-06-24  0:36 ` Will Drewry
  2011-06-24  0:36   ` Will Drewry
                   ` (2 subsequent siblings)
  11 siblings, 0 replies; 51+ messages in thread
From: Will Drewry @ 2011-06-24  0:36 UTC (permalink / raw)
  To: linux-kernel
  Cc: torvalds, djm, segoon, kees.cook, mingo, rostedt, jmorris,
	fweisbec, tglx, scarybeasts, Will Drewry, Martin Schwidefsky,
	Heiko Carstens, linux390, linux-s390

Facilitate the use of CONFIG_SECCOMP_FILTER by wrapping compatibility
system call numbering for execve and selecting HAVE_SECCOMP_FILTER.

v9: rebase on to bccaeafd7c117acee36e90d37c7e05c19be9e7bf

Signed-off-by: Will Drewry <wad@chromium.org>
---
 arch/s390/Kconfig               |    1 +
 arch/s390/include/asm/seccomp.h |    3 ++-
 2 files changed, 3 insertions(+), 1 deletions(-)

diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 90d77bd..327d16a 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -64,6 +64,7 @@ config ARCH_SUPPORTS_DEBUG_PAGEALLOC
 config S390
 	def_bool y
 	select USE_GENERIC_SMP_HELPERS if SMP
+	select HAVE_SECCOMP_FILTER
 	select HAVE_SYSCALL_WRAPPERS
 	select HAVE_FUNCTION_TRACER
 	select HAVE_FUNCTION_TRACE_MCOUNT_TEST
diff --git a/arch/s390/include/asm/seccomp.h b/arch/s390/include/asm/seccomp.h
index 781a9cf..e5792f5 100644
--- a/arch/s390/include/asm/seccomp.h
+++ b/arch/s390/include/asm/seccomp.h
@@ -7,10 +7,11 @@
 #define __NR_seccomp_write	__NR_write
 #define __NR_seccomp_exit	__NR_exit
 #define __NR_seccomp_sigreturn	__NR_sigreturn
+#define __NR_seccomp_execve	__NR_execve
 
 #define __NR_seccomp_read_32	__NR_read
 #define __NR_seccomp_write_32	__NR_write
 #define __NR_seccomp_exit_32	__NR_exit
-#define __NR_seccomp_sigreturn_32 __NR_sigreturn
+#define __NR_seccomp_execve_32	__NR_execve
 
 #endif	/* _ASM_S390_SECCOMP_H */
-- 
1.7.0.4


^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH v9 11/13] powerpc: select HAVE_SECCOMP_FILTER and provide seccomp_execve
  2011-06-24  0:36 [PATCH v9 01/13] tracing: split out filter initialization and clean up uses Will Drewry
@ 2011-06-24  0:36   ` Will Drewry
  2011-06-24  0:36 ` [PATCH v9 03/13] seccomp_filter: new mode with configurable syscall filters Will Drewry
                     ` (10 subsequent siblings)
  11 siblings, 0 replies; 51+ messages in thread
From: Will Drewry @ 2011-06-24  0:36 UTC (permalink / raw)
  To: linux-kernel
  Cc: torvalds, djm, segoon, kees.cook, mingo, rostedt, jmorris,
	fweisbec, tglx, scarybeasts, Will Drewry, Benjamin Herrenschmidt,
	Paul Mackerras, linuxppc-dev

Facilitate the use of CONFIG_SECCOMP_FILTER by wrapping compatibility
system call numbering for execve and selecting HAVE_SECCOMP_FILTER.

v9: rebase on to bccaeafd7c117acee36e90d37c7e05c19be9e7bf

Signed-off-by: Will Drewry <wad@chromium.org>
---
 arch/powerpc/Kconfig               |    1 +
 arch/powerpc/include/asm/seccomp.h |    2 ++
 2 files changed, 3 insertions(+), 0 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 2729c66..030d392 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -129,6 +129,7 @@ config PPC
 	select HAVE_HW_BREAKPOINT if PERF_EVENTS && PPC_BOOK3S_64
 	select HAVE_GENERIC_HARDIRQS
 	select HAVE_SPARSE_IRQ
+	select HAVE_SECCOMP_FILTER
 	select IRQ_PER_CPU
 	select GENERIC_IRQ_SHOW
 	select GENERIC_IRQ_SHOW_LEVEL
diff --git a/arch/powerpc/include/asm/seccomp.h b/arch/powerpc/include/asm/seccomp.h
index 00c1d91..3cb9cc1 100644
--- a/arch/powerpc/include/asm/seccomp.h
+++ b/arch/powerpc/include/asm/seccomp.h
@@ -7,10 +7,12 @@
 #define __NR_seccomp_write __NR_write
 #define __NR_seccomp_exit __NR_exit
 #define __NR_seccomp_sigreturn __NR_rt_sigreturn
+#define __NR_seccomp_execve __NR_execve
 
 #define __NR_seccomp_read_32 __NR_read
 #define __NR_seccomp_write_32 __NR_write
 #define __NR_seccomp_exit_32 __NR_exit
 #define __NR_seccomp_sigreturn_32 __NR_sigreturn
+#define __NR_seccomp_execve_32 __NR_execve
 
 #endif	/* _ASM_POWERPC_SECCOMP_H */
-- 
1.7.0.4


^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH v9 11/13] powerpc: select HAVE_SECCOMP_FILTER and provide seccomp_execve
@ 2011-06-24  0:36   ` Will Drewry
  0 siblings, 0 replies; 51+ messages in thread
From: Will Drewry @ 2011-06-24  0:36 UTC (permalink / raw)
  To: linux-kernel
  Cc: linuxppc-dev, Will Drewry, fweisbec, scarybeasts, djm, jmorris,
	rostedt, tglx, mingo, Paul Mackerras, kees.cook, torvalds,
	segoon

Facilitate the use of CONFIG_SECCOMP_FILTER by wrapping compatibility
system call numbering for execve and selecting HAVE_SECCOMP_FILTER.

v9: rebase on to bccaeafd7c117acee36e90d37c7e05c19be9e7bf

Signed-off-by: Will Drewry <wad@chromium.org>
---
 arch/powerpc/Kconfig               |    1 +
 arch/powerpc/include/asm/seccomp.h |    2 ++
 2 files changed, 3 insertions(+), 0 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 2729c66..030d392 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -129,6 +129,7 @@ config PPC
 	select HAVE_HW_BREAKPOINT if PERF_EVENTS && PPC_BOOK3S_64
 	select HAVE_GENERIC_HARDIRQS
 	select HAVE_SPARSE_IRQ
+	select HAVE_SECCOMP_FILTER
 	select IRQ_PER_CPU
 	select GENERIC_IRQ_SHOW
 	select GENERIC_IRQ_SHOW_LEVEL
diff --git a/arch/powerpc/include/asm/seccomp.h b/arch/powerpc/include/asm/seccomp.h
index 00c1d91..3cb9cc1 100644
--- a/arch/powerpc/include/asm/seccomp.h
+++ b/arch/powerpc/include/asm/seccomp.h
@@ -7,10 +7,12 @@
 #define __NR_seccomp_write __NR_write
 #define __NR_seccomp_exit __NR_exit
 #define __NR_seccomp_sigreturn __NR_rt_sigreturn
+#define __NR_seccomp_execve __NR_execve
 
 #define __NR_seccomp_read_32 __NR_read
 #define __NR_seccomp_write_32 __NR_write
 #define __NR_seccomp_exit_32 __NR_exit
 #define __NR_seccomp_sigreturn_32 __NR_sigreturn
+#define __NR_seccomp_execve_32 __NR_execve
 
 #endif	/* _ASM_POWERPC_SECCOMP_H */
-- 
1.7.0.4

^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH v9 12/13] sparc: select HAVE_SECCOMP_FILTER and provide seccomp_execve
  2011-06-24  0:36 [PATCH v9 01/13] tracing: split out filter initialization and clean up uses Will Drewry
@ 2011-06-24  0:36   ` Will Drewry
  2011-06-24  0:36 ` [PATCH v9 03/13] seccomp_filter: new mode with configurable syscall filters Will Drewry
                     ` (10 subsequent siblings)
  11 siblings, 0 replies; 51+ messages in thread
From: Will Drewry @ 2011-06-24  0:36 UTC (permalink / raw)
  To: linux-kernel
  Cc: torvalds, djm, segoon, kees.cook, mingo, rostedt, jmorris,
	fweisbec, tglx, scarybeasts, Will Drewry, David S. Miller,
	sparclinux

Facilitate the use of CONFIG_SECCOMP_FILTER by wrapping compatibility
system call numbering for execve and selecting HAVE_SECCOMP_FILTER.

v9: rebase on to bccaeafd7c117acee36e90d37c7e05c19be9e7bf

Signed-off-by: Will Drewry <wad@chromium.org>
---
 arch/sparc/Kconfig               |    2 ++
 arch/sparc/include/asm/seccomp.h |    2 ++
 2 files changed, 4 insertions(+), 0 deletions(-)

diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 253986b..d4d0ec4 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -26,6 +26,7 @@ config SPARC
 	select HAVE_DMA_API_DEBUG
 	select HAVE_ARCH_JUMP_LABEL
 	select HAVE_GENERIC_HARDIRQS
+	select HAVE_SECCOMP_FILTER
 	select GENERIC_IRQ_SHOW
 	select USE_GENERIC_SMP_HELPERS if SMP
 
@@ -42,6 +43,7 @@ config SPARC64
 	select HAVE_KRETPROBES
 	select HAVE_KPROBES
 	select HAVE_MEMBLOCK
+	select HAVE_SECCOMP_FILTER
 	select HAVE_SYSCALL_WRAPPERS
 	select HAVE_DYNAMIC_FTRACE
 	select HAVE_FTRACE_MCOUNT_RECORD
diff --git a/arch/sparc/include/asm/seccomp.h b/arch/sparc/include/asm/seccomp.h
index adca1bc..a1dac08 100644
--- a/arch/sparc/include/asm/seccomp.h
+++ b/arch/sparc/include/asm/seccomp.h
@@ -6,10 +6,12 @@
 #define __NR_seccomp_write __NR_write
 #define __NR_seccomp_exit __NR_exit
 #define __NR_seccomp_sigreturn __NR_rt_sigreturn
+#define __NR_seccomp_execve __NR_execve
 
 #define __NR_seccomp_read_32 __NR_read
 #define __NR_seccomp_write_32 __NR_write
 #define __NR_seccomp_exit_32 __NR_exit
 #define __NR_seccomp_sigreturn_32 __NR_sigreturn
+#define __NR_seccomp_execve_32 __NR_execve
 
 #endif /* _ASM_SECCOMP_H */
-- 
1.7.0.4


^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH v9 12/13] sparc: select HAVE_SECCOMP_FILTER and provide seccomp_execve
@ 2011-06-24  0:36   ` Will Drewry
  0 siblings, 0 replies; 51+ messages in thread
From: Will Drewry @ 2011-06-24  0:36 UTC (permalink / raw)
  To: linux-kernel
  Cc: torvalds, djm, segoon, kees.cook, mingo, rostedt, jmorris,
	fweisbec, tglx, scarybeasts, Will Drewry, David S. Miller,
	sparclinux

Facilitate the use of CONFIG_SECCOMP_FILTER by wrapping compatibility
system call numbering for execve and selecting HAVE_SECCOMP_FILTER.

v9: rebase on to bccaeafd7c117acee36e90d37c7e05c19be9e7bf

Signed-off-by: Will Drewry <wad@chromium.org>
---
 arch/sparc/Kconfig               |    2 ++
 arch/sparc/include/asm/seccomp.h |    2 ++
 2 files changed, 4 insertions(+), 0 deletions(-)

diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 253986b..d4d0ec4 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -26,6 +26,7 @@ config SPARC
 	select HAVE_DMA_API_DEBUG
 	select HAVE_ARCH_JUMP_LABEL
 	select HAVE_GENERIC_HARDIRQS
+	select HAVE_SECCOMP_FILTER
 	select GENERIC_IRQ_SHOW
 	select USE_GENERIC_SMP_HELPERS if SMP
 
@@ -42,6 +43,7 @@ config SPARC64
 	select HAVE_KRETPROBES
 	select HAVE_KPROBES
 	select HAVE_MEMBLOCK
+	select HAVE_SECCOMP_FILTER
 	select HAVE_SYSCALL_WRAPPERS
 	select HAVE_DYNAMIC_FTRACE
 	select HAVE_FTRACE_MCOUNT_RECORD
diff --git a/arch/sparc/include/asm/seccomp.h b/arch/sparc/include/asm/seccomp.h
index adca1bc..a1dac08 100644
--- a/arch/sparc/include/asm/seccomp.h
+++ b/arch/sparc/include/asm/seccomp.h
@@ -6,10 +6,12 @@
 #define __NR_seccomp_write __NR_write
 #define __NR_seccomp_exit __NR_exit
 #define __NR_seccomp_sigreturn __NR_rt_sigreturn
+#define __NR_seccomp_execve __NR_execve
 
 #define __NR_seccomp_read_32 __NR_read
 #define __NR_seccomp_write_32 __NR_write
 #define __NR_seccomp_exit_32 __NR_exit
 #define __NR_seccomp_sigreturn_32 __NR_sigreturn
+#define __NR_seccomp_execve_32 __NR_execve
 
 #endif /* _ASM_SECCOMP_H */
-- 
1.7.0.4


^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH v9 13/13] sh: select HAVE_SECCOMP_FILTER
  2011-06-24  0:36 [PATCH v9 01/13] tracing: split out filter initialization and clean up uses Will Drewry
@ 2011-06-24  0:36   ` Will Drewry
  2011-06-24  0:36 ` [PATCH v9 03/13] seccomp_filter: new mode with configurable syscall filters Will Drewry
                     ` (10 subsequent siblings)
  11 siblings, 0 replies; 51+ messages in thread
From: Will Drewry @ 2011-06-24  0:36 UTC (permalink / raw)
  To: linux-kernel
  Cc: torvalds, djm, segoon, kees.cook, mingo, rostedt, jmorris,
	fweisbec, tglx, scarybeasts, Will Drewry, Paul Mundt, linux-sh

Add support for CONFIG_SECCOMP_FILTER by selecting HAVE_SECCOMP_FILTER.

v9: rebase on to bccaeafd7c117acee36e90d37c7e05c19be9e7bf

Signed-off-by: Will Drewry <wad@chromium.org>
---
 arch/sh/Kconfig |    1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index f03338c..79f046d 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -10,6 +10,7 @@ config SUPERH
 	select HAVE_DMA_API_DEBUG
 	select HAVE_DMA_ATTRS
 	select HAVE_IRQ_WORK
+	select HAVE_SECCOMP_FILTER
 	select HAVE_PERF_EVENTS
 	select PERF_USE_VMALLOC
 	select HAVE_KERNEL_GZIP
-- 
1.7.0.4


^ permalink raw reply related	[flat|nested] 51+ messages in thread

* [PATCH v9 13/13] sh: select HAVE_SECCOMP_FILTER
@ 2011-06-24  0:36   ` Will Drewry
  0 siblings, 0 replies; 51+ messages in thread
From: Will Drewry @ 2011-06-24  0:36 UTC (permalink / raw)
  To: linux-kernel
  Cc: torvalds, djm, segoon, kees.cook, mingo, rostedt, jmorris,
	fweisbec, tglx, scarybeasts, Will Drewry, Paul Mundt, linux-sh

Add support for CONFIG_SECCOMP_FILTER by selecting HAVE_SECCOMP_FILTER.

v9: rebase on to bccaeafd7c117acee36e90d37c7e05c19be9e7bf

Signed-off-by: Will Drewry <wad@chromium.org>
---
 arch/sh/Kconfig |    1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index f03338c..79f046d 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -10,6 +10,7 @@ config SUPERH
 	select HAVE_DMA_API_DEBUG
 	select HAVE_DMA_ATTRS
 	select HAVE_IRQ_WORK
+	select HAVE_SECCOMP_FILTER
 	select HAVE_PERF_EVENTS
 	select PERF_USE_VMALLOC
 	select HAVE_KERNEL_GZIP
-- 
1.7.0.4


^ permalink raw reply related	[flat|nested] 51+ messages in thread

* Re: [PATCH v9 05/13] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-06-24  0:36 ` [PATCH v9 05/13] seccomp_filter: Document what seccomp_filter is and how it works Will Drewry
@ 2011-06-24  7:24   ` Chris Evans
       [not found]   ` <BANLkTimtYUyXbZjWhjK61B_1WBXE4MoAeA@mail.gmail.com>
  1 sibling, 0 replies; 51+ messages in thread
From: Chris Evans @ 2011-06-24  7:24 UTC (permalink / raw)
  To: Will Drewry
  Cc: linux-kernel, torvalds, djm, segoon, kees.cook, mingo, rostedt,
	jmorris, fweisbec, tglx, Randy Dunlap, linux-doc

I just wanted to add a +1 for this facility, now that it has undergone
extensive review and tweaking. I've wanted something similar in the
Linux kernel for a long time.

With patches like these, there can be the concern: will anyone actually use it??

I will definitely be using this in vsftpd, Chromium and internally at Google.


Cheers
Chris

On Thu, Jun 23, 2011 at 5:36 PM, Will Drewry <wad@chromium.org> wrote:
>
> Adds a text file covering what CONFIG_SECCOMP_FILTER is, how it is
> implemented presently, and what it may be used for.  In addition,
> the limitations and caveats of the proposed implementation are
> included.
>
> v9: rebase on to bccaeafd7c117acee36e90d37c7e05c19be9e7bf
> v8: -
> v7: Add a caveat around fork behavior and execve
> v6: -
> v5: -
> v4: rewording (courtesy kees.cook@canonical.com)
>    reflect support for event ids
>    add a small section on adding per-arch support
> v3: a little more cleanup
> v2: moved to prctl/
>    updated for the v2 syntax.
>    adds a note about compat behavior
>
> Signed-off-by: Will Drewry <wad@chromium.org>
> ---
>  Documentation/prctl/seccomp_filter.txt |  189 ++++++++++++++++++++++++++++++++
>  1 files changed, 189 insertions(+), 0 deletions(-)
>  create mode 100644 Documentation/prctl/seccomp_filter.txt
>
> diff --git a/Documentation/prctl/seccomp_filter.txt b/Documentation/prctl/seccomp_filter.txt
> new file mode 100644
> index 0000000..a9cddc2
> --- /dev/null
> +++ b/Documentation/prctl/seccomp_filter.txt
> @@ -0,0 +1,189 @@
> +               Seccomp filtering
> +               =================
> +
> +Introduction
> +------------
> +
> +A large number of system calls are exposed to every userland process
> +with many of them going unused for the entire lifetime of the process.
> +As system calls change and mature, bugs are found and eradicated.  A
> +certain subset of userland applications benefit by having a reduced set
> +of available system calls.  The resulting set reduces the total kernel
> +surface exposed to the application.  System call filtering is meant for
> +use with those applications.
> +
> +The implementation currently leverages both the existing seccomp
> +infrastructure and the kernel tracing infrastructure.  By centralizing
> +hooks for attack surface reduction in seccomp, it is possible to assure
> +attention to security that is less relevant in normal ftrace scenarios,
> +such as time-of-check, time-of-use attacks.  However, ftrace provides a
> +rich, human-friendly environment for interfacing with system call
> +specific arguments.  (As such, this requires FTRACE_SYSCALLS for any
> +introspective filtering support.)
> +
> +
> +What it isn't
> +-------------
> +
> +System call filtering isn't a sandbox.  It provides a clearly defined
> +mechanism for minimizing the exposed kernel surface.  Beyond that,
> +policy for logical behavior and information flow should be managed with
> +a combination of other system hardening techniques and, potentially, a
> +LSM of your choosing.  Expressive, dynamic filters based on the ftrace
> +filter engine provide further options down this path (avoiding
> +pathological sizes or selecting which of the multiplexed system calls in
> +socketcall() is allowed, for instance) which could be construed,
> +incorrectly, as a more complete sandboxing solution.
> +
> +
> +Usage
> +-----
> +
> +An additional seccomp mode is exposed through mode '2'.
> +This mode depends on CONFIG_SECCOMP_FILTER.  By default, it provides
> +only the most trivial of filter support "1" or cleared.  However, if
> +CONFIG_FTRACE_SYSCALLS is enabled, the ftrace filter engine may be used
> +for more expressive filters.
> +
> +A collection of filters may be supplied via prctl, and the current set
> +of filters is exposed in /proc/<pid>/seccomp_filter.
> +
> +Interacting with seccomp filters can be done through three new prctl calls
> +and one existing one.
> +
> +PR_SET_SECCOMP:
> +       A pre-existing option for enabling strict seccomp mode (1) or
> +       filtering seccomp (2).
> +
> +       Usage:
> +               prctl(PR_SET_SECCOMP, 1);  /* strict */
> +               prctl(PR_SET_SECCOMP, 2);  /* filters */
> +
> +PR_SET_SECCOMP_FILTER:
> +       Allows the specification of a new filter for a given system
> +       call, by number, and filter string.  By default, the filter
> +       string may only be "1".  However, if CONFIG_FTRACE_SYSCALLS is
> +       supported, the filter string may make use of the ftrace
> +       filtering language's awareness of system call arguments.
> +
> +       In addition, the event id for the system call entry may be
> +       specified in lieu of the system call number itself, as
> +       determined by the 'type' argument.  This allows for the future
> +       addition of seccomp-based filtering on other registered,
> +       relevant ftrace events.
> +
> +       All calls to PR_SET_SECCOMP_FILTER for a given system
> +       call will append the supplied string to any existing filters.
> +       Filter construction looks as follows:
> +               (Nothing) + "fd == 1 || fd == 2" => fd == 1 || fd == 2
> +               ... + "fd != 2" => (fd == 1 || fd == 2) && fd != 2
> +               ... + "size < 100" =>
> +                       ((fd == 1 || fd == 2) && fd != 2) && size < 100
> +       If there is no filter and the seccomp mode has already
> +       transitioned to filtering, additions cannot be made.  Filters
> +       may only be added that reduce the available kernel surface.
> +
> +       Usage (per the construction example above):
> +               unsigned long type = PR_SECCOMP_FILTER_SYSCALL;
> +               prctl(PR_SET_SECCOMP_FILTER, type, __NR_write,
> +                       "fd == 1 || fd == 2");
> +               prctl(PR_SET_SECCOMP_FILTER, type, __NR_write,
> +                       "fd != 2");
> +               prctl(PR_SET_SECCOMP_FILTER, type, __NR_write,
> +                       "size < 100");
> +
> +       The 'type' argument may be one of PR_SECCOMP_FILTER_SYSCALL or
> +       PR_SECCOMP_FILTER_EVENT.
> +
> +PR_CLEAR_SECCOMP_FILTER:
> +       Removes all filter entries for a given system call number or
> +       event id.  When called prior to entering seccomp filtering mode,
> +       it allows for new filters to be applied to the same system call.
> +       After transition, however, it completely drops access to the
> +       call.
> +
> +       Usage:
> +               prctl(PR_CLEAR_SECCOMP_FILTER,
> +                       PR_SECCOMP_FILTER_SYSCALL, __NR_open);
> +
> +PR_GET_SECCOMP_FILTER:
> +       Returns the aggregated filter string for a system call into a
> +       user-supplied buffer of a given length.
> +
> +       Usage:
> +               prctl(PR_GET_SECCOMP_FILTER,
> +                       PR_SECCOMP_FILTER_SYSCALL, __NR_write, buf,
> +                       sizeof(buf));
> +
> +All of the above calls return 0 on success and non-zero on error.  If
> +CONFIG_FTRACE_SYSCALLS is not supported and a rich-filter was specified,
> +the caller may check the errno for -ENOSYS.  The same is true if
> +specifying a filter by the event id fails to discover any relevant
> +event entries.
> +
> +
> +Example
> +-------
> +
> +Assume a process would like to cleanly read and write to stdin/out/err
> +as well as access its filters after seccomp enforcement begins.  This
> +may be done as follows:
> +
> +  int filter_syscall(int nr, char *buf) {
> +    return prctl(PR_SET_SECCOMP_FILTER, PR_SECCOMP_FILTER_SYSCALL,
> +                 nr, buf);
> +  }
> +
> +  filter_syscall(__NR_read, "fd == 0");
> +  filter_syscall(__NR_write, "fd == 1 || fd == 2");
> +  filter_syscall(__NR_exit, "1");
> +  filter_syscall(__NR_prctl, "1");
> +  prctl(PR_SET_SECCOMP, 2);
> +
> +  /* Do stuff with fdset . . .*/
> +
> +  /* Drop read access and keep only write access to fd 1. */
> +  prctl(PR_CLEAR_SECCOMP_FILTER, PR_SECCOMP_FILTER_SYSCALL, __NR_read);
> +  filter_syscall(__NR_write, "fd != 2");
> +
> +  /* Perform any final processing . . . */
> +  syscall(__NR_exit, 0);
> +
> +
> +Caveats
> +-------
> +
> +- Avoid using a filter of "0" to disable a filter.  Always favor calling
> +  prctl(PR_CLEAR_SECCOMP_FILTER, ...).  Otherwise the behavior may vary
> +  depending on if CONFIG_FTRACE_SYSCALLS support exists -- though an
> +  error will be returned if the support is missing.
> +
> +- execve is always blocked.  seccomp filters may not cross that boundary.
> +
> +- Filters can be inherited across fork/clone but only when they are
> +  active (e.g., PR_SET_SECCOMP has been set to 2), but not prior to use.
> +  This stops the parent process from adding filters that may undermine
> +  the child process security or create unexpected behavior after an
> +  execve.
> +
> +- Some platforms support a 32-bit userspace with 64-bit kernels.  In
> +  these cases (CONFIG_COMPAT), system call numbers may not match across
> +  64-bit and 32-bit system calls. When the first PR_SET_SECCOMP_FILTER
> +  is called, the in-memory filters state is annotated with whether the
> +  call has been made via the compat interface.  All subsequent calls will
> +  be checked for compat call mismatch.  In the long run, it may make sense
> +  to store compat and non-compat filters separately, but that is not
> +  supported at present. Once one type of system call interface has been
> +  used, it must continue to be used.
> +
> +
> +Adding architecture support
> +---------------------------
> +
> +Any platform with seccomp support should be able to support the bare
> +minimum of seccomp filter features.  However, since seccomp_filter
> +requires that execve be blocked, it expects the architecture to expose a
> +__NR_seccomp_execve define that maps to the execve system call number.
> +On platforms where CONFIG_COMPAT applies, __NR_seccomp_execve_32 must
> +also be provided.  Once those macros exist, "select HAVE_SECCOMP_FILTER"
> +support may be added to the architecture's Kconfig.
> --
> 1.7.0.4
>

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v9 03/13] seccomp_filter: new mode with configurable syscall filters
  2011-06-24  0:36 ` [PATCH v9 03/13] seccomp_filter: new mode with configurable syscall filters Will Drewry
@ 2011-06-24  7:30   ` Damien Miller
  2011-06-24 20:20   ` Kees Cook
  1 sibling, 0 replies; 51+ messages in thread
From: Damien Miller @ 2011-06-24  7:30 UTC (permalink / raw)
  To: Will Drewry
  Cc: linux-kernel, torvalds, segoon, kees.cook, mingo, rostedt,
	jmorris, fweisbec, tglx, scarybeasts, Peter Zijlstra,
	Paul E. McKenney, Andrew Morton, David Howells, Eric Paris,
	Borislav Petkov, Michal Marek, Mike Galbraith, Serge E. Hallyn,
	Jiri Slaby, Greg Kroah-Hartman, linux-security-module

On Thu, 23 Jun 2011, Will Drewry wrote:

> This change adds a new seccomp mode which specifies the allowed system
> calls dynamically.  When in the new mode (2), all system calls are
> checked against process-defined filters - first by system call number,
> then by a filter string.  If an entry exists for a given system call and
> all filter predicates evaluate to true, then the task may proceed.
> Otherwise, the task is killed.

This would be a great help for OpenSSH to improve pre-authentication
privilege separation. We use a separate process that is chroot()ed and
switched to a dedicated UID to limit the effects of compromise in the
complex network-facing code, but a vulnerability in this process
still allows an attacker to open new network sockets (e.g. to proxy
attacks through your firewall) or attempt to exploit local kernel bugs.

If we are able to restrict the syscalls the unprivileged process can make
then it becomes very difficult for an attacker to do anything useful -
they won't be able to open new sockets and a narrow set of available
syscalls makes exploitable kernel bugs much harder to find or reach.

The current SECCOMP sandbox is too restrictive, because we need to
poll/select() on sockets and mmap() to allocate new memory. Allowing
application developers to specify which syscalls are allowed is a
good, low-cost way to make the SECCOMP sandbox much more useful.

-d

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v9 03/13] seccomp_filter: new mode with configurable syscall filters
  2011-06-24  0:36 ` [PATCH v9 03/13] seccomp_filter: new mode with configurable syscall filters Will Drewry
  2011-06-24  7:30   ` Damien Miller
@ 2011-06-24 20:20   ` Kees Cook
  1 sibling, 0 replies; 51+ messages in thread
From: Kees Cook @ 2011-06-24 20:20 UTC (permalink / raw)
  To: Will Drewry
  Cc: linux-kernel, torvalds, djm, segoon, mingo, rostedt, jmorris,
	fweisbec, tglx, scarybeasts, Peter Zijlstra, Paul E. McKenney,
	Andrew Morton, David Howells, Eric Paris, Borislav Petkov,
	Michal Marek, Mike Galbraith, Serge E. Hallyn, Jiri Slaby,
	Greg Kroah-Hartman, linux-security-module

Hi Will,

On Thu, Jun 23, 2011 at 07:36:42PM -0500, Will Drewry wrote:
> This change adds a new seccomp mode which specifies the allowed system
> calls dynamically.  When in the new mode (2), all system calls are
> checked against process-defined filters - first by system call number,
> then by a filter string.  If an entry exists for a given system call and
> all filter predicates evaluate to true, then the task may proceed.
> Otherwise, the task is killed.
> [...]
> Signed-off-by: Will Drewry <wad@chromium.org>

Thanks for continuing to work on this. I look forward to being able to use
it. :)

Acked-by: Kees Cook <kees.cook@canonical.com>

-- 
Kees Cook
Ubuntu Security Team

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v9 05/13] seccomp_filter: Document what seccomp_filter is and how it works.
       [not found]   ` <BANLkTimtYUyXbZjWhjK61B_1WBXE4MoAeA@mail.gmail.com>
@ 2011-06-26 23:20     ` James Morris
  2011-06-29 19:13       ` Will Drewry
  0 siblings, 1 reply; 51+ messages in thread
From: James Morris @ 2011-06-26 23:20 UTC (permalink / raw)
  To: Chris Evans
  Cc: Will Drewry, linux-kernel, torvalds, djm, segoon, kees.cook,
	mingo, rostedt, fweisbec, tglx, Randy Dunlap, linux-doc

On Fri, 24 Jun 2011, Chris Evans wrote:

> With patches like these, there can be the concern: will anyone actually use
> it??

It's likely to be used by kvm/qemu to help protect against vm break-outs.

-- 
James Morris
<jmorris@namei.org>

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v9 05/13] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-06-26 23:20     ` James Morris
@ 2011-06-29 19:13       ` Will Drewry
  2011-06-30  1:30         ` James Morris
  0 siblings, 1 reply; 51+ messages in thread
From: Will Drewry @ 2011-06-29 19:13 UTC (permalink / raw)
  To: James Morris
  Cc: Chris Evans, linux-kernel, torvalds, djm, segoon, kees.cook,
	mingo, rostedt, fweisbec, tglx, Randy Dunlap, linux-doc,
	Eric Paris

On Sun, Jun 26, 2011 at 6:20 PM, James Morris <jmorris@namei.org> wrote:
> On Fri, 24 Jun 2011, Chris Evans wrote:
>
>> With patches like these, there can be the concern: will anyone actually use
>> it??
>
> It's likely to be used by kvm/qemu to help protect against vm break-outs.

Since it seems that there'll be consumers (openssh, vsftpd, kvm/qemu,
chromium, chromium os) and feedback quieted down, what are the next
steps to get this to a pull/no-pull decision points (or at least some
Ack's or Nack's)?  I know this patch series crosses a number of
maintainers, and I never know exactly what's next when the feedback
slows down.

Thanks!
will

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v9 05/13] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-06-29 19:13       ` Will Drewry
@ 2011-06-30  1:30         ` James Morris
  2011-07-01 11:56           ` Ingo Molnar
  0 siblings, 1 reply; 51+ messages in thread
From: James Morris @ 2011-06-30  1:30 UTC (permalink / raw)
  To: Will Drewry
  Cc: Chris Evans, linux-kernel, Linus Torvalds, djm, segoon,
	kees.cook, Ingo Molnar, rostedt, fweisbec, tglx, Randy Dunlap,
	linux-doc, Eric Paris, linux-security-module

On Wed, 29 Jun 2011, Will Drewry wrote:

> Since it seems that there'll be consumers (openssh, vsftpd, kvm/qemu,
> chromium, chromium os) and feedback quieted down, what are the next
> steps to get this to a pull/no-pull decision points (or at least some
> Ack's or Nack's)?  I know this patch series crosses a number of
> maintainers, and I never know exactly what's next when the feedback
> slows down.

Are there any outstanding objections to this approach?  How do the tracing 
folk feel about it?

-- 
James Morris
<jmorris@namei.org>

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v9 05/13] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-06-30  1:30         ` James Morris
@ 2011-07-01 11:56           ` Ingo Molnar
  2011-07-01 12:56             ` Will Drewry
  0 siblings, 1 reply; 51+ messages in thread
From: Ingo Molnar @ 2011-07-01 11:56 UTC (permalink / raw)
  To: James Morris
  Cc: Will Drewry, Chris Evans, linux-kernel, Linus Torvalds, djm,
	segoon, kees.cook, rostedt, fweisbec, tglx, Randy Dunlap,
	linux-doc, Eric Paris, linux-security-module


* James Morris <jmorris@namei.org> wrote:

> On Wed, 29 Jun 2011, Will Drewry wrote:
> 
> > Since it seems that there'll be consumers (openssh, vsftpd, 
> > kvm/qemu, chromium, chromium os) and feedback quieted down, what 
> > are the next steps to get this to a pull/no-pull decision points 
> > (or at least some Ack's or Nack's)?  I know this patch series 
> > crosses a number of maintainers, and I never know exactly what's 
> > next when the feedback slows down.
> 
> Are there any outstanding objections to this approach?  How do the 
> tracing folk feel about it?

I think i outlined my objections a couple of times and haven't seen 
them addressed.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v9 05/13] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-07-01 11:56           ` Ingo Molnar
@ 2011-07-01 12:56             ` Will Drewry
  2011-07-01 13:07               ` Ingo Molnar
  0 siblings, 1 reply; 51+ messages in thread
From: Will Drewry @ 2011-07-01 12:56 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: James Morris, Chris Evans, linux-kernel, Linus Torvalds, djm,
	segoon, kees.cook, rostedt, fweisbec, tglx, Randy Dunlap,
	linux-doc, Eric Paris, linux-security-module

On Fri, Jul 1, 2011 at 6:56 AM, Ingo Molnar <mingo@elte.hu> wrote:
>
> * James Morris <jmorris@namei.org> wrote:
>
>> On Wed, 29 Jun 2011, Will Drewry wrote:
>>
>> > Since it seems that there'll be consumers (openssh, vsftpd,
>> > kvm/qemu, chromium, chromium os) and feedback quieted down, what
>> > are the next steps to get this to a pull/no-pull decision points
>> > (or at least some Ack's or Nack's)?  I know this patch series
>> > crosses a number of maintainers, and I never know exactly what's
>> > next when the feedback slows down.
>>
>> Are there any outstanding objections to this approach?  How do the
>> tracing folk feel about it?
>
> I think i outlined my objections a couple of times and haven't seen
> them addressed.

After our last discussion, I suggested changes which I then undertook
and reposted.  Those changes have been posted for over two weeks.
I've continued to resolve any bugs with the patches and keep them
rebased.

Is it just that you will not accept the prctl()-based approach?  If
that's the case, I'm confused as it doesn't seem like there was
support for a perf-based change or a change that was more fundamental.

If you would re-outline the places where it falls down, that would be
useful.  If it is just that you do not think we can reach agreement,
that would be useful to know as well.

thanks,
will

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v9 05/13] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-07-01 12:56             ` Will Drewry
@ 2011-07-01 13:07               ` Ingo Molnar
  2011-07-01 15:46                 ` Will Drewry
  2011-07-05 15:26                 ` Vasiliy Kulikov
  0 siblings, 2 replies; 51+ messages in thread
From: Ingo Molnar @ 2011-07-01 13:07 UTC (permalink / raw)
  To: Will Drewry
  Cc: James Morris, Chris Evans, linux-kernel, Linus Torvalds, djm,
	segoon, kees.cook, rostedt, fweisbec, tglx, Randy Dunlap,
	linux-doc, Eric Paris, linux-security-module


* Will Drewry <wad@chromium.org> wrote:

> On Fri, Jul 1, 2011 at 6:56 AM, Ingo Molnar <mingo@elte.hu> wrote:
> >
> > * James Morris <jmorris@namei.org> wrote:
> >
> >> On Wed, 29 Jun 2011, Will Drewry wrote:
> >>
> >> > Since it seems that there'll be consumers (openssh, vsftpd,
> >> > kvm/qemu, chromium, chromium os) and feedback quieted down, what
> >> > are the next steps to get this to a pull/no-pull decision points
> >> > (or at least some Ack's or Nack's)?  I know this patch series
> >> > crosses a number of maintainers, and I never know exactly what's
> >> > next when the feedback slows down.
> >>
> >> Are there any outstanding objections to this approach?  How do the
> >> tracing folk feel about it?
> >
> > I think i outlined my objections a couple of times and haven't seen
> > them addressed.
> 
> After our last discussion, I suggested changes which I then undertook
> and reposted.  Those changes have been posted for over two weeks.

Have you addressed my basic objection of why we should go for a more 
complex and less capable variant over a shared yet more capable 
facility:

  http://lkml.kernel.org/r/20110526091518.GE26775@elte.hu

?

You are pushing the 'filter engine' approach currently, not the 
(much) more unified 'event filters' approach.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v9 05/13] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-07-01 13:07               ` Ingo Molnar
@ 2011-07-01 15:46                 ` Will Drewry
  2011-07-01 16:10                   ` Ingo Molnar
  2011-07-05 15:26                 ` Vasiliy Kulikov
  1 sibling, 1 reply; 51+ messages in thread
From: Will Drewry @ 2011-07-01 15:46 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: James Morris, Chris Evans, linux-kernel, Linus Torvalds, djm,
	segoon, kees.cook, rostedt, fweisbec, tglx, Randy Dunlap,
	linux-doc, Eric Paris, linux-security-module

On Fri, Jul 1, 2011 at 8:07 AM, Ingo Molnar <mingo@elte.hu> wrote:
>
> * Will Drewry <wad@chromium.org> wrote:
>
>> On Fri, Jul 1, 2011 at 6:56 AM, Ingo Molnar <mingo@elte.hu> wrote:
>> >
>> > * James Morris <jmorris@namei.org> wrote:
>> >
>> >> On Wed, 29 Jun 2011, Will Drewry wrote:
>> >>
>> >> > Since it seems that there'll be consumers (openssh, vsftpd,
>> >> > kvm/qemu, chromium, chromium os) and feedback quieted down, what
>> >> > are the next steps to get this to a pull/no-pull decision points
>> >> > (or at least some Ack's or Nack's)?  I know this patch series
>> >> > crosses a number of maintainers, and I never know exactly what's
>> >> > next when the feedback slows down.
>> >>
>> >> Are there any outstanding objections to this approach?  How do the
>> >> tracing folk feel about it?
>> >
>> > I think i outlined my objections a couple of times and haven't seen
>> > them addressed.
>>
>> After our last discussion, I suggested changes which I then undertook
>> and reposted.  Those changes have been posted for over two weeks.
>
> Have you addressed my basic objection of why we should go for a more
> complex and less capable variant over a shared yet more capable
> facility:
>
>  http://lkml.kernel.org/r/20110526091518.GE26775@elte.hu
>
> ?

I withheld a fair number of comments because of the other participants
addressed some of them, but I believe I've laid out my thoughts at
least twice.  Perhaps not clearly, though.  I can post links if you
like, but they were all direct responses to your posts.

> You are pushing the 'filter engine' approach currently, not the
> (much) more unified 'event filters' approach.

In short, it is a large amount of work that will provide an incomplete
solution.  As is, this patch series has been in process for over 8
weeks. I've rewritten it about twice as many times as the 'v9' label
indicates.  After doing that, I do not believe your proposed solution
is simpler or more reasonable in the near term (<= 2 years).  Perhaps
the issue is just that someone who is more skilled and more
comfortable in the kernel could propose a better patch series - I
don't know.



From my view, ftrace events are not ready for the job yet - and
relying purely on available wrapped events may make it unsuitable for
attack surface reduction forever.  As is, there is no compat syscall
support.  Many syscalls are not wrapped at present and no one ack'd my
earlier patches around wrapping more.  All of perf needs to be
overhauled to share per-task infrastructure. A new ABI needs to be
proposed if my prctl() changes are not acceptable to handle some of
the security-focused behavioral requirements.  Performance
characteristics need to be better analyzed as the current perf
list_head approach may not scale as desired.  The list goes on.  My
proof of concept patch for "event filters" was just that - a proof of
concept.  To truly share the filter events is a large amount of work
that may not be viable, and I believe you know that as well as I do.

That said, I did attempt to internalize all of your feedback, the
feedback from potential consumers, Linus' feedback, and that from
Steve and Frederic, along with the other people who were so kind as to
provide excellent feedback.  The current patch series is forward
portable to an "event filters" model and nearly all of its per-task
management complexity could be folded into shared code if it is
possible to create such infrastructure (for perf, event filters, ?).
Alternatively, this interface could become the entry point until such
time as it outgrows a per-process only experience.

Based on the support from potential API consumers, I believe there is
interest in this patch series, and I worry that just like with the
last two attempts in the last two years, this series will be relegated
to the lwn archives in anticipation of a future solution that uses
infrastructure that isn't quite ready.  I'm trying to approach a
problem that can be addressed today in a flexible, future-friendly
way, rather than try to open up a larger cross-kernel impacting patch
series that I'm unsure of exactly how to integrate sanely and don't
know that I can commit to doing.  And I don't see anyone else jumping
up to write the patch series to do it either :/



Anyway, that's where I am.  I'd really like to be able to offer this
functionality for all Linux users of Chromium and I'd be thrilled to
see it helping enhance the privilege separation code in OpenSSH, among
the other projects that have come forward (qemu, etc).

If you still think this patch series is a "stupid, limited hack" even
with my attempts to balance the present with a possible future, then I'm
not sure what the practical next steps are.  And even if it is such a
hack, it is isolated to a very specific purpose and does not impact
those who don't care about it.  I don't think there's an attainable,
perfect solution, and when it comes to attack surface reduction, it's
hard to be hooking the syscalls very explicitly.

Thanks again for the continued feedback,
will

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v9 05/13] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-07-01 15:46                 ` Will Drewry
@ 2011-07-01 16:10                   ` Ingo Molnar
  2011-07-01 16:43                     ` Will Drewry
  0 siblings, 1 reply; 51+ messages in thread
From: Ingo Molnar @ 2011-07-01 16:10 UTC (permalink / raw)
  To: Will Drewry
  Cc: James Morris, Chris Evans, linux-kernel, Linus Torvalds, djm,
	segoon, kees.cook, rostedt, fweisbec, tglx, Randy Dunlap,
	linux-doc, Eric Paris, linux-security-module


* Will Drewry <wad@chromium.org> wrote:

> From my view, ftrace events are not ready for the job yet - and 
> relying purely on available wrapped events may make it unsuitable 
> for attack surface reduction forever.  As is, there is no compat 
> syscall support.  Many syscalls are not wrapped at present and no 
> one ack'd my earlier patches around wrapping more.  All of perf 
> needs to be overhauled to share per-task infrastructure. A new ABI 
> needs to be proposed if my prctl() changes are not acceptable to 
> handle some of the security-focused behavioral requirements.  
> Performance characteristics need to be better analyzed as the 
> current perf list_head approach may not scale as desired.  The list 
> goes on.  My proof of concept patch for "event filters" was just 
> that - a proof of concept.  To truly share the filter events is a 
> large amount of work that may not be viable, and I believe you know 
> that as well as I do.

But that's exactly my point: i consider it the right way forward 
because it maximizes kernel utility in the long run.

Note that *all* the specific technical items you mention:

 - wrapping more syscalls (i.e. making syscall tracing
   feature-complete)

 - a clean filtering ABI

 - performance improvements. (Note that this one is already
   in progress, Thomas has written an IDR implementation that
   eliminates the list iteration entirely. You could help him
   finish  it.)

are not some bad side effect or quirk, they are all generic 
improvements we want in any case and not just for sandboxing.

You might not be interested in all of those items, you are only 
interested in getting the narrow feature-set you are interested in, 
but you sure are interested in getting sandboxing versus not getting 
anything at all, right?

Not doing it right because "it's too much work", especially as the 
trivial 'proof of concept' prototype already gave us something very 
promising that worked to a fair degree:

       bitmask (2009):  6 files changed,  194 insertions(+), 22 deletions(-)
 filter engine (2010): 18 files changed, 1100 insertions(+), 21 deletions(-)
 event filters (2011):  5 files changed,   82 insertions(+), 16 deletions(-)

are pretty hollow arguments to me. That diffstat sums up my argument 
of proper structure pretty well.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v9 05/13] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-07-01 16:10                   ` Ingo Molnar
@ 2011-07-01 16:43                     ` Will Drewry
  2011-07-01 18:04                       ` Steven Rostedt
  2011-07-01 21:00                       ` Ingo Molnar
  0 siblings, 2 replies; 51+ messages in thread
From: Will Drewry @ 2011-07-01 16:43 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: James Morris, Chris Evans, linux-kernel, Linus Torvalds, djm,
	segoon, kees.cook, rostedt, fweisbec, tglx, Randy Dunlap,
	linux-doc, Eric Paris, linux-security-module

On Fri, Jul 1, 2011 at 11:10 AM, Ingo Molnar <mingo@elte.hu> wrote:
>
> * Will Drewry <wad@chromium.org> wrote:
>
>> From my view, ftrace events are not ready for the job yet - and
>> relying purely on available wrapped events may make it unsuitable
>> for attack surface reduction forever.  As is, there is no compat
>> syscall support.  Many syscalls are not wrapped at present and no
>> one ack'd my earlier patches around wrapping more.  All of perf
>> needs to be overhauled to share per-task infrastructure. A new ABI
>> needs to be proposed if my prctl() changes are not acceptable to
>> handle some of the security-focused behavioral requirements.
>> Performance characteristics need to be better analyzed as the
>> current perf list_head approach may not scale as desired.  The list
>> goes on.  My proof of concept patch for "event filters" was just
>> that - a proof of concept.  To truly share the filter events is a
>> large amount of work that may not be viable, and I believe you know
>> that as well as I do.
>
> But that's exactly my point: i consider it the right way forward
> because it maximizes kernel utility in the long run.

Not if it never happens.  Which is what happened with the proposals
from Adam and from Eric.

> Note that *all* the specific technical items you mention:
>
>  - wrapping more syscalls (i.e. making syscall tracing
>   feature-complete)
>
>  - a clean filtering ABI
>
>  - performance improvements. (Note that this one is already
>   in progress, Thomas has written an IDR implementation that
>   eliminates the list iteration entirely. You could help him
>   finish  it.)

I was thinking specifically about how filter events are stored and
accessed.  But sure, I could try to contribute to any number of
related efforts.

> are not some bad side effect or quirk, they are all generic
> improvements we want in any case and not just for sandboxing.

I didn't say they were bad side effects or quirks.

> You might not be interested in all of those items, you are only
> interested in getting the narrow feature-set you are interested in,
> but you sure are interested in getting sandboxing versus not getting
> anything at all, right?

Unfortunately, that isn't the value proposition for me or many other
contributors.  The real question is whether I am interested in getting
sandboxing in with mainline or if I want to sign up to maintain the
patches out of tree until my hair falls out.

I would much prefer to have a solution that Linux users as a whole can
benefit from and not just a subset of users I affect, but it's not a
hostage situation. I was hoping to work toward a solution that met
needs in the near future while being able to continue to invest in
driving long term changes.  If all the other work is a prerequisite
for system call restriction, I'll be very lucky to see anything this
calendar year assuming I can even write the patches in that time.

> Not doing it right because "it's too much work", especially as the

I'm not averse to work, but I don't necessarily feel that the extra
work is justified.  I also have to deal with my own personal and work
time constraints.

> trivial 'proof of concept' prototype already gave us something very
> promising that worked to a fair degree:
>
>       bitmask (2009):  6 files changed,  194 insertions(+), 22 deletions(-)
>  filter engine (2010): 18 files changed, 1100 insertions(+), 21 deletions(-)
>  event filters (2011):  5 files changed,   82 insertions(+), 16 deletions(-)
>
> are pretty hollow arguments to me. That diffstat sums up my argument
> of proper structure pretty well.

I wrote that code so I know how hollow the diffstats are.  The 82
lines of code do not:
- convince perf maintainers to share per-task events with "event filter" code
- provide reduce-privilege-only semantics
- provide a clean ABI that doesn't stomp all over the perf ABI
- provide compat syscalls
- provide a rewrite of DEFINE_SYSCALL* to support non 'long' syscalls
- provide ptreg syscall support
- provide any sort of blocking guarantees for unhooked system call events
- ...


I'd like to be able to move along security for the platform today and
not in two years, but if my only chance of any form of this being
ACK'd is to write it such that it shares code with perf and has a
shiny new ABI, then I'll queue up the work for when I can start trying
to tackle it.

That said, I still feel that this patch series is the right thing to
do now - not just for my personal reasons but for the kernel too.

Thanks,
will

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v9 05/13] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-07-01 16:43                     ` Will Drewry
@ 2011-07-01 18:04                       ` Steven Rostedt
  2011-07-01 18:09                         ` Will Drewry
  2011-07-01 20:25                         ` Kees Cook
  2011-07-01 21:00                       ` Ingo Molnar
  1 sibling, 2 replies; 51+ messages in thread
From: Steven Rostedt @ 2011-07-01 18:04 UTC (permalink / raw)
  To: Will Drewry
  Cc: Ingo Molnar, James Morris, Chris Evans, linux-kernel,
	Linus Torvalds, djm, segoon, kees.cook, fweisbec, tglx,
	Randy Dunlap, linux-doc, Eric Paris, linux-security-module,
	ksummit-2011-discuss

On Fri, 2011-07-01 at 11:43 -0500, Will Drewry wrote:
> On Fri, Jul 1, 2011 at 11:10 AM, Ingo Molnar <mingo@elte.hu> wrote:

> I'd like to be able to move along security for the platform today and
> not in two years, but if my only chance of any form of this being
> ACK'd is to write it such that it shares code with perf and has a
> shiny new ABI, then I'll queue up the work for when I can start trying
> to tackle it.

As this seems to be dragging on, and does not look to be solved by
October, I would like to propose this topic for the Kernel Summit in
Prague. I believe all parties involved may be there, and if not, I will
push hard to get them there.

Email is not always the best medium for discussions. Face to face can
usually solve things much quicker.

-- Steve



^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v9 05/13] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-07-01 18:04                       ` Steven Rostedt
@ 2011-07-01 18:09                         ` Will Drewry
  2011-07-01 18:48                           ` Steven Rostedt
  2011-07-05  2:54                           ` [Ksummit-2011-discuss] " Eugene Teo
  2011-07-01 20:25                         ` Kees Cook
  1 sibling, 2 replies; 51+ messages in thread
From: Will Drewry @ 2011-07-01 18:09 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Ingo Molnar, James Morris, Chris Evans, linux-kernel,
	Linus Torvalds, djm, segoon, kees.cook, fweisbec, tglx,
	Randy Dunlap, linux-doc, Eric Paris, linux-security-module,
	ksummit-2011-discuss

On Fri, Jul 1, 2011 at 1:04 PM, Steven Rostedt <rostedt@goodmis.org> wrote:
> On Fri, 2011-07-01 at 11:43 -0500, Will Drewry wrote:
>> On Fri, Jul 1, 2011 at 11:10 AM, Ingo Molnar <mingo@elte.hu> wrote:
>
>> I'd like to be able to move along security for the platform today and
>> not in two years, but if my only chance of any form of this being
>> ACK'd is to write it such that it shares code with perf and has a
>> shiny new ABI, then I'll queue up the work for when I can start trying
>> to tackle it.
>
> As this seems to be dragging on, and does not look to be solved by
> October, I would like to propose this topic for the Kernel Summit in
> Prague. I believe all parties involved may be there, and if not, I will
> push hard to get them there.
>
> Email is not always the best median for discussions. Face to face can
> usually solve things much quicker.

I have a panel reserved on system/kernel hardening with Kees at the
Linux Security Summit as part of the LPC (thanks to the LSS
committee).  Will the relevant people be able to attend that in
September?

(Since the KS is invite-only, I'm obviously not on the list for it,
but I realize I'm just a bit player in this. Maybe some direction can
be resolved I can play catch up with.)

What's another couple months? :)

Thanks!
will

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v9 05/13] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-07-01 18:09                         ` Will Drewry
@ 2011-07-01 18:48                           ` Steven Rostedt
  2011-07-04  2:19                             ` James Morris
  2011-07-05  2:54                           ` [Ksummit-2011-discuss] " Eugene Teo
  1 sibling, 1 reply; 51+ messages in thread
From: Steven Rostedt @ 2011-07-01 18:48 UTC (permalink / raw)
  To: Will Drewry
  Cc: Ingo Molnar, James Morris, Chris Evans, linux-kernel,
	Linus Torvalds, djm, segoon, kees.cook, fweisbec, tglx,
	Randy Dunlap, linux-doc, Eric Paris, linux-security-module,
	ksummit-2011-discuss

On Fri, 2011-07-01 at 13:09 -0500, Will Drewry wrote:

> I have a panel reserved on system/kernel hardening with Kees at the
> Linux Security Summit as part of the LPC (thanks to the LSS
> committee).  Will the relevant people be able to attend that in
> September?

No, unfortunately most of the key players will not be there.

> 
> (Since the KS is invite-only, I'm obviously not on the list for it,

But I'm on the KS committee ;) I'll vote for you and I think I can talk
the others into accepting you. If this is an issue that can be solved
face to face, I do not think anyone would have a problem with you
coming.

-- Steve



^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v9 05/13] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-07-01 18:04                       ` Steven Rostedt
  2011-07-01 18:09                         ` Will Drewry
@ 2011-07-01 20:25                         ` Kees Cook
  2011-07-04 16:09                           ` [Ksummit-2011-discuss] " Greg KH
  1 sibling, 1 reply; 51+ messages in thread
From: Kees Cook @ 2011-07-01 20:25 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Will Drewry, Ingo Molnar, James Morris, Chris Evans,
	linux-kernel, Linus Torvalds, djm, segoon, fweisbec, tglx,
	Randy Dunlap, linux-doc, Eric Paris, linux-security-module,
	ksummit-2011-discuss

On Fri, Jul 01, 2011 at 02:04:08PM -0400, Steven Rostedt wrote:
> On Fri, 2011-07-01 at 11:43 -0500, Will Drewry wrote:
> > On Fri, Jul 1, 2011 at 11:10 AM, Ingo Molnar <mingo@elte.hu> wrote:
> 
> > I'd like to be able to move along security for the platform today and
> > not in two years, but if my only chance of any form of this being
> > ACK'd is to write it such that it shares code with perf and has a
> > shiny new ABI, then I'll queue up the work for when I can start trying
> > to tackle it.
> 
> As this seems to be dragging on, and does not look to be solved by
> October, I would like to propose this topic for the Kernel Summit in
> Prague. I believe all parties involved may be there, and if not, I will
> push hard to get them there.
> 
> Email is not always the best median for discussions. Face to face can
> usually solve things much quicker.

How about we put it in as-is and mark it experimental, and then folks
can discuss improvements to it in Oct after all the API users have had
a chance to play with it? Four months seems like a needless delay to me.
I respect the objections, but it doesn't seem to balance against the
demonstrated need for this feature when faced with a viable working patch
series.

-Kees

-- 
Kees Cook
Ubuntu Security Team

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v9 05/13] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-07-01 16:43                     ` Will Drewry
  2011-07-01 18:04                       ` Steven Rostedt
@ 2011-07-01 21:00                       ` Ingo Molnar
  2011-07-01 21:34                         ` Will Drewry
  1 sibling, 1 reply; 51+ messages in thread
From: Ingo Molnar @ 2011-07-01 21:00 UTC (permalink / raw)
  To: Will Drewry
  Cc: James Morris, Chris Evans, linux-kernel, Linus Torvalds, djm,
	segoon, kees.cook, rostedt, fweisbec, tglx, Randy Dunlap,
	linux-doc, Eric Paris, linux-security-module


* Will Drewry <wad@chromium.org> wrote:

> On Fri, Jul 1, 2011 at 11:10 AM, Ingo Molnar <mingo@elte.hu> wrote:
>
> > But that's exactly my point: i consider it the right way forward 
> > because it maximizes kernel utility in the long run.
> 
> Not if it never happens.  Which is what happened with the proposals 
> from Adam and from Eric.

Well, that's what my job is as a maintainer: to tell you if i dislike 
a solution and to suggest better solutions - and here this was really 
easy to do, as you already prototyped a solution i consider (far) 
superior.

I cannot force you to do it like that, but i do have to say 'no'.

> > are not some bad side effect or quirk, they are all generic 
> > improvements we want in any case and not just for sandboxing.
> 
> I didn't say they were bad side effects or quirks.

That's a pretty important point as well.

> > You might not be interested in all of those items, you are only 
> > interested in getting the narrow feature-set you are interested 
> > in, but you sure are interested in getting sandboxing versus not 
> > getting anything at all, right?
> 
> Unfortunately, that isn't the value proposition for me or many 
> other contributors.  The real question is whether I am interested 
> in getting sandboxing in with mainline or if I want to sign up to 
> maintain the patches out of tree until my hair falls out.

Well, if you implement the right solution then why should it stay out 
of tree?

If the code is clean and useful and resolves all the technical 
objections that were raised against it then i will certainly merge it 
- and i hope Linus will chime in if he finds the actual iteration 
unacceptable and the direction harmful, to save us all the trouble.

In the end the 'sandboxing' feature should be a few dozen lines at 
most - all the rest will just be shared infrastructure.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v9 05/13] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-07-01 21:00                       ` Ingo Molnar
@ 2011-07-01 21:34                         ` Will Drewry
  2011-07-05  9:50                           ` Ingo Molnar
  0 siblings, 1 reply; 51+ messages in thread
From: Will Drewry @ 2011-07-01 21:34 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: James Morris, Chris Evans, linux-kernel, Linus Torvalds, djm,
	segoon, kees.cook, rostedt, fweisbec, tglx, Randy Dunlap,
	linux-doc, Eric Paris, linux-security-module

On Fri, Jul 1, 2011 at 4:00 PM, Ingo Molnar <mingo@elte.hu> wrote:
>
> * Will Drewry <wad@chromium.org> wrote:
>
>> On Fri, Jul 1, 2011 at 11:10 AM, Ingo Molnar <mingo@elte.hu> wrote:
>>
>> > But that's exactly my point: i consider it the right way forward
>> > because it maximizes kernel utility in the long run.
>>
>> Not if it never happens.  Which is what happened with the proposals
>> from Adam and from Eric.
>
> Well, that's what my job is as a maintainer: to tell you if i dislike
> a solution and to suggest better solutions - and here this was really
> easy to do, as you already prototyped a solution i consider (far)
> superior.

But other maintainers disagreed.  So as a non-maintainer, I get to
guess as to what happens next.

> I cannot force you to do it like that, but i do have to say 'no'.

It seems like a catch-22.  There's not a perfectly clear path forward,
and anything that looks like the perf-style proof of concept will be
NACK'd by other maintainers.  While I believe we could lift perf up
off its foundation and create a shared location for storing perf
events and ftrace events so that they will be inherited the same way
(currently nack'd by linus) and walked the same way (kinda), the
syscall interface couldn't currently be shared (also nack'd by perf),
and creating a new one modeled on the perf one is possible, but it's
also unclear what the ABI should be for a generic filtering system.
As I mentioned in a prior mail, syscalls should be whitelist only
(default fail), but random function call points in the kernel are more
likely to be fail-open.  And as the policy mechanisms get more
complicated, so does the code that needs to manage them.  I don't know
what the specification would look like or how it should look.  (For
instance, I abused a seccomp mode for the poc to change the match
behavior for syscall events. I imagine that could be folded into the
event handler for the proposed system, but I don't know what we'd do
for any other event subsystems.)

>> > are not some bad side effect or quirk, they are all generic
>> > improvements we want in any case and not just for sandboxing.
>>
>> I didn't say they were bad side effects or quirks.
>
> That's a pretty important point as well.

Well I have very explicit concerns that (new) system calls will slip
through the cracks forever and then a non-system call focused system
for reducing the kernel attack surface will always be incomplete.  The
biggest benefit of creating a highly targeted patch series that
focuses on the kernel syscall ABI is that future additions won't be
able to slip through the cracks without major system-level changes.
Perhaps this is why I am nervous about a generic solution.  I find
that sometimes the specialized solution is the best one even when
there is some additional cost.

>> > You might not be interested in all of those items, you are only
>> > interested in getting the narrow feature-set you are interested
>> > in, but you sure are interested in getting sandboxing versus not
>> > getting anything at all, right?
>>
>> Unfortunately, that isn't the value proposition for me or many
>> other contributors.  The real question is whether I am interested
>> in getting sandboxing in with mainline or if I want to sign up to
>> maintain the patches out of tree until my hair falls out.
>
> Well, if you implement the right solution then why should it stay out
> of tree?

"right" is subjective -- as always.  We each have different value
judgements we make.  Specialized versus generalized solutions.  Shared
code versus duplicated code.  Iterative improvements versus giant
overhauls, etc.   I was just responding to the fact I interpreted the
language in your mail more like code-writing-extortion than working
towards a better Linux kernel.  Perhaps I was reading it too harshly.

> If the code is clean and useful and resolves all the technical
> objections that were raised against it then i will certainly merge it
> - and i hope Linus will chime in if he finds the actual iteration
> unacceptable and the direction harmful, to save us all the trouble.

Any guidance here would certainly be welcomed.  I had thought that
doing a larger more complicated solution had been shot down along with
supporting inheritance or other generic features.  Instead, this patch
series is focused (attack surface reduction) and process-tied so that
it could be iteratively improved and moved toward a longer term
shared-infrastructure goal without pulling in all the overhead and
design complexity right away.

I had hoped the current iteration would have been satisfactory from
both a usefulness and a technical quality perspective to potential API
consumers and maintainers alike.

> In the end the 'sandboxing' feature should be a few dozen lines at
> most - all the rest will just be shared infrastructure.

Anytime a powerful feature can be a few lines of code, it's a good
thing.  It seems like we're still a ways away from defining what the
shared infrastructure is that would allow a few dozen lines of code to
be enough.  The bones are there, but there's a large amount of missing
and under-designed work.

Thanks,
will

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v9 05/13] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-07-01 18:48                           ` Steven Rostedt
@ 2011-07-04  2:19                             ` James Morris
  2011-07-05 12:40                               ` Steven Rostedt
  2011-07-05 23:56                               ` Steven Rostedt
  0 siblings, 2 replies; 51+ messages in thread
From: James Morris @ 2011-07-04  2:19 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Will Drewry, Ingo Molnar, Chris Evans, linux-kernel,
	Linus Torvalds, djm, segoon, kees.cook, fweisbec, tglx,
	Randy Dunlap, linux-doc, Eric Paris, linux-security-module,
	ksummit-2011-discuss

On Fri, 1 Jul 2011, Steven Rostedt wrote:

> On Fri, 2011-07-01 at 13:09 -0500, Will Drewry wrote:
> 
> > I have a panel reserved on system/kernel hardening with Kees at the
> > Linux Security Summit as part of the LPC (thanks to the LSS
> > committee).  Will the relevant people be able to attend that in
> > September?
> 
> No, unfortunately most of the key players will not be there.

Hold on, do we know this for sure?  And that they will all definitely be 
at KS?  I'd really prefer to see this resolved via email, it's more 
scalable, transparent and timely.

In any case, Plumbers and LSS should have some of the key people.

Ingo & Linus: will you be there?



- James
-- 
James Morris
<jmorris@namei.org>

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [Ksummit-2011-discuss] [PATCH v9 05/13] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-07-01 20:25                         ` Kees Cook
@ 2011-07-04 16:09                           ` Greg KH
  0 siblings, 0 replies; 51+ messages in thread
From: Greg KH @ 2011-07-04 16:09 UTC (permalink / raw)
  To: Kees Cook
  Cc: Steven Rostedt, linux-security-module, ksummit-2011-discuss,
	Will Drewry, linux-doc, fweisbec, Chris Evans, djm, James Morris,
	Eric Paris, linux-kernel, Randy Dunlap, tglx, Ingo Molnar,
	Linus Torvalds, segoon

On Fri, Jul 01, 2011 at 01:25:38PM -0700, Kees Cook wrote:
> On Fri, Jul 01, 2011 at 02:04:08PM -0400, Steven Rostedt wrote:
> > On Fri, 2011-07-01 at 11:43 -0500, Will Drewry wrote:
> > > On Fri, Jul 1, 2011 at 11:10 AM, Ingo Molnar <mingo@elte.hu> wrote:
> > 
> > > I'd like to be able to move along security for the platform today and
> > > not in two years, but if my only chance of any form of this being
> > > ACK'd is to write it such that it shares code with perf and has a
> > > shiny new ABI, then I'll queue up the work for when I can start trying
> > > to tackle it.
> > 
> > As this seems to be dragging on, and does not look to be solved by
> > October, I would like to propose this topic for the Kernel Summit in
> > Prague. I believe all parties involved may be there, and if not, I will
> > push hard to get them there.
> > 
> > Email is not always the best median for discussions. Face to face can
> > usually solve things much quicker.
> 
> How about we put it in as-is and mark it experimental, and then folks
> can discuss improvements to it in Oct after all the API users have had
> a chance to play with it? Four months seems like a needless delay to me.
> I respect the objections, but it doesn't seem to balance against the
> demonstrated need for this feature when faced with a viable working patch
> series.

We can't add something and then remove it if userspace programs are
depending on it, sorry.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [Ksummit-2011-discuss] [PATCH v9 05/13] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-07-01 18:09                         ` Will Drewry
  2011-07-01 18:48                           ` Steven Rostedt
@ 2011-07-05  2:54                           ` Eugene Teo
  1 sibling, 0 replies; 51+ messages in thread
From: Eugene Teo @ 2011-07-05  2:54 UTC (permalink / raw)
  To: Will Drewry
  Cc: Steven Rostedt, linux-security-module, ksummit-2011-discuss,
	linux-doc, fweisbec, Chris Evans, djm, James Morris, Eric Paris,
	linux-kernel, Randy Dunlap, kees.cook, tglx, Ingo Molnar,
	Linus Torvalds, segoon

On Sat, Jul 2, 2011 at 2:09 AM, Will Drewry <wad@chromium.org> wrote:
> On Fri, Jul 1, 2011 at 1:04 PM, Steven Rostedt <rostedt@goodmis.org> wrote:
>> On Fri, 2011-07-01 at 11:43 -0500, Will Drewry wrote:
>>> On Fri, Jul 1, 2011 at 11:10 AM, Ingo Molnar <mingo@elte.hu> wrote:
>>
>>> I'd like to be able to move along security for the platform today and
>>> not in two years, but if my only chance of any form of this being
>>> ACK'd is to write it such that it shares code with perf and has a
>>> shiny new ABI, then I'll queue up the work for when I can start trying
>>> to tackle it.
>>
>> As this seems to be dragging on, and does not look to be solved by
>> October, I would like to propose this topic for the Kernel Summit in
>> Prague. I believe all parties involved may be there, and if not, I will
>> push hard to get them there.
>>
>> Email is not always the best median for discussions. Face to face can
>> usually solve things much quicker.
>
> I have a panel reserved on system/kernel hardening with Kees at the
> Linux Security Summit as part of the LPC (thanks to the LSS
> committee).  Will the relevant people be able to attend that in
> September?
>
> (Since the KS is invite-only, I'm obviously not on the list for it,
> but I realize I'm just a bit player in this. Maybe some direction can
> be resolved I can play catch up with.)
>
> What's another couple months? :)

I have been involved in mostly the reactive part of security for the
past few years as far as the Linux kernel is concerned, but I am
interested to participate in this at KS if a session is planned. Just
a note, Red Hat is unable to fly me to any summit or conference so I
will need to find some ways to fund my travel. I would love to though.

Thanks, Eugene

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v9 05/13] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-07-01 21:34                         ` Will Drewry
@ 2011-07-05  9:50                           ` Ingo Molnar
  2011-07-06 18:24                             ` Will Drewry
  0 siblings, 1 reply; 51+ messages in thread
From: Ingo Molnar @ 2011-07-05  9:50 UTC (permalink / raw)
  To: Will Drewry
  Cc: James Morris, Chris Evans, linux-kernel, Linus Torvalds, djm,
	segoon, kees.cook, rostedt, fweisbec, tglx, Randy Dunlap,
	linux-doc, Eric Paris, linux-security-module


* Will Drewry <wad@chromium.org> wrote:

> > In the end the 'sandboxing' feature should be a few dozen lines 
> > at most - all the rest will just be shared infrastructure.
> 
> Anytime a powerful feature can be a few lines of code, it's a good 
> thing.  It seems like we're still a ways away from defining what 
> the shared infrastructure is that would allow a few dozen lines of 
> code to be enough.  The bones are there, but there's a large amount 
> of missing and under-designed work.

But adding some intermediate solution with its own ABI and its own 
forked specializations hinders (and might even prevent, if it's "good 
enough") the proper solution of this topic.

It's not like such features are in super-high demand so we *want* and 
*need* as much generalization and unification as possible, to utilize 
economies of scale and such.

There's really just two ways forward that i can see (in terms of this 
going upstream via the events/tracing/instrumentation tree that i 
co-maintain):

 1) Do it properly generalized - as shown by the prototype patch.
    I can give you all help that is needed for that: we can host
    intermediate stages in -tip and we can push upstream step by
    step. You won't have to maintain some large in-limbo set of
    patches. 95% of the work you've identified will be warmly
    welcome by everyone and will be utilized well beyond sandboxing! 
    That's not a bad starting position to get something controversial 
    upstream: most of the crazy trees are 95% crazy.

 2) Give a compelling list of technical reasons why the
    generalization is not desirable and thus go for a minimally
    invasive solution

Option #2 does not apply because you've yourself stated that the 
generalizations make a ton of sense (and even if you didn't state it 
i'd make that point).

The option you seem to have opted for:

 3) do it in a half-ways and limited fashion due to time constraints 
    and perceived upstream resistance

is not something that was a winning model in the past so i'm not 
really interested in that.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v9 05/13] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-07-04  2:19                             ` James Morris
@ 2011-07-05 12:40                               ` Steven Rostedt
  2011-07-05 23:46                                 ` James Morris
  2011-07-05 23:56                               ` Steven Rostedt
  1 sibling, 1 reply; 51+ messages in thread
From: Steven Rostedt @ 2011-07-05 12:40 UTC (permalink / raw)
  To: James Morris
  Cc: Will Drewry, Ingo Molnar, Chris Evans, linux-kernel,
	Linus Torvalds, djm, segoon, kees.cook, fweisbec, tglx,
	Randy Dunlap, linux-doc, Eric Paris, linux-security-module,
	ksummit-2011-discuss

On Mon, 2011-07-04 at 12:19 +1000, James Morris wrote:
> On Fri, 1 Jul 2011, Steven Rostedt wrote:
> 
> > On Fri, 2011-07-01 at 13:09 -0500, Will Drewry wrote:
> > 
> > > I have a panel reserved on system/kernel hardening with Kees at the
> > > Linux Security Summit as part of the LPC (thanks to the LSS
> > > committee).  Will the relevant people be able to attend that in
> > > September?
> > 
> > No, unfortunately most of the key players will not be there.
> 
> Hold on, do we know this for sure?  And that they will all definitely be 
> at KS?  I'd really prefer to see this resolved via email, it's more 
> scalable, transparent and timely.
> 
> In any case, Plumbers and LSS should have some of the key people.
> 
> Ingo & Linus: will you be there?

I don't know about Linus, but as I was trying to find out who would be
interested in a tracing track at Plumbers, Peter Zijlstra, Thomas
Gleixner, and Ingo, all told me that they would not be able to make it.

-- Steve



^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v9 05/13] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-07-01 13:07               ` Ingo Molnar
  2011-07-01 15:46                 ` Will Drewry
@ 2011-07-05 15:26                 ` Vasiliy Kulikov
  1 sibling, 0 replies; 51+ messages in thread
From: Vasiliy Kulikov @ 2011-07-05 15:26 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Will Drewry, James Morris, Chris Evans, linux-kernel,
	Linus Torvalds, djm, kees.cook, rostedt, fweisbec, tglx,
	Randy Dunlap, linux-doc, Eric Paris, linux-security-module

Hi Ingo,

On Fri, Jul 01, 2011 at 15:07 +0200, Ingo Molnar wrote:
> Have you addressed my basic objection of why we should go for a more 
> complex and less capable variant over a shared yet more capable 
> facility:
> 
>   http://lkml.kernel.org/r/20110526091518.GE26775@elte.hu
> 
> ?
> 
> You are pushing the 'filter engine' approach currently, not the 
> (much) more unified 'event filters' approach.

I'm sorry if my question was already addressed - I wasn't CC'ed to the
first series of the patch, I read some emails from lkml.org, so I might
miss something.

You're proposing a more generic solution than a more limited syscalls
filtering thing.  AFAIU, you're trying to merge different security
models like syscalls filtering, defining object policies, hardening,
etc.  This would be something like a generic framework, which
will be used by specific models like syscalls filtering:

http://marc.info/?l=linux-kernel&m=130639834930092&w=2

I'm worried about one thing, one of important properties of all
*generic* things: what disadvantages would this framework have?  The
generic filtering engine might be:

1) Too slow if it can filter everything and supersets almost every
policy.

2) Too complicated in the sense that it exposes too much attack surface.  The latest
patch is rather simple to audit, but generic filters, as every generic
thing, is hard to properly audit.  Will mentioned that tracepoints code
is currently a bit complicated for this sort of things.  It is a
subjective point, but anyway.

3) Too complicated in the sense of API/usage.  E.g. current SELinux policies
are too hard to configure properly for most users, so plenty of users
just use the predefined policies coming with distributions.  Would
the "superset" be more complicated?

4) Not "generic" enough.  Some awfully cool new security model invented
in 5 years will need some thing that cannot be added to the current
model without changing ABI or will need the full framework refactoring.
This would lead to reimplementing some already existing filters, but in
sense of this model desires.

(I don't claim that event filtering will end with these issues, surely
no, I'm merely asking about possible problems.)


On the contrary, the syscalls sandboxing clearly has potential users
like QEMU and daemons with privilege separation architecture, similar
(but weaker) model with namespaces separation is proven to work just
well.  The set of users is more or less understandable.  The
unprivileged process has a clearly defined set of permitted operations,
which can be enforced by syscalls inhibitions.  The code is simple
enough to audit.

Yes, this model has a limited set of users, but - hey! - every security
model with the same mission will be used only by restricted set of users.


Another perhaps naive question - how long should it take to bring the event
filtering code to the state when it is already useful at least for the
complete syscalls filtering?  (Will posted a list of issues that were
not addressed in his PoC because of proper API lacking.)  Wouldn't the
time frame be too large to spawn numerous users of not yet complete
solution, leading to stalling with an incomplete/nonscalable ABI, which
would be worse than any particular speciality models?


Please don't take my words as an attack to your model, but an attempt to
clarify dark spots, which might be a real barrier of your mutual
understanding.


Thanks,

-- 
Vasiliy Kulikov
http://www.openwall.com - bringing security into open computing environments

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v9 05/13] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-07-05 12:40                               ` Steven Rostedt
@ 2011-07-05 23:46                                 ` James Morris
  2011-07-06  0:37                                   ` [Ksummit-2011-discuss] " Ted Ts'o
  0 siblings, 1 reply; 51+ messages in thread
From: James Morris @ 2011-07-05 23:46 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Will Drewry, Ingo Molnar, Chris Evans, linux-kernel,
	Linus Torvalds, djm, segoon, kees.cook, fweisbec, tglx,
	Randy Dunlap, linux-doc, Eric Paris, linux-security-module,
	ksummit-2011-discuss

On Tue, 5 Jul 2011, Steven Rostedt wrote:

> I don't know about Linus, but as I was trying to find out who would be
> interested in a tracing track at Plumbers, Peter Zijlstra, Thomas
> Gleixner, and Ingo, all told me that they would not be able to make it.

Damn.  Bring on (or back) the single developer event...


- James
-- 
James Morris
<jmorris@namei.org>

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v9 05/13] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-07-04  2:19                             ` James Morris
  2011-07-05 12:40                               ` Steven Rostedt
@ 2011-07-05 23:56                               ` Steven Rostedt
  1 sibling, 0 replies; 51+ messages in thread
From: Steven Rostedt @ 2011-07-05 23:56 UTC (permalink / raw)
  To: James Morris
  Cc: Will Drewry, Ingo Molnar, Chris Evans, linux-kernel,
	Linus Torvalds, djm, segoon, kees.cook, fweisbec, tglx,
	Randy Dunlap, linux-doc, Eric Paris, linux-security-module,
	ksummit-2011-discuss

On Mon, 2011-07-04 at 12:19 +1000, James Morris wrote:

>   I'd really prefer to see this resolved via email, it's more 
> scalable, transparent and timely.

But unfortunately it's not always as effective as face to face. I've
seen other disagreements solved face to face that never get solved via
email. It's the fact that we are still made of this carbon substance,
and have yet to convert our mobile units into silicon. Although, some
have implanted themselves with silicon, but I doubt it has anything to
do with becoming more logical.

-- Steve



^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [Ksummit-2011-discuss] [PATCH v9 05/13] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-07-05 23:46                                 ` James Morris
@ 2011-07-06  0:37                                   ` Ted Ts'o
  0 siblings, 0 replies; 51+ messages in thread
From: Ted Ts'o @ 2011-07-06  0:37 UTC (permalink / raw)
  To: James Morris
  Cc: Steven Rostedt, linux-security-module, ksummit-2011-discuss,
	Will Drewry, linux-doc, fweisbec, Chris Evans, djm, linux-kernel,
	Eric Paris, Randy Dunlap, kees.cook, tglx, Ingo Molnar,
	Linus Torvalds, segoon

On Wed, Jul 06, 2011 at 09:46:36AM +1000, James Morris wrote:
> On Tue, 5 Jul 2011, Steven Rostedt wrote:
> 
> > I don't know about Linus, but as I was trying to find out who would be
> > interested in a tracing track at Plumbers, Peter Zijlstra, Thomas
> > Gleixner, and Ingo, all told me that they would not be able to make it.
> 
> Damn.  Bring on (or back) the single developer event...

I tried telling you that people were complaining about needing to fly
to different cities for Plumbers and Kernel Summit, and you didn't
believe me.....  :-)

					- Ted

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v9 05/13] seccomp_filter: Document what seccomp_filter is and how it works.
  2011-07-05  9:50                           ` Ingo Molnar
@ 2011-07-06 18:24                             ` Will Drewry
  0 siblings, 0 replies; 51+ messages in thread
From: Will Drewry @ 2011-07-06 18:24 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: James Morris, Chris Evans, linux-kernel, Linus Torvalds, djm,
	segoon, kees.cook, rostedt, fweisbec, tglx, Randy Dunlap,
	linux-doc, Eric Paris, linux-security-module

On Tue, Jul 5, 2011 at 4:50 AM, Ingo Molnar <mingo@elte.hu> wrote:
>
> * Will Drewry <wad@chromium.org> wrote:
>
>> > In the end the 'sandboxing' feature should be a few dozen lines
>> > at most - all the rest will just be shared infrastructure.
>>
>> Anytime a powerful feature can be a few lines of code, it's a good
>> thing.  It seems like we're still a ways away from defining what
>> the shared infrastructure is that would allow a few dozen lines of
>> code to be enough.  The bones are there, but there's a large amount
>> of missing and under-designed work.
>
> But adding some intermediate solution with its own ABI and its own
> forked specializations hinders (and might even prevent, if it's "good
> enough") the proper solution of this topic.
>
> It's not like such features are in super-high demand so we *want* and
> *need* as much generalization and unification as possible, to utilize
> economies of scale and such.
>
> There's really just two ways forward that i can see (in terms of this
> going upstream via the events/tracing/instrumentation tree that i
> co-maintain):
>
>  1) Do it properly generalized - as shown by the prototype patch.
>    I can give you all help that is needed for that: we can host
>    intermediate stages in -tip and we can push upstream step by
>    step. You won't have to maintain some large in-limbo set of
>    patches. 95% of the work you've identified will be warmly
>    welcome by everyone and will be utilized well beyond sandboxing!
>    That's not a bad starting position to get something controversial
>    upstream: most of the crazy trees are 95% crazy.
>
>  2) Give a compelling list of technical reasons why the
>    generalization is not desirable and thus go for a minimally
>    invasive solution
>
> Option #2 does not apply because you've yourself stated that the
> generalizations make a ton of sense (and even if you didnt state it
> i'd make that point).
>
> The option you seem to have opted for:
>
>  3) do it in a half-ways and limited fashion due to time constraints
>    and perceived upstream resistance
>
> is not something that was a winning model in the past so i'm not
> really interested in that.


My connectivity is a bit limiting right now, but I'll go think through
what we've discussed in these threads and what you propose here and
see if I can chart out a path that makes sense to me, or if there are
pieces I just can't resolve.  I'll follow up soon-ish.

My main concerns at this point center around making sure that whatever
solution results still implements the original goal of the patch set
-- to reduce the attack surface of the kernel in a reliable way.
Having a robust plan will help there, I hope, and give others
something to provide constructive feedback on.  Also, I'd much prefer
to be making progress on this sooner rather than waiting until October
just to figure out where to begin.  As is, I'm not entirely sold that
perf/ftrace event ids are the right interface*, but it does make sense
to explore it more thoroughly given their use in the event analysis in
this patch series.  Most likely, I'll be able to resolve my own
objection :)

thanks,
will


*  For instance, I can compile a filter set with __NR_unistd as the id
in the current patch series and have that compiled code work when run
without traversing a debugfs mount point.  That will just be another
area to be addressed in whatever the next solution is.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v9 11/13] powerpc: select HAVE_SECCOMP_FILTER and provide seccomp_execve
  2011-06-24  0:36   ` Will Drewry
@ 2011-08-30  5:28     ` Benjamin Herrenschmidt
  -1 siblings, 0 replies; 51+ messages in thread
From: Benjamin Herrenschmidt @ 2011-08-30  5:28 UTC (permalink / raw)
  To: Will Drewry
  Cc: linux-kernel, torvalds, djm, segoon, kees.cook, mingo, rostedt,
	jmorris, fweisbec, tglx, scarybeasts, Paul Mackerras,
	linuxppc-dev

On Thu, 2011-06-23 at 19:36 -0500, Will Drewry wrote:
> Facilitate the use of CONFIG_SECCOMP_FILTER by wrapping compatibility
> system call numbering for execve and selecting HAVE_SECCOMP_FILTER.
> 
> v9: rebase on to bccaeafd7c117acee36e90d37c7e05c19be9e7bf
> 
> Signed-off-by: Will Drewry <wad@chromium.org>

Seen these around for a while ... :-)

I don't see a harm in the patches per-se tho I haven't reviewed the
actual seccomp filter stuff and its good (or bad) behaviour on ppc.

Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

Cheers,
Ben.

> ---
>  arch/powerpc/Kconfig               |    1 +
>  arch/powerpc/include/asm/seccomp.h |    2 ++
>  2 files changed, 3 insertions(+), 0 deletions(-)
> 
> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
> index 2729c66..030d392 100644
> --- a/arch/powerpc/Kconfig
> +++ b/arch/powerpc/Kconfig
> @@ -129,6 +129,7 @@ config PPC
>  	select HAVE_HW_BREAKPOINT if PERF_EVENTS && PPC_BOOK3S_64
>  	select HAVE_GENERIC_HARDIRQS
>  	select HAVE_SPARSE_IRQ
> +	select HAVE_SECCOMP_FILTER
>  	select IRQ_PER_CPU
>  	select GENERIC_IRQ_SHOW
>  	select GENERIC_IRQ_SHOW_LEVEL
> diff --git a/arch/powerpc/include/asm/seccomp.h b/arch/powerpc/include/asm/seccomp.h
> index 00c1d91..3cb9cc1 100644
> --- a/arch/powerpc/include/asm/seccomp.h
> +++ b/arch/powerpc/include/asm/seccomp.h
> @@ -7,10 +7,12 @@
>  #define __NR_seccomp_write __NR_write
>  #define __NR_seccomp_exit __NR_exit
>  #define __NR_seccomp_sigreturn __NR_rt_sigreturn
> +#define __NR_seccomp_execve __NR_execve
>  
>  #define __NR_seccomp_read_32 __NR_read
>  #define __NR_seccomp_write_32 __NR_write
>  #define __NR_seccomp_exit_32 __NR_exit
>  #define __NR_seccomp_sigreturn_32 __NR_sigreturn
> +#define __NR_seccomp_execve_32 __NR_execve
>  
>  #endif	/* _ASM_POWERPC_SECCOMP_H */



^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v9 11/13] powerpc: select HAVE_SECCOMP_FILTER and provide seccomp_execve
@ 2011-08-30  5:28     ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 51+ messages in thread
From: Benjamin Herrenschmidt @ 2011-08-30  5:28 UTC (permalink / raw)
  To: Will Drewry
  Cc: linuxppc-dev, fweisbec, scarybeasts, djm, linux-kernel, rostedt,
	segoon, tglx, Paul Mackerras, kees.cook, jmorris, torvalds,
	mingo

On Thu, 2011-06-23 at 19:36 -0500, Will Drewry wrote:
> Facilitate the use of CONFIG_SECCOMP_FILTER by wrapping compatibility
> system call numbering for execve and selecting HAVE_SECCOMP_FILTER.
> 
> v9: rebase on to bccaeafd7c117acee36e90d37c7e05c19be9e7bf
> 
> Signed-off-by: Will Drewry <wad@chromium.org>

Seen these around for a while ... :-)

I don't see a harm in the patches per-se tho I haven't reviewed the
actual seccomp filter stuff and its good (or bad) behaviour on ppc.

Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

Cheers,
Ben.

> ---
>  arch/powerpc/Kconfig               |    1 +
>  arch/powerpc/include/asm/seccomp.h |    2 ++
>  2 files changed, 3 insertions(+), 0 deletions(-)
> 
> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
> index 2729c66..030d392 100644
> --- a/arch/powerpc/Kconfig
> +++ b/arch/powerpc/Kconfig
> @@ -129,6 +129,7 @@ config PPC
>  	select HAVE_HW_BREAKPOINT if PERF_EVENTS && PPC_BOOK3S_64
>  	select HAVE_GENERIC_HARDIRQS
>  	select HAVE_SPARSE_IRQ
> +	select HAVE_SECCOMP_FILTER
>  	select IRQ_PER_CPU
>  	select GENERIC_IRQ_SHOW
>  	select GENERIC_IRQ_SHOW_LEVEL
> diff --git a/arch/powerpc/include/asm/seccomp.h b/arch/powerpc/include/asm/seccomp.h
> index 00c1d91..3cb9cc1 100644
> --- a/arch/powerpc/include/asm/seccomp.h
> +++ b/arch/powerpc/include/asm/seccomp.h
> @@ -7,10 +7,12 @@
>  #define __NR_seccomp_write __NR_write
>  #define __NR_seccomp_exit __NR_exit
>  #define __NR_seccomp_sigreturn __NR_rt_sigreturn
> +#define __NR_seccomp_execve __NR_execve
>  
>  #define __NR_seccomp_read_32 __NR_read
>  #define __NR_seccomp_write_32 __NR_write
>  #define __NR_seccomp_exit_32 __NR_exit
>  #define __NR_seccomp_sigreturn_32 __NR_sigreturn
> +#define __NR_seccomp_execve_32 __NR_execve
>  
>  #endif	/* _ASM_POWERPC_SECCOMP_H */

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v9 11/13] powerpc: select HAVE_SECCOMP_FILTER and provide seccomp_execve
  2011-08-30  5:28     ` Benjamin Herrenschmidt
@ 2011-11-28  0:14       ` Benjamin Herrenschmidt
  -1 siblings, 0 replies; 51+ messages in thread
From: Benjamin Herrenschmidt @ 2011-11-28  0:14 UTC (permalink / raw)
  To: Will Drewry
  Cc: linuxppc-dev, fweisbec, scarybeasts, djm, linux-kernel, rostedt,
	segoon, tglx, Paul Mackerras, kees.cook, jmorris, torvalds,
	mingo

On Tue, 2011-08-30 at 15:28 +1000, Benjamin Herrenschmidt wrote:
> On Thu, 2011-06-23 at 19:36 -0500, Will Drewry wrote:
> > Facilitate the use of CONFIG_SECCOMP_FILTER by wrapping compatibility
> > system call numbering for execve and selecting HAVE_SECCOMP_FILTER.
> > 
> > v9: rebase on to bccaeafd7c117acee36e90d37c7e05c19be9e7bf
> > 
> > Signed-off-by: Will Drewry <wad@chromium.org>
> 
> Seen these around for a while ... :-)
> 
> I don't see a harm in the patches per-se tho I haven't reviewed the
> actual seccomp filter stuff and its good (or bad) behaviour on ppc.

Did that stuff ever get anywhere ? I don't see HAVE_SECCOMP_FILTER
upstream ... should I just drop the powerpc patch from patchwork ?

Cheers,
Ben.



^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v9 11/13] powerpc: select HAVE_SECCOMP_FILTER and provide seccomp_execve
@ 2011-11-28  0:14       ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 51+ messages in thread
From: Benjamin Herrenschmidt @ 2011-11-28  0:14 UTC (permalink / raw)
  To: Will Drewry
  Cc: fweisbec, scarybeasts, djm, linux-kernel, rostedt, jmorris,
	torvalds, kees.cook, Paul Mackerras, mingo, tglx, linuxppc-dev,
	segoon

On Tue, 2011-08-30 at 15:28 +1000, Benjamin Herrenschmidt wrote:
> On Thu, 2011-06-23 at 19:36 -0500, Will Drewry wrote:
> > Facilitate the use of CONFIG_SECCOMP_FILTER by wrapping compatibility
> > system call numbering for execve and selecting HAVE_SECCOMP_FILTER.
> > 
> > v9: rebase on to bccaeafd7c117acee36e90d37c7e05c19be9e7bf
> > 
> > Signed-off-by: Will Drewry <wad@chromium.org>
> 
> Seen these around for a while ... :-)
> 
> I don't see a harm in the patches per-se tho I haven't reviewed the
> actual seccomp filter stuff and its good (or bad) behaviour on ppc.

Did that stuff ever get anywhere ? I don't see HAVE_SECCOMP_FILTER
upstream ... should I just drop the powerpc patch from patchwork ?

Cheers,
Ben.

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v9 11/13] powerpc: select HAVE_SECCOMP_FILTER and provide seccomp_execve
  2011-11-28  0:14       ` Benjamin Herrenschmidt
@ 2011-11-28  1:45         ` Will Drewry
  -1 siblings, 0 replies; 51+ messages in thread
From: Will Drewry @ 2011-11-28  1:45 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: linuxppc-dev, fweisbec, scarybeasts, djm, linux-kernel, rostedt,
	segoon, tglx, Paul Mackerras, kees.cook, jmorris, torvalds,
	mingo

On Sun, Nov 27, 2011 at 6:14 PM, Benjamin Herrenschmidt
<benh@kernel.crashing.org> wrote:
> On Tue, 2011-08-30 at 15:28 +1000, Benjamin Herrenschmidt wrote:
>> On Thu, 2011-06-23 at 19:36 -0500, Will Drewry wrote:
>> > Facilitate the use of CONFIG_SECCOMP_FILTER by wrapping compatibility
>> > system call numbering for execve and selecting HAVE_SECCOMP_FILTER.
>> >
>> > v9: rebase on to bccaeafd7c117acee36e90d37c7e05c19be9e7bf
>> >
>> > Signed-off-by: Will Drewry <wad@chromium.org>
>>
>> Seen these around for a while ... :-)
>>
>> I don't see a harm in the patches per-se tho I haven't reviewed the
>> actual seccomp filter stuff and its good (or bad) behaviour on ppc.
>
> Did that stuff ever get anywhere ? I don't see HAVE_SECCOMP_FILTER
> upstream ... should I just drop the powerpc patch from patchwork ?

Thanks for following up! At present, it stalled out, so I think it'd
make sense to just drop it.  I will repost the series soon-ish and see
if any progress can be made.  [I've explored a number of alternative
approaches and still think something along the seccomp_filter lines is
rational.]

cheers!
will

^ permalink raw reply	[flat|nested] 51+ messages in thread

* Re: [PATCH v9 11/13] powerpc: select HAVE_SECCOMP_FILTER and provide seccomp_execve
@ 2011-11-28  1:45         ` Will Drewry
  0 siblings, 0 replies; 51+ messages in thread
From: Will Drewry @ 2011-11-28  1:45 UTC (permalink / raw)
  To: Benjamin Herrenschmidt
  Cc: fweisbec, scarybeasts, djm, linux-kernel, rostedt, jmorris,
	torvalds, kees.cook, Paul Mackerras, mingo, tglx, linuxppc-dev,
	segoon

On Sun, Nov 27, 2011 at 6:14 PM, Benjamin Herrenschmidt
<benh@kernel.crashing.org> wrote:
> On Tue, 2011-08-30 at 15:28 +1000, Benjamin Herrenschmidt wrote:
>> On Thu, 2011-06-23 at 19:36 -0500, Will Drewry wrote:
>> > Facilitate the use of CONFIG_SECCOMP_FILTER by wrapping compatibility
>> > system call numbering for execve and selecting HAVE_SECCOMP_FILTER.
>> >
>> > v9: rebase on to bccaeafd7c117acee36e90d37c7e05c19be9e7bf
>> >
>> > Signed-off-by: Will Drewry <wad@chromium.org>
>>
>> Seen these around for a while ... :-)
>>
>> I don't see a harm in the patches per-se tho I haven't reviewed the
>> actual seccomp filter stuff and its good (or bad) behaviour on ppc.
>
> Did that stuff ever get anywhere ? I don't see HAVE_SECCOMP_FILTER
> upstream ... should I just drop the powerpc patch from patchwork ?

Thanks for following up! At present, it stalled out, so I think it'd
make sense to just drop it.  I will repost the series soon-ish and see
if any progress can be made.  [I've explored a number of alternative
approaches and still think something along the seccomp_filter lines is
rational.]

cheers!
will

^ permalink raw reply	[flat|nested] 51+ messages in thread

end of thread, other threads:[~2011-11-28  1:45 UTC | newest]

Thread overview: 51+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-06-24  0:36 [PATCH v9 01/13] tracing: split out filter initialization and clean up uses Will Drewry
2011-06-24  0:36 ` [PATCH v9 02/13] tracing: split out syscall_trace_enter construction Will Drewry
2011-06-24  0:36 ` [PATCH v9 03/13] seccomp_filter: new mode with configurable syscall filters Will Drewry
2011-06-24  7:30   ` Damien Miller
2011-06-24 20:20   ` Kees Cook
2011-06-24  0:36 ` [PATCH v9 04/13] seccomp_filter: add process state reporting Will Drewry
2011-06-24  0:36 ` [PATCH v9 05/13] seccomp_filter: Document what seccomp_filter is and how it works Will Drewry
2011-06-24  7:24   ` Chris Evans
     [not found]   ` <BANLkTimtYUyXbZjWhjK61B_1WBXE4MoAeA@mail.gmail.com>
2011-06-26 23:20     ` James Morris
2011-06-29 19:13       ` Will Drewry
2011-06-30  1:30         ` James Morris
2011-07-01 11:56           ` Ingo Molnar
2011-07-01 12:56             ` Will Drewry
2011-07-01 13:07               ` Ingo Molnar
2011-07-01 15:46                 ` Will Drewry
2011-07-01 16:10                   ` Ingo Molnar
2011-07-01 16:43                     ` Will Drewry
2011-07-01 18:04                       ` Steven Rostedt
2011-07-01 18:09                         ` Will Drewry
2011-07-01 18:48                           ` Steven Rostedt
2011-07-04  2:19                             ` James Morris
2011-07-05 12:40                               ` Steven Rostedt
2011-07-05 23:46                                 ` James Morris
2011-07-06  0:37                                   ` [Ksummit-2011-discuss] " Ted Ts'o
2011-07-05 23:56                               ` Steven Rostedt
2011-07-05  2:54                           ` [Ksummit-2011-discuss] " Eugene Teo
2011-07-01 20:25                         ` Kees Cook
2011-07-04 16:09                           ` [Ksummit-2011-discuss] " Greg KH
2011-07-01 21:00                       ` Ingo Molnar
2011-07-01 21:34                         ` Will Drewry
2011-07-05  9:50                           ` Ingo Molnar
2011-07-06 18:24                             ` Will Drewry
2011-07-05 15:26                 ` Vasiliy Kulikov
2011-06-24  0:36 ` [PATCH v9 06/13] x86: add HAVE_SECCOMP_FILTER and seccomp_execve Will Drewry
2011-06-24  0:36 ` [PATCH v9 07/13] arm: select HAVE_SECCOMP_FILTER Will Drewry
2011-06-24  0:36   ` Will Drewry
2011-06-24  0:36 ` [PATCH v9 08/13] microblaze: select HAVE_SECCOMP_FILTER and provide seccomp_execve Will Drewry
2011-06-24  0:36 ` [PATCH v9 09/13] mips: " Will Drewry
2011-06-24  0:36 ` [PATCH v9 10/13] s390: " Will Drewry
2011-06-24  0:36 ` [PATCH v9 11/13] powerpc: " Will Drewry
2011-06-24  0:36   ` Will Drewry
2011-08-30  5:28   ` Benjamin Herrenschmidt
2011-08-30  5:28     ` Benjamin Herrenschmidt
2011-11-28  0:14     ` Benjamin Herrenschmidt
2011-11-28  0:14       ` Benjamin Herrenschmidt
2011-11-28  1:45       ` Will Drewry
2011-11-28  1:45         ` Will Drewry
2011-06-24  0:36 ` [PATCH v9 12/13] sparc: " Will Drewry
2011-06-24  0:36   ` Will Drewry
2011-06-24  0:36 ` [PATCH v9 13/13] sh: select HAVE_SECCOMP_FILTER Will Drewry
2011-06-24  0:36   ` Will Drewry

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.