Re: security problem with seccomp-filter

From: Felix von Leitner <felix-linuxkernel@fefe.de>
To: Kees Cook <keescook@chromium.org>
Cc: LKML <linux-kernel@vger.kernel.org>,
	Andy Lutomirski <luto@amacapital.net>,
	Will Drewry <wad@chromium.org>
Subject: Re: security problem with seccomp-filter
Date: Sun, 12 Apr 2015 23:33:12 +0200	[thread overview]
Message-ID: <20150412213311.GA16854@qarx.de> (raw)
In-Reply-To: <CAGXu5jKNKQvDk_V3bSa6Bp18VTMJkdixk9tUOVGNWaxjsnfqug@mail.gmail.com>

[-- Attachment #1: Type: text/plain, Size: 873 bytes --]

> What you're describing should work correctly (it's part of the
> regression test suite we use). So, given that, I'd love to get to the
> bottom of what you're seeing. Do you have a URL to your code? What
> architecture are you running on?

Well, I must be doing something wrong then.
I extracted a test case from my program.
I put it on http://ptrace.fefe.de/seccompfail.c

It installs three seccomp filters, the last one containing this:

    DISALLOW_SYSCALL(prctl),

with

#define DISALLOW_SYSCALL(name) \
	BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_##name, 0, 1), \
	BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL)

It is my understanding that that should then kill the process if the
prctl syscall is called again.

I test this by attempting to install the very same seccomp filter again,
which calls prctl, but the process is not killed.

What am I doing wrong?

Thanks,
Felix

[-- Attachment #2: seccompfail.c --]
[-- Type: text/x-csrc, Size: 6334 bytes --]

#include <stddef.h>
#include <features.h>
#include <inttypes.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/ip_icmp.h>
#include <arpa/inet.h>
#include <sys/poll.h>
#include <unistd.h>
#include <time.h>
#include <netdb.h>
#include <alloca.h>
#include <signal.h>
#include <errno.h>

#include <sys/prctl.h>
#include <linux/unistd.h>
#include <linux/audit.h>
#include <linux/filter.h>
#include <linux/seccomp.h>

#ifndef SECCOMP_MODE_FILTER
# define SECCOMP_MODE_FILTER	2 /* uses user-supplied filter. */
# define SECCOMP_RET_KILL	0x00000000U /* kill the task immediately */
# define SECCOMP_RET_TRAP	0x00030000U /* disallow and force a SIGSYS */
# define SECCOMP_RET_ALLOW	0x7fff0000U /* allow */
struct seccomp_data {
    int nr;
    __u32 arch;
    __u64 instruction_pointer;
    __u64 args[6];
};
#endif
#ifndef SYS_SECCOMP
# define SYS_SECCOMP 1
#endif

#define syscall_nr (offsetof(struct seccomp_data, nr))

#if defined(__i386__)
# define REG_SYSCALL	REG_EAX
# define ARCH_NR	AUDIT_ARCH_I386
#elif defined(__x86_64__)
# define REG_SYSCALL	REG_RAX
# define ARCH_NR	AUDIT_ARCH_X86_64
#else
# error "Platform does not support seccomp filter yet"
#endif

#define ALLOW_SYSCALL(name) \
	BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_##name, 0, 1), \
	BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW)

static int install_syscall_filter(void) {
  /* Linux allows a process to restrict itself (and potential children)
   * in what syscalls can be issued.  The mechanism is called
   * seccomp-filter or "seccomp mode 2".  It works by reusing the
   * Berkeley Packet Filter, which is meant for PCAP-style packet
   * filtering expressions like "only TCP packets, please".  But it is
   * really a bytecode that has to be passed inside an array, and each
   * instruction is constructed using scary looking macros.  The basics
   * are not so bad, however.  We have two registers, one accumulator
   * and one index register (which is not used in this part of the
   * code), and instead of a network packet we are operating on a
   * certain struct with the syscall info, which is called seccomp_data
   * (reproduced above). */
  struct sock_filter filter[] = {
    /* validate architecture to avoid x32-on-x86_64 syscall aliasing shenanigans */

    /* BPF_LD = load, BPF_W = word, BPF_ABS = absolute offset */
    BPF_STMT(BPF_LD+BPF_W+BPF_ABS, offsetof(struct seccomp_data, arch)),
    /* BPF_JMP+BPF_JEQ+BPF_K = compare accumulator to constant (in our
     * case, ARCH_NR), and skip the next instruction if equal */
    BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, ARCH_NR, 1, 0),
    /* "return SECCOMP_RET_KILL", tell seccomp to kill the process */
    BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL),

    /* load the syscall number */
    BPF_STMT(BPF_LD+BPF_W+BPF_ABS, offsetof(struct seccomp_data, nr)),

    /* and now a list of allowed syscalls */
    ALLOW_SYSCALL(rt_sigreturn),
#ifdef __NR_sigreturn
    ALLOW_SYSCALL(sigreturn),
#endif
    ALLOW_SYSCALL(exit_group),
    ALLOW_SYSCALL(exit),

#ifdef __NR_socketcall
    ALLOW_SYSCALL(socketcall),
#else
    ALLOW_SYSCALL(socket),
    ALLOW_SYSCALL(sendto),
    ALLOW_SYSCALL(recvfrom),
#endif

    ALLOW_SYSCALL(poll),

    /* so we can further restrict allowed syscalls */
    ALLOW_SYSCALL(prctl),

    /* so gethostbyname can open /etc/resolv.conf */
    ALLOW_SYSCALL(open),
    ALLOW_SYSCALL(read),
    ALLOW_SYSCALL(mmap),
    ALLOW_SYSCALL(mmap2),
    ALLOW_SYSCALL(munmap),
    ALLOW_SYSCALL(lseek),
    ALLOW_SYSCALL(_llseek),
    ALLOW_SYSCALL(close),

    /* for our time keeping */
    ALLOW_SYSCALL(gettimeofday),	// x86_64 uses a vsyscall for this, so this filter will never trigger

    /* for when buffer writes the output; since we only write to stdout, filter for fd==1 */
    BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_write, 0, 4),
    /* it's write(2).  Load first argument into accumulator */
    BPF_STMT(BPF_LD+BPF_W+BPF_ABS, offsetof(struct seccomp_data, args[0])),
    /* if it's 1 (stdout), skip 1 instruction */
    BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, 1, 1, 0),
    /* "return SECCOMP_RET_KILL", tell seccomp to kill the process */
    BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL),
    /* "return SECCOMP_RET_ALLOW", tell seccomp to allow the syscall */
    BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),

    /* if none of these syscalls matched, kill the process */
    BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL)
  };
  struct sock_fprog prog = {
    .len = (unsigned short)(sizeof(filter)/sizeof(filter[0])),
    .filter = filter
  };

  /* see linux/Documentation/prctl/no_new_privs.txt */
  if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
    /* if this fails, we are running on an ancient kernel without
     * seccomp support; nothing we can do about it, really. */
    return -1;
  }

  /* see linux/Documentation/prctl/seccomp_filter.txt */
  if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)) {
    /* if this happens, we are running on a kernel without seccomp
     * filters support; nothing we can do about it, really. */
    return -1;
  }
  return 0;
}

#define DISALLOW_SYSCALL(name) \
	BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_##name, 0, 1), \
	BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL)

static int seccomp_denyfile() {
  struct sock_filter filter[] = {
    DISALLOW_SYSCALL(open),
    DISALLOW_SYSCALL(mmap),
    DISALLOW_SYSCALL(mmap2),
    DISALLOW_SYSCALL(munmap),
    DISALLOW_SYSCALL(lseek),
    DISALLOW_SYSCALL(_llseek),
    DISALLOW_SYSCALL(close),
    BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
  };
  struct sock_fprog prog = {
    .len = (unsigned short)(sizeof(filter)/sizeof(filter[0])),
    .filter = filter
  };
  return prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
}

static int seccomp_denysocket() {
  struct sock_filter filter[] = {
#ifndef __NR_socketcall
    DISALLOW_SYSCALL(setsockopt),
    DISALLOW_SYSCALL(socket),
#endif
    DISALLOW_SYSCALL(prctl),
    BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
  };
  struct sock_fprog prog = {
    .len = (unsigned short)(sizeof(filter)/sizeof(filter[0])),
    .filter = filter
  };
  return prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
}

int main(int argc,char* argv[]) {
  /* If it fails, the kernel does not support seccomp filter.
   * We'll just continue */
  install_syscall_filter();

  seccomp_denyfile();

  seccomp_denysocket();
  seccomp_denysocket();

  return 0;
}