All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] x86/vsyscall: allow seccomp filter in vsyscall=emulate
@ 2012-07-13  5:17 Will Drewry
  2012-07-13 14:32 ` Andrew Lutomirski
  2012-07-13 17:06 ` [PATCH v2] " Will Drewry
  0 siblings, 2 replies; 7+ messages in thread
From: Will Drewry @ 2012-07-13  5:17 UTC (permalink / raw)
  To: linux-kernel, torvalds
  Cc: qmewlo, eparis, keescook, james.l.morris, hpa, cevans, luto, Will Drewry

If a seccomp filter program is installed, older static binaries and
distributions with older libc implementations (glibc 2.13 and earlier)
that rely on vsyscall use will be terminated regardless of the filter
program policy when executing time, gettimeofday, or getcpu.  This is
only the case when vsyscall emulation is in use (vsyscall=emulate is the
default).

This patch emulates system call entry inside a vsyscall=emulate trap
such that seccomp can properly evaluate the system call.

Reported-by: Owen Kibel <qmewlo@gmail.com>
Signed-off-by: Will Drewry <wad@chromium.org>
---
 arch/x86/kernel/vsyscall_64.c |   29 ++++++++++++++++++++++++++---
 1 file changed, 26 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 7515cf0..433545f 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -139,6 +139,14 @@ static int addr_to_vsyscall_nr(unsigned long addr)
 	return nr;
 }
 
+static int vsyscall_seccomp(struct task_struct *tsk, int syscall_nr)
+{
+	if (!seccomp_mode(&tsk->seccomp))
+		return 0;
+	task_pt_regs(tsk)->orig_ax = syscall_nr;
+	return __secure_computing(syscall_nr);
+}
+
 static bool write_ok_or_segv(unsigned long ptr, size_t size)
 {
 	/*
@@ -174,6 +182,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
 	int vsyscall_nr;
 	int prev_sig_on_uaccess_error;
 	long ret;
+	int skip;
 
 	/*
 	 * No point in checking CS -- the only way to get here is a user mode
@@ -205,9 +214,6 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
 	}
 
 	tsk = current;
-	if (seccomp_mode(&tsk->seccomp))
-		do_exit(SIGKILL);
-
 	/*
 	 * With a real vsyscall, page faults cause SIGSEGV.  We want to
 	 * preserve that behavior to make writing exploits harder.
@@ -222,8 +228,13 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
 	 * address 0".
 	 */
 	ret = -EFAULT;
+	skip = 0;
 	switch (vsyscall_nr) {
 	case 0:
+		skip = vsyscall_seccomp(tsk, __NR_gettimeofday);
+		if (skip)
+			break;
+
 		if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) ||
 		    !write_ok_or_segv(regs->si, sizeof(struct timezone)))
 			break;
@@ -234,6 +245,10 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
 		break;
 
 	case 1:
+		skip = vsyscall_seccomp(tsk, __NR_time);
+		if (skip)
+			break;
+
 		if (!write_ok_or_segv(regs->di, sizeof(time_t)))
 			break;
 
@@ -241,6 +256,10 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
 		break;
 
 	case 2:
+		skip = vsyscall_seccomp(tsk, __NR_getcpu);
+		if (skip)
+			break;
+
 		if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
 		    !write_ok_or_segv(regs->si, sizeof(unsigned)))
 			break;
@@ -253,6 +272,9 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
 
 	current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error;
 
+	if (skip)
+		goto do_ret;
+
 	if (ret == -EFAULT) {
 		/* Bad news -- userspace fed a bad pointer to a vsyscall. */
 		warn_bad_vsyscall(KERN_INFO, regs,
@@ -271,6 +293,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
 
 	regs->ax = ret;
 
+do_ret:
 	/* Emulate a ret instruction. */
 	regs->ip = caller;
 	regs->sp += 8;
-- 
1.7.9.5


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH] x86/vsyscall: allow seccomp filter in vsyscall=emulate
  2012-07-13  5:17 [PATCH] x86/vsyscall: allow seccomp filter in vsyscall=emulate Will Drewry
@ 2012-07-13 14:32 ` Andrew Lutomirski
  2012-07-13 17:11   ` Will Drewry
  2012-07-13 17:06 ` [PATCH v2] " Will Drewry
  1 sibling, 1 reply; 7+ messages in thread
From: Andrew Lutomirski @ 2012-07-13 14:32 UTC (permalink / raw)
  To: Will Drewry
  Cc: linux-kernel, torvalds, qmewlo, eparis, keescook, james.l.morris,
	hpa, cevans

On Thu, Jul 12, 2012 at 10:17 PM, Will Drewry <wad@chromium.org> wrote:
> If a seccomp filter program is installed, older static binaries and
> distributions with older libc implementations (glibc 2.13 and earlier)
> that rely on vsyscall use will be terminated regardless of the filter
> program policy when executing time, gettimeofday, or getcpu.  This is
> only the case when vsyscall emulation is in use (vsyscall=emulate is the
> default).
>
> This patch emulates system call entry inside a vsyscall=emulate trap
> such that seccomp can properly evaluate the system call.
>
> Reported-by: Owen Kibel <qmewlo@gmail.com>
> Signed-off-by: Will Drewry <wad@chromium.org>
> ---
>  arch/x86/kernel/vsyscall_64.c |   29 ++++++++++++++++++++++++++---
>  1 file changed, 26 insertions(+), 3 deletions(-)
>
> diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
> index 7515cf0..433545f 100644
> --- a/arch/x86/kernel/vsyscall_64.c
> +++ b/arch/x86/kernel/vsyscall_64.c
> @@ -139,6 +139,14 @@ static int addr_to_vsyscall_nr(unsigned long addr)
>         return nr;
>  }
>
> +static int vsyscall_seccomp(struct task_struct *tsk, int syscall_nr)
> +{
> +       if (!seccomp_mode(&tsk->seccomp))
> +               return 0;
> +       task_pt_regs(tsk)->orig_ax = syscall_nr;
> +       return __secure_computing(syscall_nr);
> +}
> +
>  static bool write_ok_or_segv(unsigned long ptr, size_t size)
>  {
>         /*
> @@ -174,6 +182,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
>         int vsyscall_nr;
>         int prev_sig_on_uaccess_error;
>         long ret;
> +       int skip;
>
>         /*
>          * No point in checking CS -- the only way to get here is a user mode
> @@ -205,9 +214,6 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
>         }
>
>         tsk = current;
> -       if (seccomp_mode(&tsk->seccomp))
> -               do_exit(SIGKILL);
> -
>         /*
>          * With a real vsyscall, page faults cause SIGSEGV.  We want to
>          * preserve that behavior to make writing exploits harder.
> @@ -222,8 +228,13 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
>          * address 0".
>          */
>         ret = -EFAULT;
> +       skip = 0;
>         switch (vsyscall_nr) {
>         case 0:
> +               skip = vsyscall_seccomp(tsk, __NR_gettimeofday);
> +               if (skip)
> +                       break;
> +
>                 if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) ||
>                     !write_ok_or_segv(regs->si, sizeof(struct timezone)))
>                         break;
> @@ -234,6 +245,10 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
>                 break;
>
>         case 1:
> +               skip = vsyscall_seccomp(tsk, __NR_time);
> +               if (skip)
> +                       break;
> +
>                 if (!write_ok_or_segv(regs->di, sizeof(time_t)))
>                         break;
>
> @@ -241,6 +256,10 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
>                 break;
>
>         case 2:
> +               skip = vsyscall_seccomp(tsk, __NR_getcpu);
> +               if (skip)
> +                       break;
> +
>                 if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
>                     !write_ok_or_segv(regs->si, sizeof(unsigned)))
>                         break;
> @@ -253,6 +272,9 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
>
>         current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error;
>
> +       if (skip)
> +               goto do_ret;
> +
>         if (ret == -EFAULT) {
>                 /* Bad news -- userspace fed a bad pointer to a vsyscall. */
>                 warn_bad_vsyscall(KERN_INFO, regs,
> @@ -271,6 +293,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
>
>         regs->ax = ret;
>
> +do_ret:
>         /* Emulate a ret instruction. */
>         regs->ip = caller;
>         regs->sp += 8;

Does this work correctly in SECCOMP_RET_TRAP, TRACE, or ERRNO mode?
errno looks okay, but trap and trace still emulate the ret
instruction, which looks like it could confuse debuggers.  (If, on the
other hand, no change is made to the registers, then the debugger will
see a syscall instruction at rip, albeit one that can't actually be
executed due to the nx bit.)

--Andy

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH v2] x86/vsyscall: allow seccomp filter in vsyscall=emulate
  2012-07-13  5:17 [PATCH] x86/vsyscall: allow seccomp filter in vsyscall=emulate Will Drewry
  2012-07-13 14:32 ` Andrew Lutomirski
@ 2012-07-13 17:06 ` Will Drewry
  2012-07-13 23:00   ` Andrew Lutomirski
  1 sibling, 1 reply; 7+ messages in thread
From: Will Drewry @ 2012-07-13 17:06 UTC (permalink / raw)
  To: linux-kernel, torvalds
  Cc: qmewlo, eparis, keescook, james.l.morris, hpa, cevans, luto, Will Drewry

If a seccomp filter program is installed, older static binaries and
distributions with older libc implementations (glibc 2.13 and earlier)
that rely on vsyscall use will be terminated regardless of the filter
program policy when executing time, gettimeofday, or getcpu.  This is
only the case when vsyscall emulation is in use (vsyscall=emulate is the
default).

This patch emulates system call entry inside a vsyscall=emulate by
populating regs->ax and regs->orig_ax with the system call number prior
to calling into seccomp such that all seccomp-dependencies function
normally.  Additionally, system call return behavior is emulated in line
with other vsyscall entrypoints for the trace/trap cases.

Reported-by: Owen Kibel <qmewlo@gmail.com>
Signed-off-by: Will Drewry <wad@chromium.org>

v2: - fixed ip and sp on SECCOMP_RET_TRAP/TRACE (thanks to luto@mit.edu)
---
 arch/x86/kernel/vsyscall_64.c |   35 +++++++++++++++++++++++++++++++----
 1 file changed, 31 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 7515cf0..08a18d0 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -139,6 +139,15 @@ static int addr_to_vsyscall_nr(unsigned long addr)
 	return nr;
 }
 
+static int vsyscall_seccomp(struct task_struct *tsk, int syscall_nr)
+{
+	if (!seccomp_mode(&tsk->seccomp))
+		return 0;
+	task_pt_regs(tsk)->orig_ax = syscall_nr;
+	task_pt_regs(tsk)->ax = syscall_nr;
+	return __secure_computing(syscall_nr);
+}
+
 static bool write_ok_or_segv(unsigned long ptr, size_t size)
 {
 	/*
@@ -174,6 +183,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
 	int vsyscall_nr;
 	int prev_sig_on_uaccess_error;
 	long ret;
+	int skip;
 
 	/*
 	 * No point in checking CS -- the only way to get here is a user mode
@@ -205,9 +215,6 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
 	}
 
 	tsk = current;
-	if (seccomp_mode(&tsk->seccomp))
-		do_exit(SIGKILL);
-
 	/*
 	 * With a real vsyscall, page faults cause SIGSEGV.  We want to
 	 * preserve that behavior to make writing exploits harder.
@@ -222,8 +229,13 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
 	 * address 0".
 	 */
 	ret = -EFAULT;
+	skip = 0;
 	switch (vsyscall_nr) {
 	case 0:
+		skip = vsyscall_seccomp(tsk, __NR_gettimeofday);
+		if (skip)
+			break;
+
 		if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) ||
 		    !write_ok_or_segv(regs->si, sizeof(struct timezone)))
 			break;
@@ -234,6 +246,10 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
 		break;
 
 	case 1:
+		skip = vsyscall_seccomp(tsk, __NR_time);
+		if (skip)
+			break;
+
 		if (!write_ok_or_segv(regs->di, sizeof(time_t)))
 			break;
 
@@ -241,6 +257,10 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
 		break;
 
 	case 2:
+		skip = vsyscall_seccomp(tsk, __NR_getcpu);
+		if (skip)
+			break;
+
 		if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
 		    !write_ok_or_segv(regs->si, sizeof(unsigned)))
 			break;
@@ -253,6 +273,12 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
 
 	current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error;
 
+	if (skip) {
+		if ((long)regs->ax <= 0L) /* seccomp errno emulation */
+			goto do_ret;
+		goto done; /* seccomp trace/trap */
+	}
+
 	if (ret == -EFAULT) {
 		/* Bad news -- userspace fed a bad pointer to a vsyscall. */
 		warn_bad_vsyscall(KERN_INFO, regs,
@@ -271,10 +297,11 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
 
 	regs->ax = ret;
 
+do_ret:
 	/* Emulate a ret instruction. */
 	regs->ip = caller;
 	regs->sp += 8;
-
+done:
 	return true;
 
 sigsegv:
-- 
1.7.9.5


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH] x86/vsyscall: allow seccomp filter in vsyscall=emulate
  2012-07-13 14:32 ` Andrew Lutomirski
@ 2012-07-13 17:11   ` Will Drewry
  0 siblings, 0 replies; 7+ messages in thread
From: Will Drewry @ 2012-07-13 17:11 UTC (permalink / raw)
  To: Andrew Lutomirski
  Cc: linux-kernel, torvalds, qmewlo, eparis, keescook, james.l.morris,
	hpa, cevans

On Fri, Jul 13, 2012 at 9:32 AM, Andrew Lutomirski <luto@mit.edu> wrote:
> On Thu, Jul 12, 2012 at 10:17 PM, Will Drewry <wad@chromium.org> wrote:
>> If a seccomp filter program is installed, older static binaries and
>> distributions with older libc implementations (glibc 2.13 and earlier)
>> that rely on vsyscall use will be terminated regardless of the filter
>> program policy when executing time, gettimeofday, or getcpu.  This is
>> only the case when vsyscall emulation is in use (vsyscall=emulate is the
>> default).
>>
>> This patch emulates system call entry inside a vsyscall=emulate trap
>> such that seccomp can properly evaluate the system call.
>>
>> Reported-by: Owen Kibel <qmewlo@gmail.com>
>> Signed-off-by: Will Drewry <wad@chromium.org>
>> ---
>>  arch/x86/kernel/vsyscall_64.c |   29 ++++++++++++++++++++++++++---
>>  1 file changed, 26 insertions(+), 3 deletions(-)
>>
>> diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
>> index 7515cf0..433545f 100644
>> --- a/arch/x86/kernel/vsyscall_64.c
>> +++ b/arch/x86/kernel/vsyscall_64.c
>> @@ -139,6 +139,14 @@ static int addr_to_vsyscall_nr(unsigned long addr)
>>         return nr;
>>  }
>>
>> +static int vsyscall_seccomp(struct task_struct *tsk, int syscall_nr)
>> +{
>> +       if (!seccomp_mode(&tsk->seccomp))
>> +               return 0;
>> +       task_pt_regs(tsk)->orig_ax = syscall_nr;
>> +       return __secure_computing(syscall_nr);
>> +}
>> +
>>  static bool write_ok_or_segv(unsigned long ptr, size_t size)
>>  {
>>         /*
>> @@ -174,6 +182,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
>>         int vsyscall_nr;
>>         int prev_sig_on_uaccess_error;
>>         long ret;
>> +       int skip;
>>
>>         /*
>>          * No point in checking CS -- the only way to get here is a user mode
>> @@ -205,9 +214,6 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
>>         }
>>
>>         tsk = current;
>> -       if (seccomp_mode(&tsk->seccomp))
>> -               do_exit(SIGKILL);
>> -
>>         /*
>>          * With a real vsyscall, page faults cause SIGSEGV.  We want to
>>          * preserve that behavior to make writing exploits harder.
>> @@ -222,8 +228,13 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
>>          * address 0".
>>          */
>>         ret = -EFAULT;
>> +       skip = 0;
>>         switch (vsyscall_nr) {
>>         case 0:
>> +               skip = vsyscall_seccomp(tsk, __NR_gettimeofday);
>> +               if (skip)
>> +                       break;
>> +
>>                 if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) ||
>>                     !write_ok_or_segv(regs->si, sizeof(struct timezone)))
>>                         break;
>> @@ -234,6 +245,10 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
>>                 break;
>>
>>         case 1:
>> +               skip = vsyscall_seccomp(tsk, __NR_time);
>> +               if (skip)
>> +                       break;
>> +
>>                 if (!write_ok_or_segv(regs->di, sizeof(time_t)))
>>                         break;
>>
>> @@ -241,6 +256,10 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
>>                 break;
>>
>>         case 2:
>> +               skip = vsyscall_seccomp(tsk, __NR_getcpu);
>> +               if (skip)
>> +                       break;
>> +
>>                 if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
>>                     !write_ok_or_segv(regs->si, sizeof(unsigned)))
>>                         break;
>> @@ -253,6 +272,9 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
>>
>>         current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error;
>>
>> +       if (skip)
>> +               goto do_ret;
>> +
>>         if (ret == -EFAULT) {
>>                 /* Bad news -- userspace fed a bad pointer to a vsyscall. */
>>                 warn_bad_vsyscall(KERN_INFO, regs,
>> @@ -271,6 +293,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
>>
>>         regs->ax = ret;
>>
>> +do_ret:
>>         /* Emulate a ret instruction. */
>>         regs->ip = caller;
>>         regs->sp += 8;
>
> Does this work correctly in SECCOMP_RET_TRAP, TRACE, or ERRNO mode?
> errno looks okay, but trap and trace still emulate the ret
> instruction, which looks like it could confuse debuggers.

You're right.  The current patch behaves well, but it does leave the
ip pointing to the instruction after the glibc call $0xff... address
rather than in the vsyscall page. gdb doesn't get confused, but it
doesn't think it is a syscall -- just a signal delivered inside
time().  Thankfully, this is arch specific, so we have some clear
indicators of what is happening (errno v trap v trace).  I've modified
it to behave as you suggest.  v2 incoming! I've updated my some of my
tests to ensure that they recognize the vsyscall page and act
appropriately.

>  (If, on the
> other hand, no change is made to the registers, then the debugger will
> see a syscall instruction at rip, albeit one that can't actually be
> executed due to the nx bit.)

Actually, that's not true!  vsyscall=emulate leaves regs->ip ==
address.  vsyscall=native sets it to address += 9:

vsyscall=emulate and time(0x1):
  Program received signal SIGSEGV, Bad system call.
  0xffffffffff600400 in ?? ()

vsyscall=native doesn't EFAULT->SIGSEGV, but if I do it with  SIGSYS, you see:
  Program received signal SIGSYS, Bad system call.
  0xffffffffff600409 in ?? ()

and of course, the userland vdso helper with time(0x1) will show a
SIGSEGV down inside glibc:

  Program received signal SIGSEGV, Segmentation fault.
  0x00007ffff7ffba91 in time ()

That said, my goal is to not change existing "working" behavior, so
this is all just educational :)


As usual, thanks for the review and good catches!
will

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v2] x86/vsyscall: allow seccomp filter in vsyscall=emulate
  2012-07-13 17:06 ` [PATCH v2] " Will Drewry
@ 2012-07-13 23:00   ` Andrew Lutomirski
  2012-07-14  0:48     ` Will Drewry
  0 siblings, 1 reply; 7+ messages in thread
From: Andrew Lutomirski @ 2012-07-13 23:00 UTC (permalink / raw)
  To: Will Drewry
  Cc: linux-kernel, torvalds, qmewlo, eparis, keescook, james.l.morris,
	hpa, cevans

On Fri, Jul 13, 2012 at 10:06 AM, Will Drewry <wad@chromium.org> wrote:
> If a seccomp filter program is installed, older static binaries and
> distributions with older libc implementations (glibc 2.13 and earlier)
> that rely on vsyscall use will be terminated regardless of the filter
> program policy when executing time, gettimeofday, or getcpu.  This is
> only the case when vsyscall emulation is in use (vsyscall=emulate is the
> default).
>
> This patch emulates system call entry inside a vsyscall=emulate by
> populating regs->ax and regs->orig_ax with the system call number prior
> to calling into seccomp such that all seccomp-dependencies function
> normally.  Additionally, system call return behavior is emulated in line
> with other vsyscall entrypoints for the trace/trap cases.
>
> Reported-by: Owen Kibel <qmewlo@gmail.com>
> Signed-off-by: Will Drewry <wad@chromium.org>
>
> v2: - fixed ip and sp on SECCOMP_RET_TRAP/TRACE (thanks to luto@mit.edu)

> @@ -253,6 +273,12 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
>
>         current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error;
>
> +       if (skip) {
> +               if ((long)regs->ax <= 0L) /* seccomp errno emulation */
> +                       goto do_ret;
> +               goto done; /* seccomp trace/trap */
> +       }
> +
>         if (ret == -EFAULT) {
>                 /* Bad news -- userspace fed a bad pointer to a vsyscall. */
>                 warn_bad_vsyscall(KERN_INFO, regs,
> @@ -271,10 +297,11 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
>
>         regs->ax = ret;
>
> +do_ret:
>         /* Emulate a ret instruction. */
>         regs->ip = caller;
>         regs->sp += 8;
> -
> +done:
>         return true;
>
>  sigsegv:
> --
> 1.7.9.5
>

This has the same odd property as the sigsegv path that the faulting
instruction will appear to be the mov, not the syscall.  That seems to
be okay, though -- various pieces of code that try to restart the segv
are okay with that.

Is there any code that assumes that changing rax (i.e. the syscall
number) and restarting a syscall after SIGSYS will invoke the new
syscall?  (The RET_TRACE path might be similar -- does the
ptrace_event(PTRACE_EVENT_SECCOMP, data) in seccomp.c give a debugger
a chance to synchronously cancel or change the syscall?

If those issues aren't problems, then:

Reviewed-by: Andy Lutomirski <luto@amacapital.net>

(If the syscall number needs to change after the fact in the
SECCOMP_RET_TRAP case, it'll be a mess.)

--Andy

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v2] x86/vsyscall: allow seccomp filter in vsyscall=emulate
  2012-07-13 23:00   ` Andrew Lutomirski
@ 2012-07-14  0:48     ` Will Drewry
  2012-07-14  4:05       ` Will Drewry
  0 siblings, 1 reply; 7+ messages in thread
From: Will Drewry @ 2012-07-14  0:48 UTC (permalink / raw)
  To: Andrew Lutomirski
  Cc: linux-kernel, torvalds, qmewlo, eparis, keescook, james.l.morris,
	hpa, cevans

On Fri, Jul 13, 2012 at 6:00 PM, Andrew Lutomirski <luto@mit.edu> wrote:
> On Fri, Jul 13, 2012 at 10:06 AM, Will Drewry <wad@chromium.org> wrote:
>> If a seccomp filter program is installed, older static binaries and
>> distributions with older libc implementations (glibc 2.13 and earlier)
>> that rely on vsyscall use will be terminated regardless of the filter
>> program policy when executing time, gettimeofday, or getcpu.  This is
>> only the case when vsyscall emulation is in use (vsyscall=emulate is the
>> default).
>>
>> This patch emulates system call entry inside a vsyscall=emulate by
>> populating regs->ax and regs->orig_ax with the system call number prior
>> to calling into seccomp such that all seccomp-dependencies function
>> normally.  Additionally, system call return behavior is emulated in line
>> with other vsyscall entrypoints for the trace/trap cases.
>>
>> Reported-by: Owen Kibel <qmewlo@gmail.com>
>> Signed-off-by: Will Drewry <wad@chromium.org>
>>
>> v2: - fixed ip and sp on SECCOMP_RET_TRAP/TRACE (thanks to luto@mit.edu)
>
>> @@ -253,6 +273,12 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
>>
>>         current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error;
>>
>> +       if (skip) {
>> +               if ((long)regs->ax <= 0L) /* seccomp errno emulation */
>> +                       goto do_ret;
>> +               goto done; /* seccomp trace/trap */
>> +       }
>> +
>>         if (ret == -EFAULT) {
>>                 /* Bad news -- userspace fed a bad pointer to a vsyscall. */
>>                 warn_bad_vsyscall(KERN_INFO, regs,
>> @@ -271,10 +297,11 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
>>
>>         regs->ax = ret;
>>
>> +do_ret:
>>         /* Emulate a ret instruction. */
>>         regs->ip = caller;
>>         regs->sp += 8;
>> -
>> +done:
>>         return true;
>>
>>  sigsegv:
>> --
>> 1.7.9.5
>>
>
> This has the same odd property as the sigsegv path that the faulting
> instruction will appear to be the mov, not the syscall.  That seems to
> be okay, though -- various pieces of code that try to restart the segv
> are okay with that.

Yeah - I would otherwise do
  regs->ip += 9;
but I wanted to match the code that was therefor SIGSEGV.  If regs->ip
+= 9 _just_ for the SIGSYS case is fine, then I'll make that change
shortly.  Since any code that sees the vsyscall address should be wise
enough to avoid it, perhaps that's why the SIGSEGV hasn't had a
problem so far.

> Is there any code that assumes that changing rax (i.e. the syscall
> number) and restarting a syscall after SIGSYS will invoke the new
> syscall?  (The RET_TRACE path might be similar -- does the
> ptrace_event(PTRACE_EVENT_SECCOMP, data) in seccomp.c give a debugger
> a chance to synchronously cancel or change the syscall?

Unfortunately, it does in normal interception. I don't see any way out
of that quirk with vsyscall=emulate.  As is without seccomp,
vsyscall=emulate doesn't allow ptrace interception (or syscall
auditing for that matter) while vsyscall=native does.   So the option
here is to document the quirky interaction in
Documentation/prctl/seccomp_filter.txt.  In particular, if the tracer
sees either (time|gettimeofday|getcpu) and rip in the vsyscall page,
it will know it can't rewrite or bypass the call.    Is there a better
option?

Given that, I will include a tweak to the documentation to indicate
that behavior so that userspace authors of BPF programs that use
SECCOMP_RET_TRACE will be aware of the behavior.

> If those issues aren't problems, then:
>
> Reviewed-by: Andy Lutomirski <luto@amacapital.net>
>
> (If the syscall number needs to change after the fact in the
> SECCOMP_RET_TRAP case, it'll be a mess.)

Nah - traps are delivered like the forced sigsegv path.

I'll spin a v3 soon including the documentation tweak and the ip
offset to match vsyscall=native behavior (regs->ip += 9 _just_ for the
skip case).  Of course, any better ideas for the trace-case will be
more than welcome, but it seems to me to be an acceptable tradeoff - I
hope others agree.

I'll make the changes and then put it through its paces to see if any
other little idiosyncrasies emerge.

Thanks for the close review!
will

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v2] x86/vsyscall: allow seccomp filter in vsyscall=emulate
  2012-07-14  0:48     ` Will Drewry
@ 2012-07-14  4:05       ` Will Drewry
  0 siblings, 0 replies; 7+ messages in thread
From: Will Drewry @ 2012-07-14  4:05 UTC (permalink / raw)
  To: Andrew Lutomirski
  Cc: linux-kernel, torvalds, qmewlo, eparis, keescook, james.l.morris,
	hpa, cevans

On Fri, Jul 13, 2012 at 7:48 PM, Will Drewry <wad@chromium.org> wrote:
> On Fri, Jul 13, 2012 at 6:00 PM, Andrew Lutomirski <luto@mit.edu> wrote:
>> On Fri, Jul 13, 2012 at 10:06 AM, Will Drewry <wad@chromium.org> wrote:
>>> If a seccomp filter program is installed, older static binaries and
>>> distributions with older libc implementations (glibc 2.13 and earlier)
>>> that rely on vsyscall use will be terminated regardless of the filter
>>> program policy when executing time, gettimeofday, or getcpu.  This is
>>> only the case when vsyscall emulation is in use (vsyscall=emulate is the
>>> default).
>>>
>>> This patch emulates system call entry inside a vsyscall=emulate by
>>> populating regs->ax and regs->orig_ax with the system call number prior
>>> to calling into seccomp such that all seccomp-dependencies function
>>> normally.  Additionally, system call return behavior is emulated in line
>>> with other vsyscall entrypoints for the trace/trap cases.
>>>
>>> Reported-by: Owen Kibel <qmewlo@gmail.com>
>>> Signed-off-by: Will Drewry <wad@chromium.org>
>>>
>>> v2: - fixed ip and sp on SECCOMP_RET_TRAP/TRACE (thanks to luto@mit.edu)
>>
>>> @@ -253,6 +273,12 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
>>>
>>>         current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error;
>>>
>>> +       if (skip) {
>>> +               if ((long)regs->ax <= 0L) /* seccomp errno emulation */
>>> +                       goto do_ret;
>>> +               goto done; /* seccomp trace/trap */
>>> +       }
>>> +
>>>         if (ret == -EFAULT) {
>>>                 /* Bad news -- userspace fed a bad pointer to a vsyscall. */
>>>                 warn_bad_vsyscall(KERN_INFO, regs,
>>> @@ -271,10 +297,11 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
>>>
>>>         regs->ax = ret;
>>>
>>> +do_ret:
>>>         /* Emulate a ret instruction. */
>>>         regs->ip = caller;
>>>         regs->sp += 8;
>>> -
>>> +done:
>>>         return true;
>>>
>>>  sigsegv:
>>> --
>>> 1.7.9.5
>>>
>>
>> This has the same odd property as the sigsegv path that the faulting
>> instruction will appear to be the mov, not the syscall.  That seems to
>> be okay, though -- various pieces of code that try to restart the segv
>> are okay with that.
>
> Yeah - I would otherwise do
>   regs->ip += 9;
> but I wanted to match the code that was therefor SIGSEGV.  If regs->ip
> += 9 _just_ for the SIGSYS case is fine, then I'll make that change
> shortly.  Since any code that sees the vsyscall address should be wise
> enough to avoid it, perhaps that's why the SIGSEGV hasn't had a
> problem so far.

I dashed this off without more thought. It's best to leave it as is
because any return to the emulated page will cause a vsyscall fault
event.

>> Is there any code that assumes that changing rax (i.e. the syscall
>> number) and restarting a syscall after SIGSYS will invoke the new
>> syscall?  (The RET_TRACE path might be similar -- does the
>> ptrace_event(PTRACE_EVENT_SECCOMP, data) in seccomp.c give a debugger
>> a chance to synchronously cancel or change the syscall?
>
> Unfortunately, it does in normal interception. I don't see any way out
> of that quirk with vsyscall=emulate.  As is without seccomp,
> vsyscall=emulate doesn't allow ptrace interception (or syscall
> auditing for that matter) while vsyscall=native does.   So the option
> here is to document the quirky interaction in
> Documentation/prctl/seccomp_filter.txt.  In particular, if the tracer
> sees either (time|gettimeofday|getcpu) and rip in the vsyscall page,
> it will know it can't rewrite or bypass the call.    Is there a better
> option?
>
> Given that, I will include a tweak to the documentation to indicate
> that behavior so that userspace authors of BPF programs that use
> SECCOMP_RET_TRACE will be aware of the behavior.
>
>> If those issues aren't problems, then:
>>
>> Reviewed-by: Andy Lutomirski <luto@amacapital.net>
>>
>> (If the syscall number needs to change after the fact in the
>> SECCOMP_RET_TRAP case, it'll be a mess.)
>
> Nah - traps are delivered like the forced sigsegv path.
>
> I'll spin a v3 soon including the documentation tweak and the ip
> offset to match vsyscall=native behavior (regs->ip += 9 _just_ for the
> skip case).  Of course, any better ideas for the trace-case will be
> more than welcome, but it seems to me to be an acceptable tradeoff - I
> hope others agree.
>
> I'll make the changes and then put it through its paces to see if any
> other little idiosyncrasies emerge.

I've written up a documentation patch to accompany this one. It
reflects one more change I've made in a v3 of the patch, but it is
optional.  I've added support for SECCOMP_RET_TRACE to still
skip/emulate the system call if it desires.  In v2 it can't.  Either
way is fine in practice, but I'd need to change the accompanying
documentation.

thanks again!
will

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2012-07-14  4:05 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-07-13  5:17 [PATCH] x86/vsyscall: allow seccomp filter in vsyscall=emulate Will Drewry
2012-07-13 14:32 ` Andrew Lutomirski
2012-07-13 17:11   ` Will Drewry
2012-07-13 17:06 ` [PATCH v2] " Will Drewry
2012-07-13 23:00   ` Andrew Lutomirski
2012-07-14  0:48     ` Will Drewry
2012-07-14  4:05       ` Will Drewry

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.