Re: [5.2 REGRESSION] Generic vDSO breaks seccomp-enabled userspace on i386

From: Kees Cook <keescook@chromium.org>
To: Andy Lutomirski <luto@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>,
	Sean Christopherson <sean.j.christopherson@intel.com>,
	Vincenzo Frascino <vincenzo.frascino@arm.com>,
	X86 ML <x86@kernel.org>, LKML <linux-kernel@vger.kernel.org>
Subject: Re: [5.2 REGRESSION] Generic vDSO breaks seccomp-enabled userspace on i386
Date: Mon, 22 Jul 2019 16:28:21 -0700	[thread overview]
Message-ID: <201907221620.F31B9A082@keescook> (raw)
In-Reply-To: <CALCETrVnV8o_jqRDZua1V0s_fMYweP2J2GbwWA-cLxqb_PShog@mail.gmail.com>

On Mon, Jul 22, 2019 at 12:17:16PM -0700, Andy Lutomirski wrote:
> On Mon, Jul 22, 2019 at 11:39 AM Kees Cook <keescook@chromium.org> wrote:
> >
> > On Mon, Jul 22, 2019 at 08:31:32PM +0200, Thomas Gleixner wrote:
> > > On Mon, 22 Jul 2019, Kees Cook wrote:
> > > > Just so I'm understanding: the vDSO change introduced code to make an
> > > > actual syscall on i386, which for most seccomp filters would be rejected?
> > >
> > > No. The old x86 specific VDSO implementation had a fallback syscall as
> > > well, i.e. clock_gettime(). On 32bit clock_gettime() uses the y2038
> > > endangered timespec.
> > >
> > > So when the VDSO was made generic we changed the internal data structures
> > > to be 2038 safe right away. As a consequence the fallback syscall is not
> > > clock_gettime(), it's clock_gettime64(). which seems to surprise seccomp.
> >
> > Okay, it's didn't add a syscall, it just changed it. Results are the
> > same: conservative filters suddenly start breaking due to the different
> > call. (And now I see why Andy's alias suggestion would help...)
> >
> > I'm not sure which direction to do with this. It seems like an alias
> > list is a large hammer for this case, and a "seccomp-bypass when calling
> > from vDSO" solution seems too fragile?
> >
> 
> I don't like the seccomp bypass at all.  If someone uses seccomp to
> disallow all clock_gettime() variants, there shouldn't be a back door
> to learn the time.
> 
> Here's the restart_syscall() logic that makes me want aliases: we have
> different syscall numbers for restart_syscall() on 32-bit and 64-bit.
> The logic to decide which one to use is dubious at best.  I'd like to
> introduce a restart_syscall2() that is identical to restart_syscall()
> except that it has the same number on both variants.

I've built a straw-man for this idea... but I have to say I don't
like it. This can lead to really unexpected behaviors if someone
were to have differing filters for the two syscalls. For example,
let's say someone was doing a paranoid audit of 2038-unsafe clock usage
and marked clock_gettime() with RET_KILL and marked clock_gettime64()
with RET_LOG. This aliasing would make clock_gettime64() trigger with
RET_KILL...

There's no sense of "default" in BPF so we can't check for "and they
weren't expecting it", and we can't check for exact matches since the
filter might be using a balance tree to bucket the allowed syscalls.

Anyway, here it is in case it sparks some more thoughts...

diff --git a/arch/x86/entry/vdso/vdso32/vclock_gettime.c b/arch/x86/entry/vdso/vdso32/vclock_gettime.c
index 9242b28418d5..1e82bd43286c 100644
--- a/arch/x86/entry/vdso/vdso32/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vdso32/vclock_gettime.c
@@ -13,6 +13,7 @@
  */
 #undef CONFIG_64BIT
 #undef CONFIG_X86_64
+#undef CONFIG_COMPAT
 #undef CONFIG_PGTABLE_LEVELS
 #undef CONFIG_ILLEGAL_POINTER_VALUE
 #undef CONFIG_SPARSEMEM_VMEMMAP
diff --git a/arch/x86/include/asm/seccomp.h b/arch/x86/include/asm/seccomp.h
index 2bd1338de236..5d0da5ee1b61 100644
--- a/arch/x86/include/asm/seccomp.h
+++ b/arch/x86/include/asm/seccomp.h
@@ -6,6 +6,12 @@
 
 #ifdef CONFIG_X86_32
 #define __NR_seccomp_sigreturn		__NR_sigreturn
+#define seccomp_syscall_alias(arch, syscall)			\
+	({							\
+		(syscall) == __NR_clock_gettime64 ?		\
+			__NR_clock_gettime :			\
+			-1;					\
+	})
 #endif
 
 #ifdef CONFIG_COMPAT
@@ -14,6 +20,13 @@
 #define __NR_seccomp_write_32		__NR_ia32_write
 #define __NR_seccomp_exit_32		__NR_ia32_exit
 #define __NR_seccomp_sigreturn_32	__NR_ia32_sigreturn
+#define seccomp_syscall_alias(arch, syscall)			\
+	({							\
+		(arch) != AUDIT_ARCH_I386 ? -1 :		\
+		(syscall) == __NR_ia32_clock_gettime64 ?	\
+			__NR_ia32_clock_gettime :		\
+			-1;					\
+	})
 #endif
 
 #include <asm-generic/seccomp.h>
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index 84868d37b35d..06f5ca81d756 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -83,6 +83,14 @@ static inline int seccomp_mode(struct seccomp *s)
 #ifdef CONFIG_SECCOMP_FILTER
 extern void put_seccomp_filter(struct task_struct *tsk);
 extern void get_seccomp_filter(struct task_struct *tsk);
+/*
+ * Allow architectures to provide syscall aliases for special cases
+ * when attempting to make invisible transitions to new syscalls that
+ * have no arguments (e.g. clock_gettime64, restart_syscall).
+ */
+# ifndef seccomp_syscall_alias
+#  define seccomp_syscall_alias(arch, syscall)		(-1)
+# endif
 #else  /* CONFIG_SECCOMP_FILTER */
 static inline void put_seccomp_filter(struct task_struct *tsk)
 {
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index dba52a7db5e8..5153c6155d9a 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -807,6 +807,26 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
 	data = filter_ret & SECCOMP_RET_DATA;
 	action = filter_ret & SECCOMP_RET_ACTION_FULL;
 
+	/* Handle syscall aliases when result is not SECCOMP_RET_ALLOW. */
+	if (unlikely(action != SECCOMP_RET_ALLOW)) {
+		int alias;
+
+		alias = seccomp_syscall_alias(sd->arch, sd->nr);
+		if (unlikely(alias != -1)) {
+			/* Use sd_local for an aliased syscall. */
+			if (sd != &sd_local) {
+				sd_local = *sd;
+				sd = &sd_local;
+			}
+			sd_local.nr = alias;
+
+			/* Run again, with the alias, accepting the results. */
+			filter_ret = seccomp_run_filters(sd, &match);
+			data = filter_ret & SECCOMP_RET_DATA;
+			action = filter_ret & SECCOMP_RET_ACTION_FULL;
+		}
+	}
+
 	switch (action) {
 	case SECCOMP_RET_ERRNO:
 		/* Set low-order bits as an errno, capped at MAX_ERRNO. */
diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
index 6ef7f16c4cf5..778619d145ea 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -3480,6 +3519,55 @@ TEST(seccomp_get_notif_sizes)
 	EXPECT_EQ(sizes.seccomp_notif_resp, sizeof(struct seccomp_notif_resp));
 }
 
+#if defined(__i386__)
+TEST(seccomp_syscall_alias)
+{
+	struct sock_filter filter[] = {
+		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+			offsetof(struct seccomp_data, nr)),
+#ifdef __NR_sigreturn
+		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_sigreturn, 6, 0),
+#endif
+		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 5, 0),
+		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_exit, 4, 0),
+		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_rt_sigreturn, 3, 0),
+		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_restart_syscall, 2, 0),
+		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_write, 1, 0),
+		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_clock_gettime, 0, 1),
+		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
+	};
+	struct sock_fprog prog = {
+		.len = (unsigned short)ARRAY_SIZE(filter),
+		.filter = filter,
+	};
+	unsigned char buffer[128];
+	long ret;
+
+	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+	ASSERT_EQ(0, ret) {
+		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
+	}
+
+	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
+	ASSERT_EQ(0, ret) {
+		TH_LOG("Kernel refused to install seccomp filter!");
+	}
+
+	/* We allow clock_gettime explicitly. */
+	EXPECT_EQ(0, syscall(__NR_clock_gettime, CLOCK_REALTIME, &buffer)) {
+		TH_LOG("Failed __NR_clock_gettime!?");
+	}
+
+	/* With aliases, clock_gettime64 should be allowed too. */
+	EXPECT_EQ(0, syscall(__NR_clock_gettime64, CLOCK_REALTIME, &buffer)) {
+		TH_LOG("Failed __NR_clock_gettime64!");
+	}
+
+	syscall(__NR_exit, 0);
+}
+#endif
+
 /*
  * TODO:
  * - add microbenchmarks

-- 
Kees Cook