LinuxPPC-Dev Archive on lore.kernel.org
 help / color / Atom feed
* [PATCH v2 1/5] selftests/powerpc: Add test of stack expansion logic
@ 2020-07-24  9:25 Michael Ellerman
  2020-07-24  9:25 ` [PATCH v2 2/5] powerpc: Allow 4224 bytes of stack expansion for the signal frame Michael Ellerman
                   ` (4 more replies)
  0 siblings, 5 replies; 11+ messages in thread
From: Michael Ellerman @ 2020-07-24  9:25 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: linux-kernel, dja

We have custom stack expansion checks that it turns out are extremely
badly tested and contain bugs, surprise. So add some tests that
exercise the code and capture the current boundary conditions.

The signal test currently fails on 64-bit kernels because the 2048
byte allowance for the signal frame is too small; we will fix that in
a subsequent patch.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---

v2:
 - Concentrate on used stack around the 1MB size, as that's where our
   custom logic kicks in.
 - Increment the used stack size by 64 so we can exercise the case
   where we overflow the page by less than 128 (__SIGNAL_FRAMESIZE).

---
 tools/testing/selftests/powerpc/mm/.gitignore |   2 +
 tools/testing/selftests/powerpc/mm/Makefile   |   9 +-
 .../powerpc/mm/stack_expansion_ldst.c         | 233 ++++++++++++++++++
 .../powerpc/mm/stack_expansion_signal.c       | 118 +++++++++
 tools/testing/selftests/powerpc/pmu/lib.h     |   1 +
 5 files changed, 362 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/powerpc/mm/stack_expansion_ldst.c
 create mode 100644 tools/testing/selftests/powerpc/mm/stack_expansion_signal.c

diff --git a/tools/testing/selftests/powerpc/mm/.gitignore b/tools/testing/selftests/powerpc/mm/.gitignore
index 8d041f508a51..52308f42b7de 100644
--- a/tools/testing/selftests/powerpc/mm/.gitignore
+++ b/tools/testing/selftests/powerpc/mm/.gitignore
@@ -8,3 +8,5 @@ large_vm_fork_separation
 bad_accesses
 tlbie_test
 pkey_exec_prot
+stack_expansion_ldst
+stack_expansion_signal
diff --git a/tools/testing/selftests/powerpc/mm/Makefile b/tools/testing/selftests/powerpc/mm/Makefile
index 5a86d59441dc..6cd772e0e374 100644
--- a/tools/testing/selftests/powerpc/mm/Makefile
+++ b/tools/testing/selftests/powerpc/mm/Makefile
@@ -3,7 +3,9 @@
 	$(MAKE) -C ../
 
 TEST_GEN_PROGS := hugetlb_vs_thp_test subpage_prot segv_errors wild_bctr \
-		  large_vm_fork_separation bad_accesses pkey_exec_prot
+		  large_vm_fork_separation bad_accesses pkey_exec_prot stack_expansion_signal \
+		  stack_expansion_ldst
+
 TEST_GEN_PROGS_EXTENDED := tlbie_test
 TEST_GEN_FILES := tempfile
 
@@ -17,6 +19,11 @@ $(OUTPUT)/large_vm_fork_separation: CFLAGS += -m64
 $(OUTPUT)/bad_accesses: CFLAGS += -m64
 $(OUTPUT)/pkey_exec_prot: CFLAGS += -m64
 
+$(OUTPUT)/stack_expansion_signal: ../utils.c ../pmu/lib.c
+
+$(OUTPUT)/stack_expansion_ldst: CFLAGS += -fno-stack-protector
+$(OUTPUT)/stack_expansion_ldst: ../utils.c
+
 $(OUTPUT)/tempfile:
 	dd if=/dev/zero of=$@ bs=64k count=1
 
diff --git a/tools/testing/selftests/powerpc/mm/stack_expansion_ldst.c b/tools/testing/selftests/powerpc/mm/stack_expansion_ldst.c
new file mode 100644
index 000000000000..0587e11437f5
--- /dev/null
+++ b/tools/testing/selftests/powerpc/mm/stack_expansion_ldst.c
@@ -0,0 +1,233 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test that loads/stores expand the stack segment, or trigger a SEGV, in
+ * various conditions.
+ *
+ * Based on test code by Tom Lane.
+ */
+
+#undef NDEBUG
+#include <assert.h>
+
+#include <err.h>
+#include <errno.h>
+#include <stdio.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/resource.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#define _KB (1024)
+#define _MB (1024 * 1024)
+
+volatile char *stack_top_ptr;
+volatile unsigned long stack_top_sp;
+volatile char c;
+
+enum access_type {
+	LOAD,
+	STORE,
+};
+
+/*
+ * Consume stack until the stack pointer is below @target_sp, then do an access
+ * (load or store) at offset @delta from either the base of the stack or the
+ * current stack pointer.
+ */
+__attribute__ ((noinline))
+int consume_stack(unsigned long target_sp, unsigned long stack_high, int delta, enum access_type type)
+{
+	unsigned long target;
+	char stack_cur;
+
+	if ((unsigned long)&stack_cur > target_sp)
+		return consume_stack(target_sp, stack_high, delta, type);
+	else {
+		// We don't really need this, but without it GCC might not
+		// generate a recursive call above.
+		stack_top_ptr = &stack_cur;
+
+#ifdef __powerpc__
+		asm volatile ("mr %[sp], %%r1" : [sp] "=r" (stack_top_sp));
+#else
+		asm volatile ("mov %%rsp, %[sp]" : [sp] "=r" (stack_top_sp));
+#endif
+
+		// Kludge, delta < 0 indicates relative to SP
+		if (delta < 0)
+			target = stack_top_sp + delta;
+		else
+			target = stack_high - delta + 1;
+
+		volatile char *p = (char *)target;
+
+		if (type == STORE)
+			*p = c;
+		else
+			c = *p;
+
+		// Do something to prevent the stack frame being popped prior to
+		// our access above.
+		getpid();
+	}
+
+	return 0;
+}
+
+static int search_proc_maps(char *needle, unsigned long *low, unsigned long *high)
+{
+	unsigned long start, end;
+	static char buf[4096];
+	char name[128];
+	FILE *f;
+	int rc;
+
+	f = fopen("/proc/self/maps", "r");
+	if (!f) {
+		perror("fopen");
+		return -1;
+	}
+
+	while (fgets(buf, sizeof(buf), f)) {
+		rc = sscanf(buf, "%lx-%lx %*c%*c%*c%*c %*x %*d:%*d %*d %127s\n",
+			    &start, &end, name);
+		if (rc == 2)
+			continue;
+
+		if (rc != 3) {
+			printf("sscanf errored\n");
+			rc = -1;
+			break;
+		}
+
+		if (strstr(name, needle)) {
+			*low = start;
+			*high = end - 1;
+			rc = 0;
+			break;
+		}
+	}
+
+	fclose(f);
+
+	return rc;
+}
+
+int child(unsigned int stack_used, int delta, enum access_type type)
+{
+	unsigned long low, stack_high;
+
+	assert(search_proc_maps("[stack]", &low, &stack_high) == 0);
+
+	assert(consume_stack(stack_high - stack_used, stack_high, delta, type) == 0);
+
+	printf("Access OK: %s delta %-7d used size 0x%06x stack high 0x%lx top_ptr %p top sp 0x%lx actual used 0x%lx\n",
+	       type == LOAD ? "load" : "store", delta, stack_used, stack_high,
+	       stack_top_ptr, stack_top_sp, stack_high - stack_top_sp + 1);
+
+	return 0;
+}
+
+static int test_one(unsigned int stack_used, int delta, enum access_type type)
+{
+	pid_t pid;
+	int rc;
+
+	pid = fork();
+	if (pid == 0)
+		exit(child(stack_used, delta, type));
+
+	assert(waitpid(pid, &rc, 0) != -1);
+
+	if (WIFEXITED(rc) && WEXITSTATUS(rc) == 0)
+		return 0;
+
+	// We don't expect a non-zero exit that's not a signal
+	assert(!WIFEXITED(rc));
+
+	printf("Faulted:   %s delta %-7d used size 0x%06x signal %d\n",
+	       type == LOAD ? "load" : "store", delta, stack_used,
+	       WTERMSIG(rc));
+
+	return 1;
+}
+
+// This is fairly arbitrary but is well below any of the targets below,
+// so that the delta between the stack pointer and the target is large.
+#define DEFAULT_SIZE	(32 * _KB)
+
+static void test_one_type(enum access_type type, unsigned long page_size, unsigned long rlim_cur)
+{
+	assert(test_one(DEFAULT_SIZE, 512 * _KB, type) == 0);
+
+	// powerpc has a special case to allow up to 1MB
+	assert(test_one(DEFAULT_SIZE, 1 * _MB, type) == 0);
+
+#ifdef __powerpc__
+	// This fails on powerpc because it's > 1MB and is not a stdu &
+	// not close to r1
+	assert(test_one(DEFAULT_SIZE, 1 * _MB + 8, type) != 0);
+#else
+	assert(test_one(DEFAULT_SIZE, 1 * _MB + 8, type) == 0);
+#endif
+
+#ifdef __powerpc__
+	// Accessing way past the stack pointer is not allowed on powerpc
+	assert(test_one(DEFAULT_SIZE, rlim_cur, type) != 0);
+#else
+	// We should be able to access anywhere within the rlimit
+	assert(test_one(DEFAULT_SIZE, rlim_cur, type) == 0);
+#endif
+
+	// But if we go past the rlimit it should fail
+	assert(test_one(DEFAULT_SIZE, rlim_cur + 1, type) != 0);
+
+	// Above 1MB powerpc only allows accesses within 2048 bytes of
+	// r1 for accesses that aren't stdu
+	assert(test_one(1 * _MB + page_size - 128, -2048, type) == 0);
+#ifdef __powerpc__
+	assert(test_one(1 * _MB + page_size - 128, -2049, type) != 0);
+#else
+	assert(test_one(1 * _MB + page_size - 128, -2049, type) == 0);
+#endif
+
+	// By consuming 2MB of stack we test the stdu case
+	assert(test_one(2 * _MB + page_size - 128, -2048, type) == 0);
+}
+
+static int test(void)
+{
+	unsigned long page_size;
+	struct rlimit rlimit;
+
+	page_size = getpagesize();
+	getrlimit(RLIMIT_STACK, &rlimit);
+	printf("Stack rlimit is 0x%lx\n", rlimit.rlim_cur);
+
+	printf("Testing loads ...\n");
+	test_one_type(LOAD, page_size, rlimit.rlim_cur);
+	printf("Testing stores ...\n");
+	test_one_type(STORE, page_size, rlimit.rlim_cur);
+
+	printf("All OK\n");
+
+	return 0;
+}
+
+#ifdef __powerpc__
+#include "utils.h"
+
+int main(void)
+{
+	return test_harness(test, "stack_expansion_ldst");
+}
+#else
+int main(void)
+{
+	return test();
+}
+#endif
diff --git a/tools/testing/selftests/powerpc/mm/stack_expansion_signal.c b/tools/testing/selftests/powerpc/mm/stack_expansion_signal.c
new file mode 100644
index 000000000000..c8b32a29e274
--- /dev/null
+++ b/tools/testing/selftests/powerpc/mm/stack_expansion_signal.c
@@ -0,0 +1,118 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test that signal delivery is able to expand the stack segment without
+ * triggering a SEGV.
+ *
+ * Based on test code by Tom Lane.
+ */
+
+#include <err.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "../pmu/lib.h"
+#include "utils.h"
+
+#define _KB (1024)
+#define _MB (1024 * 1024)
+
+static char *stack_base_ptr;
+static char *stack_top_ptr;
+
+static volatile sig_atomic_t sig_occurred = 0;
+
+static void sigusr1_handler(int signal_arg)
+{
+	sig_occurred = 1;
+}
+
+static int consume_stack(unsigned int stack_size, union pipe write_pipe)
+{
+	char stack_cur;
+
+	if ((stack_base_ptr - &stack_cur) < stack_size)
+		return consume_stack(stack_size, write_pipe);
+	else {
+		stack_top_ptr = &stack_cur;
+
+		FAIL_IF(notify_parent(write_pipe));
+
+		while (!sig_occurred)
+			barrier();
+	}
+
+	return 0;
+}
+
+static int child(unsigned int stack_size, union pipe write_pipe)
+{
+	struct sigaction act;
+	char stack_base;
+
+	act.sa_handler = sigusr1_handler;
+	sigemptyset(&act.sa_mask);
+	act.sa_flags = 0;
+	if (sigaction(SIGUSR1, &act, NULL) < 0)
+		err(1, "sigaction");
+
+	stack_base_ptr = (char *) (((size_t) &stack_base + 65535) & ~65535UL);
+
+	FAIL_IF(consume_stack(stack_size, write_pipe));
+
+	printf("size 0x%06x: OK, stack base %p top %p (%zx used)\n",
+		stack_size, stack_base_ptr, stack_top_ptr,
+		stack_base_ptr - stack_top_ptr);
+
+	return 0;
+}
+
+static int test_one_size(unsigned int stack_size)
+{
+	union pipe read_pipe, write_pipe;
+	pid_t pid;
+
+	FAIL_IF(pipe(read_pipe.fds) == -1);
+	FAIL_IF(pipe(write_pipe.fds) == -1);
+
+	pid = fork();
+	if (pid == 0) {
+		close(read_pipe.read_fd);
+		close(write_pipe.write_fd);
+		exit(child(stack_size, read_pipe));
+	}
+
+	close(read_pipe.write_fd);
+	close(write_pipe.read_fd);
+	FAIL_IF(sync_with_child(read_pipe, write_pipe));
+
+	kill(pid, SIGUSR1);
+
+	FAIL_IF(wait_for_child(pid));
+
+	close(read_pipe.read_fd);
+	close(write_pipe.write_fd);
+
+	return 0;
+}
+
+int test(void)
+{
+	unsigned int i, size;
+
+	// Test with used stack from 1MB - 64K to 1MB + 64K
+	// Increment by 64 to get more coverage of odd sizes
+	for (i = 0; i < (128 * _KB); i += 64) {
+		size = i + (1 * _MB) - (64 * _KB);
+		FAIL_IF(test_one_size(size));
+	}
+
+	return 0;
+}
+
+int main(void)
+{
+	return test_harness(test, "stack_expansion_signal");
+}
diff --git a/tools/testing/selftests/powerpc/pmu/lib.h b/tools/testing/selftests/powerpc/pmu/lib.h
index fa12e7d0b4d3..bf1bec013bbb 100644
--- a/tools/testing/selftests/powerpc/pmu/lib.h
+++ b/tools/testing/selftests/powerpc/pmu/lib.h
@@ -6,6 +6,7 @@
 #ifndef __SELFTESTS_POWERPC_PMU_LIB_H
 #define __SELFTESTS_POWERPC_PMU_LIB_H
 
+#include <stdbool.h>
 #include <stdio.h>
 #include <stdint.h>
 #include <string.h>
-- 
2.25.1


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH v2 2/5] powerpc: Allow 4224 bytes of stack expansion for the signal frame
  2020-07-24  9:25 [PATCH v2 1/5] selftests/powerpc: Add test of stack expansion logic Michael Ellerman
@ 2020-07-24  9:25 ` Michael Ellerman
  2020-07-27  8:23   ` Gabriel Paubert
  2020-07-27 10:50   ` Daniel Axtens
  2020-07-24  9:25 ` [PATCH v2 3/5] selftests/powerpc: Update the stack expansion test Michael Ellerman
                   ` (3 subsequent siblings)
  4 siblings, 2 replies; 11+ messages in thread
From: Michael Ellerman @ 2020-07-24  9:25 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: linux-kernel, dja

We have powerpc specific logic in our page fault handling to decide if
an access to an unmapped address below the stack pointer should expand
the stack VMA.

The code was originally added in 2004 "ported from 2.4". The rough
logic is that the stack is allowed to grow to 1MB with no extra
checking. Over 1MB the access must be within 2048 bytes of the stack
pointer, or be from a user instruction that updates the stack pointer.

The 2048 byte allowance below the stack pointer is there to cover the
288 byte "red zone" as well as the "about 1.5kB" needed by the signal
delivery code.

Unfortunately since then the signal frame has expanded, and is now
4224 bytes on 64-bit kernels with transactional memory enabled. This
means if a process has consumed more than 1MB of stack, and its stack
pointer lies less than 4224 bytes from the next page boundary, signal
delivery will fault when trying to expand the stack and the process
will see a SEGV.

The total size of the signal frame is the size of struct rt_sigframe
(which includes the red zone) plus __SIGNAL_FRAMESIZE (128 bytes on
64-bit).

The 2048 byte allowance was correct until 2008 as the signal frame
was:

struct rt_sigframe {
        struct ucontext    uc;                           /*     0  1440 */
        /* --- cacheline 11 boundary (1408 bytes) was 32 bytes ago --- */
        long unsigned int          _unused[2];           /*  1440    16 */
        unsigned int               tramp[6];             /*  1456    24 */
        struct siginfo *           pinfo;                /*  1480     8 */
        void *                     puc;                  /*  1488     8 */
        struct siginfo     info;                         /*  1496   128 */
        /* --- cacheline 12 boundary (1536 bytes) was 88 bytes ago --- */
        char                       abigap[288];          /*  1624   288 */

        /* size: 1920, cachelines: 15, members: 7 */
        /* padding: 8 */
};

1920 + 128 = 2048

Then in commit ce48b2100785 ("powerpc: Add VSX context save/restore,
ptrace and signal support") (Jul 2008) the signal frame expanded to
2304 bytes:

struct rt_sigframe {
        struct ucontext    uc;                           /*     0  1696 */	<--
        /* --- cacheline 13 boundary (1664 bytes) was 32 bytes ago --- */
        long unsigned int          _unused[2];           /*  1696    16 */
        unsigned int               tramp[6];             /*  1712    24 */
        struct siginfo *           pinfo;                /*  1736     8 */
        void *                     puc;                  /*  1744     8 */
        struct siginfo     info;                         /*  1752   128 */
        /* --- cacheline 14 boundary (1792 bytes) was 88 bytes ago --- */
        char                       abigap[288];          /*  1880   288 */

        /* size: 2176, cachelines: 17, members: 7 */
        /* padding: 8 */
};

2176 + 128 = 2304

At this point we should have been exposed to the bug, though as far as
I know it was never reported. I no longer have a system old enough to
easily test on.

Then in 2010 commit 320b2b8de126 ("mm: keep a guard page below a
grow-down stack segment") caused our stack expansion code to never
trigger, as there was always a VMA found for a write up to PAGE_SIZE
below r1.

That meant the bug was hidden as we continued to expand the signal
frame in commit 2b0a576d15e0 ("powerpc: Add new transactional memory
state to the signal context") (Feb 2013):

struct rt_sigframe {
        struct ucontext    uc;                           /*     0  1696 */
        /* --- cacheline 13 boundary (1664 bytes) was 32 bytes ago --- */
        struct ucontext    uc_transact;                  /*  1696  1696 */	<--
        /* --- cacheline 26 boundary (3328 bytes) was 64 bytes ago --- */
        long unsigned int          _unused[2];           /*  3392    16 */
        unsigned int               tramp[6];             /*  3408    24 */
        struct siginfo *           pinfo;                /*  3432     8 */
        void *                     puc;                  /*  3440     8 */
        struct siginfo     info;                         /*  3448   128 */
        /* --- cacheline 27 boundary (3456 bytes) was 120 bytes ago --- */
        char                       abigap[288];          /*  3576   288 */

        /* size: 3872, cachelines: 31, members: 8 */
        /* padding: 8 */
        /* last cacheline: 32 bytes */
};

3872 + 128 = 4000

And commit 573ebfa6601f ("powerpc: Increase stack redzone for 64-bit
userspace to 512 bytes") (Feb 2014):

struct rt_sigframe {
        struct ucontext    uc;                           /*     0  1696 */
        /* --- cacheline 13 boundary (1664 bytes) was 32 bytes ago --- */
        struct ucontext    uc_transact;                  /*  1696  1696 */
        /* --- cacheline 26 boundary (3328 bytes) was 64 bytes ago --- */
        long unsigned int          _unused[2];           /*  3392    16 */
        unsigned int               tramp[6];             /*  3408    24 */
        struct siginfo *           pinfo;                /*  3432     8 */
        void *                     puc;                  /*  3440     8 */
        struct siginfo     info;                         /*  3448   128 */
        /* --- cacheline 27 boundary (3456 bytes) was 120 bytes ago --- */
        char                       abigap[512];          /*  3576   512 */	<--

        /* size: 4096, cachelines: 32, members: 8 */
        /* padding: 8 */
};

4096 + 128 = 4224

Then finally in 2017, commit 1be7107fbe18 ("mm: larger stack guard
gap, between vmas") exposed us to the existing bug, because it changed
the stack VMA to be the correct/real size, meaning our stack expansion
code is now triggered.

Fix it by increasing the allowance to 4224 bytes.

Hard-coding 4224 is obviously unsafe against future expansions of the
signal frame in the same way as the existing code. We can't easily use
sizeof() because the signal frame structure is not in a header. We
will either fix that, or rip out all the custom stack expansion
checking logic entirely.

Fixes: ce48b2100785 ("powerpc: Add VSX context save/restore, ptrace and signal support")
Cc: stable@vger.kernel.org # v2.6.27+
Reported-by: Tom Lane <tgl@sss.pgh.pa.us>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---

v2: Account for the extra 128 bytes of __SIGNAL_FRAMESIZE, making the
    total size 4224, as noticed by dja.

See also https://bugzilla.kernel.org/show_bug.cgi?id=205183
---
 arch/powerpc/mm/fault.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 641fc5f3d7dd..3ebb1792e636 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -267,6 +267,9 @@ static bool bad_kernel_fault(struct pt_regs *regs, unsigned long error_code,
 	return false;
 }
 
+// This comes from 64-bit struct rt_sigframe + __SIGNAL_FRAMESIZE
+#define SIGFRAME_MAX_SIZE	(4096 + 128)
+
 static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
 				struct vm_area_struct *vma, unsigned int flags,
 				bool *must_retry)
@@ -274,7 +277,7 @@ static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
 	/*
 	 * N.B. The POWER/Open ABI allows programs to access up to
 	 * 288 bytes below the stack pointer.
-	 * The kernel signal delivery code writes up to about 1.5kB
+	 * The kernel signal delivery code writes a bit over 4KB
 	 * below the stack pointer (r1) before decrementing it.
 	 * The exec code can write slightly over 640kB to the stack
 	 * before setting the user r1.  Thus we allow the stack to
@@ -299,7 +302,7 @@ static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
 		 * between the last mapped region and the stack will
 		 * expand the stack rather than segfaulting.
 		 */
-		if (address + 2048 >= uregs->gpr[1])
+		if (address + SIGFRAME_MAX_SIZE >= uregs->gpr[1])
 			return false;
 
 		if ((flags & FAULT_FLAG_WRITE) && (flags & FAULT_FLAG_USER) &&
-- 
2.25.1


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH v2 3/5] selftests/powerpc: Update the stack expansion test
  2020-07-24  9:25 [PATCH v2 1/5] selftests/powerpc: Add test of stack expansion logic Michael Ellerman
  2020-07-24  9:25 ` [PATCH v2 2/5] powerpc: Allow 4224 bytes of stack expansion for the signal frame Michael Ellerman
@ 2020-07-24  9:25 ` Michael Ellerman
  2020-07-24  9:25 ` [PATCH v2 4/5] powerpc/mm: Remove custom stack expansion checking Michael Ellerman
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 11+ messages in thread
From: Michael Ellerman @ 2020-07-24  9:25 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: linux-kernel, dja

Update the stack expansion load/store test to take into account the
new allowance of 4224 bytes below the stack pointer.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 .../selftests/powerpc/mm/stack_expansion_ldst.c        | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

v2: Update for change of size to 4224.

diff --git a/tools/testing/selftests/powerpc/mm/stack_expansion_ldst.c b/tools/testing/selftests/powerpc/mm/stack_expansion_ldst.c
index 0587e11437f5..8dbfb51acf0f 100644
--- a/tools/testing/selftests/powerpc/mm/stack_expansion_ldst.c
+++ b/tools/testing/selftests/powerpc/mm/stack_expansion_ldst.c
@@ -186,17 +186,17 @@ static void test_one_type(enum access_type type, unsigned long page_size, unsign
 	// But if we go past the rlimit it should fail
 	assert(test_one(DEFAULT_SIZE, rlim_cur + 1, type) != 0);
 
-	// Above 1MB powerpc only allows accesses within 2048 bytes of
+	// Above 1MB powerpc only allows accesses within 4224 bytes of
 	// r1 for accesses that aren't stdu
-	assert(test_one(1 * _MB + page_size - 128, -2048, type) == 0);
+	assert(test_one(1 * _MB + page_size - 128, -4224, type) == 0);
 #ifdef __powerpc__
-	assert(test_one(1 * _MB + page_size - 128, -2049, type) != 0);
+	assert(test_one(1 * _MB + page_size - 128, -4225, type) != 0);
 #else
-	assert(test_one(1 * _MB + page_size - 128, -2049, type) == 0);
+	assert(test_one(1 * _MB + page_size - 128, -4225, type) == 0);
 #endif
 
 	// By consuming 2MB of stack we test the stdu case
-	assert(test_one(2 * _MB + page_size - 128, -2048, type) == 0);
+	assert(test_one(2 * _MB + page_size - 128, -4224, type) == 0);
 }
 
 static int test(void)
-- 
2.25.1


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH v2 4/5] powerpc/mm: Remove custom stack expansion checking
  2020-07-24  9:25 [PATCH v2 1/5] selftests/powerpc: Add test of stack expansion logic Michael Ellerman
  2020-07-24  9:25 ` [PATCH v2 2/5] powerpc: Allow 4224 bytes of stack expansion for the signal frame Michael Ellerman
  2020-07-24  9:25 ` [PATCH v2 3/5] selftests/powerpc: Update the stack expansion test Michael Ellerman
@ 2020-07-24  9:25 ` Michael Ellerman
  2020-07-27 13:48   ` Daniel Axtens
  2020-07-24  9:25 ` [PATCH v2 5/5] selftests/powerpc: Remove powerpc special cases from stack expansion test Michael Ellerman
  2020-07-30 12:50 ` [PATCH v2 1/5] selftests/powerpc: Add test of stack expansion logic Michael Ellerman
  4 siblings, 1 reply; 11+ messages in thread
From: Michael Ellerman @ 2020-07-24  9:25 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: linux-kernel, dja

We have powerpc specific logic in our page fault handling to decide if
an access to an unmapped address below the stack pointer should expand
the stack VMA.

The logic aims to prevent userspace from doing bad accesses below the
stack pointer. However as long as the stack is < 1MB in size, we allow
all accesses without further checks. Adding some debug I see that I
can do a full kernel build and LTP run, and not a single process has
used more than 1MB of stack. So for the majority of processes the
logic never even fires.

We also recently found a nasty bug in this code which could cause
userspace programs to be killed during signal delivery. It went
unnoticed presumably because most processes use < 1MB of stack.

The generic mm code has also grown support for stack guard pages since
this code was originally written, so the most heinous case of the
stack expanding into other mappings is now handled for us.

Finally although some other arches have special logic in this path,
from what I can tell none of x86, arm64, arm and s390 impose any extra
checks other than those in expand_stack().

So drop our complicated logic and like other architectures just let
the stack expand as long as it's within the rlimit.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/fault.c | 109 ++--------------------------------------
 1 file changed, 5 insertions(+), 104 deletions(-)

v2: no change just rebased.

diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 3ebb1792e636..925a7231abb3 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -42,39 +42,7 @@
 #include <asm/kup.h>
 #include <asm/inst.h>
 
-/*
- * Check whether the instruction inst is a store using
- * an update addressing form which will update r1.
- */
-static bool store_updates_sp(struct ppc_inst inst)
-{
-	/* check for 1 in the rA field */
-	if (((ppc_inst_val(inst) >> 16) & 0x1f) != 1)
-		return false;
-	/* check major opcode */
-	switch (ppc_inst_primary_opcode(inst)) {
-	case OP_STWU:
-	case OP_STBU:
-	case OP_STHU:
-	case OP_STFSU:
-	case OP_STFDU:
-		return true;
-	case OP_STD:	/* std or stdu */
-		return (ppc_inst_val(inst) & 3) == 1;
-	case OP_31:
-		/* check minor opcode */
-		switch ((ppc_inst_val(inst) >> 1) & 0x3ff) {
-		case OP_31_XOP_STDUX:
-		case OP_31_XOP_STWUX:
-		case OP_31_XOP_STBUX:
-		case OP_31_XOP_STHUX:
-		case OP_31_XOP_STFSUX:
-		case OP_31_XOP_STFDUX:
-			return true;
-		}
-	}
-	return false;
-}
+
 /*
  * do_page_fault error handling helpers
  */
@@ -267,57 +235,6 @@ static bool bad_kernel_fault(struct pt_regs *regs, unsigned long error_code,
 	return false;
 }
 
-// This comes from 64-bit struct rt_sigframe + __SIGNAL_FRAMESIZE
-#define SIGFRAME_MAX_SIZE	(4096 + 128)
-
-static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
-				struct vm_area_struct *vma, unsigned int flags,
-				bool *must_retry)
-{
-	/*
-	 * N.B. The POWER/Open ABI allows programs to access up to
-	 * 288 bytes below the stack pointer.
-	 * The kernel signal delivery code writes a bit over 4KB
-	 * below the stack pointer (r1) before decrementing it.
-	 * The exec code can write slightly over 640kB to the stack
-	 * before setting the user r1.  Thus we allow the stack to
-	 * expand to 1MB without further checks.
-	 */
-	if (address + 0x100000 < vma->vm_end) {
-		struct ppc_inst __user *nip = (struct ppc_inst __user *)regs->nip;
-		/* get user regs even if this fault is in kernel mode */
-		struct pt_regs *uregs = current->thread.regs;
-		if (uregs == NULL)
-			return true;
-
-		/*
-		 * A user-mode access to an address a long way below
-		 * the stack pointer is only valid if the instruction
-		 * is one which would update the stack pointer to the
-		 * address accessed if the instruction completed,
-		 * i.e. either stwu rs,n(r1) or stwux rs,r1,rb
-		 * (or the byte, halfword, float or double forms).
-		 *
-		 * If we don't check this then any write to the area
-		 * between the last mapped region and the stack will
-		 * expand the stack rather than segfaulting.
-		 */
-		if (address + SIGFRAME_MAX_SIZE >= uregs->gpr[1])
-			return false;
-
-		if ((flags & FAULT_FLAG_WRITE) && (flags & FAULT_FLAG_USER) &&
-		    access_ok(nip, sizeof(*nip))) {
-			struct ppc_inst inst;
-
-			if (!probe_user_read_inst(&inst, nip))
-				return !store_updates_sp(inst);
-			*must_retry = true;
-		}
-		return true;
-	}
-	return false;
-}
-
 #ifdef CONFIG_PPC_MEM_KEYS
 static bool access_pkey_error(bool is_write, bool is_exec, bool is_pkey,
 			      struct vm_area_struct *vma)
@@ -483,7 +400,6 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address,
 	int is_user = user_mode(regs);
 	int is_write = page_fault_is_write(error_code);
 	vm_fault_t fault, major = 0;
-	bool must_retry = false;
 	bool kprobe_fault = kprobe_page_fault(regs, 11);
 
 	if (unlikely(debugger_fault_handler(regs) || kprobe_fault))
@@ -572,30 +488,15 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address,
 	vma = find_vma(mm, address);
 	if (unlikely(!vma))
 		return bad_area(regs, address);
-	if (likely(vma->vm_start <= address))
-		goto good_area;
-	if (unlikely(!(vma->vm_flags & VM_GROWSDOWN)))
-		return bad_area(regs, address);
 
-	/* The stack is being expanded, check if it's valid */
-	if (unlikely(bad_stack_expansion(regs, address, vma, flags,
-					 &must_retry))) {
-		if (!must_retry)
+	if (unlikely(vma->vm_start > address)) {
+		if (unlikely(!(vma->vm_flags & VM_GROWSDOWN)))
 			return bad_area(regs, address);
 
-		mmap_read_unlock(mm);
-		if (fault_in_pages_readable((const char __user *)regs->nip,
-					    sizeof(unsigned int)))
-			return bad_area_nosemaphore(regs, address);
-		goto retry;
+		if (unlikely(expand_stack(vma, address)))
+			return bad_area(regs, address);
 	}
 
-	/* Try to expand it */
-	if (unlikely(expand_stack(vma, address)))
-		return bad_area(regs, address);
-
-good_area:
-
 #ifdef CONFIG_PPC_MEM_KEYS
 	if (unlikely(access_pkey_error(is_write, is_exec,
 				       (error_code & DSISR_KEYFAULT), vma)))
-- 
2.25.1


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH v2 5/5] selftests/powerpc: Remove powerpc special cases from stack expansion test
  2020-07-24  9:25 [PATCH v2 1/5] selftests/powerpc: Add test of stack expansion logic Michael Ellerman
                   ` (2 preceding siblings ...)
  2020-07-24  9:25 ` [PATCH v2 4/5] powerpc/mm: Remove custom stack expansion checking Michael Ellerman
@ 2020-07-24  9:25 ` Michael Ellerman
  2020-07-30 12:50 ` [PATCH v2 1/5] selftests/powerpc: Add test of stack expansion logic Michael Ellerman
  4 siblings, 0 replies; 11+ messages in thread
From: Michael Ellerman @ 2020-07-24  9:25 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: linux-kernel, dja

Now that the powerpc code behaves the same as other architectures we
can drop the special cases we had.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 .../powerpc/mm/stack_expansion_ldst.c         | 41 +++----------------
 1 file changed, 5 insertions(+), 36 deletions(-)

v2: no change just rebased.

diff --git a/tools/testing/selftests/powerpc/mm/stack_expansion_ldst.c b/tools/testing/selftests/powerpc/mm/stack_expansion_ldst.c
index 8dbfb51acf0f..ed9143990888 100644
--- a/tools/testing/selftests/powerpc/mm/stack_expansion_ldst.c
+++ b/tools/testing/selftests/powerpc/mm/stack_expansion_ldst.c
@@ -56,13 +56,7 @@ int consume_stack(unsigned long target_sp, unsigned long stack_high, int delta,
 #else
 		asm volatile ("mov %%rsp, %[sp]" : [sp] "=r" (stack_top_sp));
 #endif
-
-		// Kludge, delta < 0 indicates relative to SP
-		if (delta < 0)
-			target = stack_top_sp + delta;
-		else
-			target = stack_high - delta + 1;
-
+		target = stack_high - delta + 1;
 		volatile char *p = (char *)target;
 
 		if (type == STORE)
@@ -162,41 +156,16 @@ static int test_one(unsigned int stack_used, int delta, enum access_type type)
 
 static void test_one_type(enum access_type type, unsigned long page_size, unsigned long rlim_cur)
 {
-	assert(test_one(DEFAULT_SIZE, 512 * _KB, type) == 0);
+	unsigned long delta;
 
-	// powerpc has a special case to allow up to 1MB
-	assert(test_one(DEFAULT_SIZE, 1 * _MB, type) == 0);
-
-#ifdef __powerpc__
-	// This fails on powerpc because it's > 1MB and is not a stdu &
-	// not close to r1
-	assert(test_one(DEFAULT_SIZE, 1 * _MB + 8, type) != 0);
-#else
-	assert(test_one(DEFAULT_SIZE, 1 * _MB + 8, type) == 0);
-#endif
-
-#ifdef __powerpc__
-	// Accessing way past the stack pointer is not allowed on powerpc
-	assert(test_one(DEFAULT_SIZE, rlim_cur, type) != 0);
-#else
 	// We should be able to access anywhere within the rlimit
+	for (delta = page_size; delta <= rlim_cur; delta += page_size)
+		assert(test_one(DEFAULT_SIZE, delta, type) == 0);
+
 	assert(test_one(DEFAULT_SIZE, rlim_cur, type) == 0);
-#endif
 
 	// But if we go past the rlimit it should fail
 	assert(test_one(DEFAULT_SIZE, rlim_cur + 1, type) != 0);
-
-	// Above 1MB powerpc only allows accesses within 4224 bytes of
-	// r1 for accesses that aren't stdu
-	assert(test_one(1 * _MB + page_size - 128, -4224, type) == 0);
-#ifdef __powerpc__
-	assert(test_one(1 * _MB + page_size - 128, -4225, type) != 0);
-#else
-	assert(test_one(1 * _MB + page_size - 128, -4225, type) == 0);
-#endif
-
-	// By consuming 2MB of stack we test the stdu case
-	assert(test_one(2 * _MB + page_size - 128, -4224, type) == 0);
 }
 
 static int test(void)
-- 
2.25.1


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v2 2/5] powerpc: Allow 4224 bytes of stack expansion for the signal frame
  2020-07-24  9:25 ` [PATCH v2 2/5] powerpc: Allow 4224 bytes of stack expansion for the signal frame Michael Ellerman
@ 2020-07-27  8:23   ` Gabriel Paubert
  2020-07-27 12:28     ` Michael Ellerman
  2020-07-27 10:50   ` Daniel Axtens
  1 sibling, 1 reply; 11+ messages in thread
From: Gabriel Paubert @ 2020-07-27  8:23 UTC (permalink / raw)
  To: Michael Ellerman; +Cc: linuxppc-dev, linux-kernel, dja

On Fri, Jul 24, 2020 at 07:25:25PM +1000, Michael Ellerman wrote:
> We have powerpc specific logic in our page fault handling to decide if
> an access to an unmapped address below the stack pointer should expand
> the stack VMA.
> 
> The code was originally added in 2004 "ported from 2.4". The rough
> logic is that the stack is allowed to grow to 1MB with no extra
> checking. Over 1MB the access must be within 2048 bytes of the stack
> pointer, or be from a user instruction that updates the stack pointer.
> 
> The 2048 byte allowance below the stack pointer is there to cover the
> 288 byte "red zone" as well as the "about 1.5kB" needed by the signal
> delivery code.
> 
> Unfortunately since then the signal frame has expanded, and is now
> 4224 bytes on 64-bit kernels with transactional memory enabled.

Are there really users of transactional memory in the wild? 

Just asking because Power10 removes TM, and Power9 has had some issues
with it AFAICT.

Getting rid of it (if possible) would result in smaller signal frames,
with simpler signal delivery code (probably slightly faster also).

	Gabriel
 


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v2 2/5] powerpc: Allow 4224 bytes of stack expansion for the signal frame
  2020-07-24  9:25 ` [PATCH v2 2/5] powerpc: Allow 4224 bytes of stack expansion for the signal frame Michael Ellerman
  2020-07-27  8:23   ` Gabriel Paubert
@ 2020-07-27 10:50   ` Daniel Axtens
  1 sibling, 0 replies; 11+ messages in thread
From: Daniel Axtens @ 2020-07-27 10:50 UTC (permalink / raw)
  To: Michael Ellerman, linuxppc-dev; +Cc: linux-kernel

Hi Michael,

I have tested this with the test from the bug and it now seems to pass
fine. On that basis:

Tested-by: Daniel Axtens <dja@axtens.net>

Thank you for coming up with a better solution than my gross hack!

Kind regards,
Daniel

> We have powerpc specific logic in our page fault handling to decide if
> an access to an unmapped address below the stack pointer should expand
> the stack VMA.
>
> The code was originally added in 2004 "ported from 2.4". The rough
> logic is that the stack is allowed to grow to 1MB with no extra
> checking. Over 1MB the access must be within 2048 bytes of the stack
> pointer, or be from a user instruction that updates the stack pointer.
>
> The 2048 byte allowance below the stack pointer is there to cover the
> 288 byte "red zone" as well as the "about 1.5kB" needed by the signal
> delivery code.
>
> Unfortunately since then the signal frame has expanded, and is now
> 4224 bytes on 64-bit kernels with transactional memory enabled. This
> means if a process has consumed more than 1MB of stack, and its stack
> pointer lies less than 4224 bytes from the next page boundary, signal
> delivery will fault when trying to expand the stack and the process
> will see a SEGV.
>
> The total size of the signal frame is the size of struct rt_sigframe
> (which includes the red zone) plus __SIGNAL_FRAMESIZE (128 bytes on
> 64-bit).
>
> The 2048 byte allowance was correct until 2008 as the signal frame
> was:
>
> struct rt_sigframe {
>         struct ucontext    uc;                           /*     0  1440 */
>         /* --- cacheline 11 boundary (1408 bytes) was 32 bytes ago --- */
>         long unsigned int          _unused[2];           /*  1440    16 */
>         unsigned int               tramp[6];             /*  1456    24 */
>         struct siginfo *           pinfo;                /*  1480     8 */
>         void *                     puc;                  /*  1488     8 */
>         struct siginfo     info;                         /*  1496   128 */
>         /* --- cacheline 12 boundary (1536 bytes) was 88 bytes ago --- */
>         char                       abigap[288];          /*  1624   288 */
>
>         /* size: 1920, cachelines: 15, members: 7 */
>         /* padding: 8 */
> };
>
> 1920 + 128 = 2048
>
> Then in commit ce48b2100785 ("powerpc: Add VSX context save/restore,
> ptrace and signal support") (Jul 2008) the signal frame expanded to
> 2304 bytes:
>
> struct rt_sigframe {
>         struct ucontext    uc;                           /*     0  1696 */	<--
>         /* --- cacheline 13 boundary (1664 bytes) was 32 bytes ago --- */
>         long unsigned int          _unused[2];           /*  1696    16 */
>         unsigned int               tramp[6];             /*  1712    24 */
>         struct siginfo *           pinfo;                /*  1736     8 */
>         void *                     puc;                  /*  1744     8 */
>         struct siginfo     info;                         /*  1752   128 */
>         /* --- cacheline 14 boundary (1792 bytes) was 88 bytes ago --- */
>         char                       abigap[288];          /*  1880   288 */
>
>         /* size: 2176, cachelines: 17, members: 7 */
>         /* padding: 8 */
> };
>
> 2176 + 128 = 2304
>
> At this point we should have been exposed to the bug, though as far as
> I know it was never reported. I no longer have a system old enough to
> easily test on.
>
> Then in 2010 commit 320b2b8de126 ("mm: keep a guard page below a
> grow-down stack segment") caused our stack expansion code to never
> trigger, as there was always a VMA found for a write up to PAGE_SIZE
> below r1.
>
> That meant the bug was hidden as we continued to expand the signal
> frame in commit 2b0a576d15e0 ("powerpc: Add new transactional memory
> state to the signal context") (Feb 2013):
>
> struct rt_sigframe {
>         struct ucontext    uc;                           /*     0  1696 */
>         /* --- cacheline 13 boundary (1664 bytes) was 32 bytes ago --- */
>         struct ucontext    uc_transact;                  /*  1696  1696 */	<--
>         /* --- cacheline 26 boundary (3328 bytes) was 64 bytes ago --- */
>         long unsigned int          _unused[2];           /*  3392    16 */
>         unsigned int               tramp[6];             /*  3408    24 */
>         struct siginfo *           pinfo;                /*  3432     8 */
>         void *                     puc;                  /*  3440     8 */
>         struct siginfo     info;                         /*  3448   128 */
>         /* --- cacheline 27 boundary (3456 bytes) was 120 bytes ago --- */
>         char                       abigap[288];          /*  3576   288 */
>
>         /* size: 3872, cachelines: 31, members: 8 */
>         /* padding: 8 */
>         /* last cacheline: 32 bytes */
> };
>
> 3872 + 128 = 4000
>
> And commit 573ebfa6601f ("powerpc: Increase stack redzone for 64-bit
> userspace to 512 bytes") (Feb 2014):
>
> struct rt_sigframe {
>         struct ucontext    uc;                           /*     0  1696 */
>         /* --- cacheline 13 boundary (1664 bytes) was 32 bytes ago --- */
>         struct ucontext    uc_transact;                  /*  1696  1696 */
>         /* --- cacheline 26 boundary (3328 bytes) was 64 bytes ago --- */
>         long unsigned int          _unused[2];           /*  3392    16 */
>         unsigned int               tramp[6];             /*  3408    24 */
>         struct siginfo *           pinfo;                /*  3432     8 */
>         void *                     puc;                  /*  3440     8 */
>         struct siginfo     info;                         /*  3448   128 */
>         /* --- cacheline 27 boundary (3456 bytes) was 120 bytes ago --- */
>         char                       abigap[512];          /*  3576   512 */	<--
>
>         /* size: 4096, cachelines: 32, members: 8 */
>         /* padding: 8 */
> };
>
> 4096 + 128 = 4224
>
> Then finally in 2017, commit 1be7107fbe18 ("mm: larger stack guard
> gap, between vmas") exposed us to the existing bug, because it changed
> the stack VMA to be the correct/real size, meaning our stack expansion
> code is now triggered.
>
> Fix it by increasing the allowance to 4224 bytes.
>
> Hard-coding 4224 is obviously unsafe against future expansions of the
> signal frame in the same way as the existing code. We can't easily use
> sizeof() because the signal frame structure is not in a header. We
> will either fix that, or rip out all the custom stack expansion
> checking logic entirely.
>
> Fixes: ce48b2100785 ("powerpc: Add VSX context save/restore, ptrace and signal support")
> Cc: stable@vger.kernel.org # v2.6.27+
> Reported-by: Tom Lane <tgl@sss.pgh.pa.us>
> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
> ---
>
> v2: Account for the extra 128 bytes of __SIGNAL_FRAMESIZE, making the
>     total size 4224, as noticed by dja.
>
> See also https://bugzilla.kernel.org/show_bug.cgi?id=205183
> ---
>  arch/powerpc/mm/fault.c | 7 +++++--
>  1 file changed, 5 insertions(+), 2 deletions(-)
>
> diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
> index 641fc5f3d7dd..3ebb1792e636 100644
> --- a/arch/powerpc/mm/fault.c
> +++ b/arch/powerpc/mm/fault.c
> @@ -267,6 +267,9 @@ static bool bad_kernel_fault(struct pt_regs *regs, unsigned long error_code,
>  	return false;
>  }
>  
> +// This comes from 64-bit struct rt_sigframe + __SIGNAL_FRAMESIZE
> +#define SIGFRAME_MAX_SIZE	(4096 + 128)
> +
>  static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
>  				struct vm_area_struct *vma, unsigned int flags,
>  				bool *must_retry)
> @@ -274,7 +277,7 @@ static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
>  	/*
>  	 * N.B. The POWER/Open ABI allows programs to access up to
>  	 * 288 bytes below the stack pointer.
> -	 * The kernel signal delivery code writes up to about 1.5kB
> +	 * The kernel signal delivery code writes a bit over 4KB
>  	 * below the stack pointer (r1) before decrementing it.
>  	 * The exec code can write slightly over 640kB to the stack
>  	 * before setting the user r1.  Thus we allow the stack to
> @@ -299,7 +302,7 @@ static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
>  		 * between the last mapped region and the stack will
>  		 * expand the stack rather than segfaulting.
>  		 */
> -		if (address + 2048 >= uregs->gpr[1])
> +		if (address + SIGFRAME_MAX_SIZE >= uregs->gpr[1])
>  			return false;
>  
>  		if ((flags & FAULT_FLAG_WRITE) && (flags & FAULT_FLAG_USER) &&
> -- 
> 2.25.1

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v2 2/5] powerpc: Allow 4224 bytes of stack expansion for the signal frame
  2020-07-27  8:23   ` Gabriel Paubert
@ 2020-07-27 12:28     ` Michael Ellerman
  0 siblings, 0 replies; 11+ messages in thread
From: Michael Ellerman @ 2020-07-27 12:28 UTC (permalink / raw)
  To: Gabriel Paubert; +Cc: linuxppc-dev, linux-kernel, dja

Gabriel Paubert <paubert@iram.es> writes:
> On Fri, Jul 24, 2020 at 07:25:25PM +1000, Michael Ellerman wrote:
>> We have powerpc specific logic in our page fault handling to decide if
>> an access to an unmapped address below the stack pointer should expand
>> the stack VMA.
>> 
>> The code was originally added in 2004 "ported from 2.4". The rough
>> logic is that the stack is allowed to grow to 1MB with no extra
>> checking. Over 1MB the access must be within 2048 bytes of the stack
>> pointer, or be from a user instruction that updates the stack pointer.
>> 
>> The 2048 byte allowance below the stack pointer is there to cover the
>> 288 byte "red zone" as well as the "about 1.5kB" needed by the signal
>> delivery code.
>> 
>> Unfortunately since then the signal frame has expanded, and is now
>> 4224 bytes on 64-bit kernels with transactional memory enabled.
>
> Are there really users of transactional memory in the wild? 

Not many that I've heard of, but some.

Though anything that does use it needs to be written to fall back to
regular locking if TM is not available anyway.

> Just asking because Power10 removes TM, and Power9 has had some issues
> with it AFAICT.

It varies on different Power9 chip levels. For guests it should work.

> Getting rid of it (if possible) would result in smaller signal frames,
> with simpler signal delivery code (probably slightly faster also).

All the kernel code should be behind CONFIG_PPC_TRANSACTIONAL_MEM.

Deciding to disable that is really a distro decision.

In upstream we tend not to drop support for existing hardware while
people are still using it. But we could make a special case for TM,
because it's quite intrusive. I think we'd wait for a major distro to
ship without TM enabled before we did that though.

cheers

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v2 4/5] powerpc/mm: Remove custom stack expansion checking
  2020-07-24  9:25 ` [PATCH v2 4/5] powerpc/mm: Remove custom stack expansion checking Michael Ellerman
@ 2020-07-27 13:48   ` Daniel Axtens
  2020-07-28  2:32     ` Michael Ellerman
  0 siblings, 1 reply; 11+ messages in thread
From: Daniel Axtens @ 2020-07-27 13:48 UTC (permalink / raw)
  To: Michael Ellerman, linuxppc-dev; +Cc: linux-kernel

Hi Michael,

I tested v1 of this. I ran the test from the bug with a range of stack
sizes, in a loop, for several hours and didn't see any crashes/signal
delivery failures.

I retested v2 for a few minutes just to be sure, and I ran stress-ng's
stack, stackmmap and bad-altstack stressors to make sure no obvious
kernel bugs were exposed. Nothing crashed.

All tests done on a P8 LE guest under KVM.

On that basis:

Tested-by: Daniel Axtens <dja@axtens.net>

The more I look at this the less qualified I feel to Review it, but
certainly it looks better than my ugly hack from late last year.

Kind regards,
Daniel

> We have powerpc specific logic in our page fault handling to decide if
> an access to an unmapped address below the stack pointer should expand
> the stack VMA.
>
> The logic aims to prevent userspace from doing bad accesses below the
> stack pointer. However as long as the stack is < 1MB in size, we allow
> all accesses without further checks. Adding some debug I see that I
> can do a full kernel build and LTP run, and not a single process has
> used more than 1MB of stack. So for the majority of processes the
> logic never even fires.
>
> We also recently found a nasty bug in this code which could cause
> userspace programs to be killed during signal delivery. It went
> unnoticed presumably because most processes use < 1MB of stack.
>
> The generic mm code has also grown support for stack guard pages since
> this code was originally written, so the most heinous case of the
> stack expanding into other mappings is now handled for us.
>
> Finally although some other arches have special logic in this path,
> from what I can tell none of x86, arm64, arm and s390 impose any extra
> checks other than those in expand_stack().
>
> So drop our complicated logic and like other architectures just let
> the stack expand as long as it's within the rlimit.
>
> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
> ---
>  arch/powerpc/mm/fault.c | 109 ++--------------------------------------
>  1 file changed, 5 insertions(+), 104 deletions(-)
>
> v2: no change just rebased.
>
> diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
> index 3ebb1792e636..925a7231abb3 100644
> --- a/arch/powerpc/mm/fault.c
> +++ b/arch/powerpc/mm/fault.c
> @@ -42,39 +42,7 @@
>  #include <asm/kup.h>
>  #include <asm/inst.h>
>  
> -/*
> - * Check whether the instruction inst is a store using
> - * an update addressing form which will update r1.
> - */
> -static bool store_updates_sp(struct ppc_inst inst)
> -{
> -	/* check for 1 in the rA field */
> -	if (((ppc_inst_val(inst) >> 16) & 0x1f) != 1)
> -		return false;
> -	/* check major opcode */
> -	switch (ppc_inst_primary_opcode(inst)) {
> -	case OP_STWU:
> -	case OP_STBU:
> -	case OP_STHU:
> -	case OP_STFSU:
> -	case OP_STFDU:
> -		return true;
> -	case OP_STD:	/* std or stdu */
> -		return (ppc_inst_val(inst) & 3) == 1;
> -	case OP_31:
> -		/* check minor opcode */
> -		switch ((ppc_inst_val(inst) >> 1) & 0x3ff) {
> -		case OP_31_XOP_STDUX:
> -		case OP_31_XOP_STWUX:
> -		case OP_31_XOP_STBUX:
> -		case OP_31_XOP_STHUX:
> -		case OP_31_XOP_STFSUX:
> -		case OP_31_XOP_STFDUX:
> -			return true;
> -		}
> -	}
> -	return false;
> -}
> +
>  /*
>   * do_page_fault error handling helpers
>   */
> @@ -267,57 +235,6 @@ static bool bad_kernel_fault(struct pt_regs *regs, unsigned long error_code,
>  	return false;
>  }
>  
> -// This comes from 64-bit struct rt_sigframe + __SIGNAL_FRAMESIZE
> -#define SIGFRAME_MAX_SIZE	(4096 + 128)
> -
> -static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
> -				struct vm_area_struct *vma, unsigned int flags,
> -				bool *must_retry)
> -{
> -	/*
> -	 * N.B. The POWER/Open ABI allows programs to access up to
> -	 * 288 bytes below the stack pointer.
> -	 * The kernel signal delivery code writes a bit over 4KB
> -	 * below the stack pointer (r1) before decrementing it.
> -	 * The exec code can write slightly over 640kB to the stack
> -	 * before setting the user r1.  Thus we allow the stack to
> -	 * expand to 1MB without further checks.
> -	 */
> -	if (address + 0x100000 < vma->vm_end) {
> -		struct ppc_inst __user *nip = (struct ppc_inst __user *)regs->nip;
> -		/* get user regs even if this fault is in kernel mode */
> -		struct pt_regs *uregs = current->thread.regs;
> -		if (uregs == NULL)
> -			return true;
> -
> -		/*
> -		 * A user-mode access to an address a long way below
> -		 * the stack pointer is only valid if the instruction
> -		 * is one which would update the stack pointer to the
> -		 * address accessed if the instruction completed,
> -		 * i.e. either stwu rs,n(r1) or stwux rs,r1,rb
> -		 * (or the byte, halfword, float or double forms).
> -		 *
> -		 * If we don't check this then any write to the area
> -		 * between the last mapped region and the stack will
> -		 * expand the stack rather than segfaulting.
> -		 */
> -		if (address + SIGFRAME_MAX_SIZE >= uregs->gpr[1])
> -			return false;
> -
> -		if ((flags & FAULT_FLAG_WRITE) && (flags & FAULT_FLAG_USER) &&
> -		    access_ok(nip, sizeof(*nip))) {
> -			struct ppc_inst inst;
> -
> -			if (!probe_user_read_inst(&inst, nip))
> -				return !store_updates_sp(inst);
> -			*must_retry = true;
> -		}
> -		return true;
> -	}
> -	return false;
> -}
> -
>  #ifdef CONFIG_PPC_MEM_KEYS
>  static bool access_pkey_error(bool is_write, bool is_exec, bool is_pkey,
>  			      struct vm_area_struct *vma)
> @@ -483,7 +400,6 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address,
>  	int is_user = user_mode(regs);
>  	int is_write = page_fault_is_write(error_code);
>  	vm_fault_t fault, major = 0;
> -	bool must_retry = false;
>  	bool kprobe_fault = kprobe_page_fault(regs, 11);
>  
>  	if (unlikely(debugger_fault_handler(regs) || kprobe_fault))
> @@ -572,30 +488,15 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address,
>  	vma = find_vma(mm, address);
>  	if (unlikely(!vma))
>  		return bad_area(regs, address);
> -	if (likely(vma->vm_start <= address))
> -		goto good_area;
> -	if (unlikely(!(vma->vm_flags & VM_GROWSDOWN)))
> -		return bad_area(regs, address);
>  
> -	/* The stack is being expanded, check if it's valid */
> -	if (unlikely(bad_stack_expansion(regs, address, vma, flags,
> -					 &must_retry))) {
> -		if (!must_retry)
> +	if (unlikely(vma->vm_start > address)) {
> +		if (unlikely(!(vma->vm_flags & VM_GROWSDOWN)))
>  			return bad_area(regs, address);
>  
> -		mmap_read_unlock(mm);
> -		if (fault_in_pages_readable((const char __user *)regs->nip,
> -					    sizeof(unsigned int)))
> -			return bad_area_nosemaphore(regs, address);
> -		goto retry;
> +		if (unlikely(expand_stack(vma, address)))
> +			return bad_area(regs, address);
>  	}
>  
> -	/* Try to expand it */
> -	if (unlikely(expand_stack(vma, address)))
> -		return bad_area(regs, address);
> -
> -good_area:
> -
>  #ifdef CONFIG_PPC_MEM_KEYS
>  	if (unlikely(access_pkey_error(is_write, is_exec,
>  				       (error_code & DSISR_KEYFAULT), vma)))
> -- 
> 2.25.1

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v2 4/5] powerpc/mm: Remove custom stack expansion checking
  2020-07-27 13:48   ` Daniel Axtens
@ 2020-07-28  2:32     ` Michael Ellerman
  0 siblings, 0 replies; 11+ messages in thread
From: Michael Ellerman @ 2020-07-28  2:32 UTC (permalink / raw)
  To: Daniel Axtens, linuxppc-dev; +Cc: linux-kernel

Daniel Axtens <dja@axtens.net> writes:
> Hi Michael,
>
> I tested v1 of this. I ran the test from the bug with a range of stack
> sizes, in a loop, for several hours and didn't see any crashes/signal
> delivery failures.
>
> I retested v2 for a few minutes just to be sure, and I ran stress-ng's
> stack, stackmmap and bad-altstack stressors to make sure no obvious
> kernel bugs were exposed. Nothing crashed.
>
> All tests done on a P8 LE guest under KVM.
>
> On that basis:
>
> Tested-by: Daniel Axtens <dja@axtens.net>

Thanks.

Always nice to have someone review my patches!

cheers

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v2 1/5] selftests/powerpc: Add test of stack expansion logic
  2020-07-24  9:25 [PATCH v2 1/5] selftests/powerpc: Add test of stack expansion logic Michael Ellerman
                   ` (3 preceding siblings ...)
  2020-07-24  9:25 ` [PATCH v2 5/5] selftests/powerpc: Remove powerpc special cases from stack expansion test Michael Ellerman
@ 2020-07-30 12:50 ` Michael Ellerman
  4 siblings, 0 replies; 11+ messages in thread
From: Michael Ellerman @ 2020-07-30 12:50 UTC (permalink / raw)
  To: Michael Ellerman, linuxppc-dev; +Cc: linux-kernel, dja

On Fri, 24 Jul 2020 19:25:24 +1000, Michael Ellerman wrote:
> We have custom stack expansion checks that it turns out are extremely
> badly tested and contain bugs, surprise. So add some tests that
> exercise the code and capture the current boundary conditions.
> 
> The signal test currently fails on 64-bit kernels because the 2048
> byte allowance for the signal frame is too small, we will fix that in
> a subsequent patch.

Applied to powerpc/next.

[1/5] selftests/powerpc: Add test of stack expansion logic
      https://git.kernel.org/powerpc/c/c9938a9dac95be7650218cdd8e9d1f882e7b5691
[2/5] powerpc: Allow 4224 bytes of stack expansion for the signal frame
      https://git.kernel.org/powerpc/c/63dee5df43a31f3844efabc58972f0a206ca4534
[3/5] selftests/powerpc: Update the stack expansion test
      https://git.kernel.org/powerpc/c/9ee571d84bf8cfdd587a1acbf3490ca90fc40c9d
[4/5] powerpc/mm: Remove custom stack expansion checking
      https://git.kernel.org/powerpc/c/773b3e53df5b84e73bf64998e4019f50a6662ad1
[5/5] selftests/powerpc: Remove powerpc special cases from stack expansion test
      https://git.kernel.org/powerpc/c/73da08f6966b81feb429af4fb3229da4cf21d6d9

cheers

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, back to index

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-07-24  9:25 [PATCH v2 1/5] selftests/powerpc: Add test of stack expansion logic Michael Ellerman
2020-07-24  9:25 ` [PATCH v2 2/5] powerpc: Allow 4224 bytes of stack expansion for the signal frame Michael Ellerman
2020-07-27  8:23   ` Gabriel Paubert
2020-07-27 12:28     ` Michael Ellerman
2020-07-27 10:50   ` Daniel Axtens
2020-07-24  9:25 ` [PATCH v2 3/5] selftests/powerpc: Update the stack expansion test Michael Ellerman
2020-07-24  9:25 ` [PATCH v2 4/5] powerpc/mm: Remove custom stack expansion checking Michael Ellerman
2020-07-27 13:48   ` Daniel Axtens
2020-07-28  2:32     ` Michael Ellerman
2020-07-24  9:25 ` [PATCH v2 5/5] selftests/powerpc: Remove powerpc special cases from stack expansion test Michael Ellerman
2020-07-30 12:50 ` [PATCH v2 1/5] selftests/powerpc: Add test of stack expansion logic Michael Ellerman

LinuxPPC-Dev Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linuxppc-dev/0 linuxppc-dev/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linuxppc-dev linuxppc-dev/ https://lore.kernel.org/linuxppc-dev \
		linuxppc-dev@lists.ozlabs.org linuxppc-dev@ozlabs.org
	public-inbox-index linuxppc-dev

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.ozlabs.lists.linuxppc-dev


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git