All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2] m68k: save extra registers on more syscall entry points
@ 2021-06-18  1:27 Michael Schmitz
  2021-06-18 17:17 ` Linus Torvalds
  2021-06-18 18:39 ` Eric W. Biederman
  0 siblings, 2 replies; 12+ messages in thread
From: Michael Schmitz @ 2021-06-18  1:27 UTC (permalink / raw)
  To: geert, linux-arch, linux-m68k; +Cc: ebiederm, torvalds, schwab, Michael Schmitz

Multiple syscalls are liable to PTRACE_EVENT_* tracing and thus
require full user context saved on the kernel stack. We only
save those registers not preserved by C code currently.

do_exit() calls ptrace_stop() which may require access to all
saved registers. Add code to save additional registers in the
switch_stack struct for exit and exit_group syscalls (similar
to what is already done for fork, vfork and clone3). According
to Eric's analysis, execve and execveat can be traced as well,
so have been given the same treatment.

I'm not sure what to do about io_uring_setup - added code to
save full context on entry, and some more code to save/restore
additional registers on the switch stack around the kworker thread
call in ret_from_kernel_thread, but this may well be redundant.

I'd need specific test cases to exercise io_uring_setup in
particular, to see whether stack offsets for pt_regs and the
switch stack have been messed up.

Boot tested on ARAnym - earlier version including io_uring entry
save also tested on real hardware unter heavy IO load.

CC: Eric W. Biederman <ebiederm@xmission.com>
CC: Linus Torvalds <torvalds@linux-foundation.org>
CC: Andreas Schwab <schwab@linux-m68k.org>
Signed-off-by: Michael Schmitz <schmitzmic@gmail.com>

--
Changes from v1:

- added exec, execve and io_uring_setup syscalls
- save extra registers around kworker thread calls
---
 arch/m68k/kernel/entry.S              | 41 ++++++++++++++++++++++++++++++++++-
 arch/m68k/kernel/process.c            | 39 +++++++++++++++++++++++++++++++++
 arch/m68k/kernel/syscalls/syscall.tbl | 10 ++++-----
 3 files changed, 84 insertions(+), 6 deletions(-)

diff --git a/arch/m68k/kernel/entry.S b/arch/m68k/kernel/entry.S
index 9dd76fb..02bf2a1 100644
--- a/arch/m68k/kernel/entry.S
+++ b/arch/m68k/kernel/entry.S
@@ -76,6 +76,41 @@ ENTRY(__sys_clone3)
 	lea	%sp@(28),%sp
 	rts
 
+ENTRY(__sys_exit)
+	SAVE_SWITCH_STACK
+	pea	%sp@(SWITCH_STACK_SIZE)
+	jbsr	m68k_exit
+	lea	%sp@(28),%sp
+	rts
+
+ENTRY(__sys_exit_group)
+	SAVE_SWITCH_STACK
+	pea	%sp@(SWITCH_STACK_SIZE)
+	jbsr	m68k_exit_group
+	lea	%sp@(28),%sp
+	rts
+
+ENTRY(__sys_execve)
+	SAVE_SWITCH_STACK
+	pea	%sp@(SWITCH_STACK_SIZE)
+	jbsr	m68k_execve
+	lea	%sp@(28),%sp
+	rts
+
+ENTRY(__sys_execveat)
+	SAVE_SWITCH_STACK
+	pea	%sp@(SWITCH_STACK_SIZE)
+	jbsr	m68k_execveat
+	lea	%sp@(28),%sp
+	rts
+
+ENTRY(__sys_io_uring_setup)
+	SAVE_SWITCH_STACK
+	pea	%sp@(SWITCH_STACK_SIZE)
+	jbsr	m68k_io_uring_setup
+	lea	%sp@(28),%sp
+	rts
+
 ENTRY(sys_sigreturn)
 	SAVE_SWITCH_STACK
 	movel	%sp,%sp@-		  | switch_stack pointer
@@ -123,9 +158,13 @@ ENTRY(ret_from_kernel_thread)
 	| a3 contains the kernel thread payload, d7 - its argument
 	movel	%d1,%sp@-
 	jsr	schedule_tail
-	movel	%d7,(%sp)
+	addql	#4,%sp
+	| kernel threads can be traced!
+	SAVE_SWITCH_STACK
+	movel	%d7,%sp@-
 	jsr	%a3@
 	addql	#4,%sp
+	RESTORE_SWITCH_STACK
 	jra	ret_from_exception
 
 #if defined(CONFIG_COLDFIRE) || !defined(CONFIG_MMU)
diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c
index da83cc8..298ac99 100644
--- a/arch/m68k/kernel/process.c
+++ b/arch/m68k/kernel/process.c
@@ -138,6 +138,45 @@ asmlinkage int m68k_clone3(struct pt_regs *regs)
 	return sys_clone3((struct clone_args __user *)regs->d1, regs->d2);
 }
 
+/*
+ * Because extra registers are saved on the stack after the sys_exit()
+ * arguments, this C wrapper extracts them from pt_regs * and then calls the
+ * generic sys_exit() implementation.
+ */
+asmlinkage int m68k_exit(struct pt_regs *regs)
+{
+	return sys_exit(regs->d1);
+}
+
+/* Same for sys_exit_group ... */
+asmlinkage int m68k_exit_group(struct pt_regs *regs)
+{
+	return sys_exit_group(regs->d1);
+}
+
+/* Same for sys_exit_group ... */
+asmlinkage int m68k_execve(struct pt_regs *regs)
+{
+	return sys_execve((const char __user *)regs->d1,
+			(const char __user *const __user*)regs->d2,
+			(const char __user *const __user*)regs->d3);
+}
+
+/* Same for sys_exit_group ... */
+asmlinkage int m68k_execveat(struct pt_regs *regs)
+{
+	return sys_execveat(regs->d1, (const char __user *)regs->d2,
+			(const char __user *const __user*)regs->d3,
+			(const char __user *const __user*)regs->d4,
+			regs->d5);
+}
+
+/* and for sys_io_uring_create */
+asmlinkage int m68k_io_uring_setup(struct pt_regs *regs)
+{
+	return sys_io_uring_setup(regs->d1,(struct io_uring_params __user *)regs->d2);
+}
+
 int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg,
 		struct task_struct *p, unsigned long tls)
 {
diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
index 0dd019d..4a1ba1d 100644
--- a/arch/m68k/kernel/syscalls/syscall.tbl
+++ b/arch/m68k/kernel/syscalls/syscall.tbl
@@ -8,7 +8,7 @@
 # The <abi> is always "common" for this file
 #
 0	common	restart_syscall			sys_restart_syscall
-1	common	exit				sys_exit
+1	common	exit				__sys_exit
 2	common	fork				__sys_fork
 3	common	read				sys_read
 4	common	write				sys_write
@@ -18,7 +18,7 @@
 8	common	creat				sys_creat
 9	common	link				sys_link
 10	common	unlink				sys_unlink
-11	common	execve				sys_execve
+11	common	execve				__sys_execve
 12	common	chdir				sys_chdir
 13	common	time				sys_time32
 14	common	mknod				sys_mknod
@@ -254,7 +254,7 @@
 244	common	io_submit			sys_io_submit
 245	common	io_cancel			sys_io_cancel
 246	common	fadvise64			sys_fadvise64
-247	common	exit_group			sys_exit_group
+247	common	exit_group			__sys_exit_group
 248	common	lookup_dcookie			sys_lookup_dcookie
 249	common	epoll_create			sys_epoll_create
 250	common	epoll_ctl			sys_epoll_ctl
@@ -362,7 +362,7 @@
 352	common	getrandom			sys_getrandom
 353	common	memfd_create			sys_memfd_create
 354	common	bpf				sys_bpf
-355	common	execveat			sys_execveat
+355	common	execveat			__sys_execveat
 356	common	socket				sys_socket
 357	common	socketpair			sys_socketpair
 358	common	bind				sys_bind
@@ -424,7 +424,7 @@
 422	common	futex_time64			sys_futex
 423	common	sched_rr_get_interval_time64	sys_sched_rr_get_interval
 424	common	pidfd_send_signal		sys_pidfd_send_signal
-425	common	io_uring_setup			sys_io_uring_setup
+425	common	io_uring_setup			__sys_io_uring_setup
 426	common	io_uring_enter			sys_io_uring_enter
 427	common	io_uring_register		sys_io_uring_register
 428	common	open_tree			sys_open_tree
-- 
2.7.4


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH v2] m68k: save extra registers on more syscall entry points
  2021-06-18  1:27 [PATCH v2] m68k: save extra registers on more syscall entry points Michael Schmitz
@ 2021-06-18 17:17 ` Linus Torvalds
  2021-06-18 22:34   ` Michael Schmitz
  2021-06-18 18:39 ` Eric W. Biederman
  1 sibling, 1 reply; 12+ messages in thread
From: Linus Torvalds @ 2021-06-18 17:17 UTC (permalink / raw)
  To: Michael Schmitz
  Cc: Geert Uytterhoeven, linux-arch, linux-m68k, Eric W. Biederman,
	Andreas Schwab

On Thu, Jun 17, 2021 at 6:27 PM Michael Schmitz <schmitzmic@gmail.com> wrote:
>
> I'd need specific test cases to exercise io_uring_setup in
> particular, to see whether stack offsets for pt_regs and the
> switch stack have been messed up.

I don't think doing this for io_uring_setup() will help any - the
problem is not in that system call thread itself, it's purely in the
kernel thread that it then starts.

And the fact that io_uring_setup() has the full stack frame won't then
help that kernel thread, for exactly the same reason that was true on
alpha: copy_thread() will actually _create_ the full stack, but when
we switch to it (through switch_to() -> resume()), the resume code in
arch/m68k/kernel/entry.S will switch to that stack, and then do
RESTORE_SWITCH_STACK which will consume it again.

So I think m68k should do the same thing as Eric's patch for alpha: do
the full stack for exit and exit_group, and for kernel thread creation
- or at least PF_IO_WORKER), do an extra stack frame on the kernel
stack, so that even after resume() we'll still have another copy of
the frame.

The alternative would be to do what x86 does: see __switch_to_asm().
Instead of doing that normal kernel entry/exit stack (with
SAVE_SWITCH_STACK and RESTORE_SWITCH_STACK), x86 has it's own very
special "only for task switching" stack frame thing, and leaves the
pt_regs etc entirely alone.

Of course, that "only for task switching" is _kind_of_ what the whole
SAVE_SWITCH_STACK is for - it's part of the name after all - but the
difference is that on alpha and m68k, it's also (and primarily) the
"full state" stack frame, used not just for task switching, but for
signal handling state and for ptrace too.

So in theory, it would be good to split this up:

 (a) have the signal handling and ptrace stack be one thing (maybe
rename the "SWITCH" part of the operations to something else, like
"EXTRA" or "SIGNAL" or whatever)

 (b) make a separate "for task switching only" stack frame, which is
used by that switch_to() -> resume() sequence, and that copy_thread()
has a "struct inactive_task_frame" thing for..

That way, the pt_regs/extra_regs stack frame that copy_thread()
creates wouldn't then be eaten up by the task switch.

But while that sounds like the right thing to do, it would be a rather
bigger change. I'm not entirely sure it's worth it.

Eric, comments?

                Linus

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2] m68k: save extra registers on more syscall entry points
  2021-06-18  1:27 [PATCH v2] m68k: save extra registers on more syscall entry points Michael Schmitz
  2021-06-18 17:17 ` Linus Torvalds
@ 2021-06-18 18:39 ` Eric W. Biederman
  2021-06-18 19:06   ` Michael Schmitz
  1 sibling, 1 reply; 12+ messages in thread
From: Eric W. Biederman @ 2021-06-18 18:39 UTC (permalink / raw)
  To: Michael Schmitz; +Cc: geert, linux-arch, linux-m68k, torvalds, schwab

Michael Schmitz <schmitzmic@gmail.com> writes:

> Multiple syscalls are liable to PTRACE_EVENT_* tracing and thus
> require full user context saved on the kernel stack. We only
> save those registers not preserved by C code currently.
>
> do_exit() calls ptrace_stop() which may require access to all
> saved registers. Add code to save additional registers in the
> switch_stack struct for exit and exit_group syscalls (similar
> to what is already done for fork, vfork and clone3). According
> to Eric's analysis, execve and execveat can be traced as well,
> so have been given the same treatment.
>
> I'm not sure what to do about io_uring_setup - added code to
> save full context on entry, and some more code to save/restore
> additional registers on the switch stack around the kworker thread
> call in ret_from_kernel_thread, but this may well be redundant.
>
> I'd need specific test cases to exercise io_uring_setup in
> particular, to see whether stack offsets for pt_regs and the
> switch stack have been messed up.
>
> Boot tested on ARAnym - earlier version including io_uring entry
> save also tested on real hardware unter heavy IO load.


Are the registers that SAVE_SWITCH_STACK saves on m68k caller saved
registers?

The fact that the registers are saved somewhere on the stack on alpha is
what forces the registers to be saved before calling exit and exec.

Eric


> CC: Eric W. Biederman <ebiederm@xmission.com>
> CC: Linus Torvalds <torvalds@linux-foundation.org>
> CC: Andreas Schwab <schwab@linux-m68k.org>
> Signed-off-by: Michael Schmitz <schmitzmic@gmail.com>
>
> --
> Changes from v1:
>
> - added exec, execve and io_uring_setup syscalls
> - save extra registers around kworker thread calls
> ---
>  arch/m68k/kernel/entry.S              | 41 ++++++++++++++++++++++++++++++++++-
>  arch/m68k/kernel/process.c            | 39 +++++++++++++++++++++++++++++++++
>  arch/m68k/kernel/syscalls/syscall.tbl | 10 ++++-----
>  3 files changed, 84 insertions(+), 6 deletions(-)
>
> diff --git a/arch/m68k/kernel/entry.S b/arch/m68k/kernel/entry.S
> index 9dd76fb..02bf2a1 100644
> --- a/arch/m68k/kernel/entry.S
> +++ b/arch/m68k/kernel/entry.S
> @@ -76,6 +76,41 @@ ENTRY(__sys_clone3)
>  	lea	%sp@(28),%sp
>  	rts
>  
> +ENTRY(__sys_exit)
> +	SAVE_SWITCH_STACK
> +	pea	%sp@(SWITCH_STACK_SIZE)
> +	jbsr	m68k_exit
> +	lea	%sp@(28),%sp
> +	rts
> +
> +ENTRY(__sys_exit_group)
> +	SAVE_SWITCH_STACK
> +	pea	%sp@(SWITCH_STACK_SIZE)
> +	jbsr	m68k_exit_group
> +	lea	%sp@(28),%sp
> +	rts
> +
> +ENTRY(__sys_execve)
> +	SAVE_SWITCH_STACK
> +	pea	%sp@(SWITCH_STACK_SIZE)
> +	jbsr	m68k_execve
> +	lea	%sp@(28),%sp
> +	rts
> +
> +ENTRY(__sys_execveat)
> +	SAVE_SWITCH_STACK
> +	pea	%sp@(SWITCH_STACK_SIZE)
> +	jbsr	m68k_execveat
> +	lea	%sp@(28),%sp
> +	rts
> +
> +ENTRY(__sys_io_uring_setup)
> +	SAVE_SWITCH_STACK
> +	pea	%sp@(SWITCH_STACK_SIZE)
> +	jbsr	m68k_io_uring_setup
> +	lea	%sp@(28),%sp
> +	rts
> +
>  ENTRY(sys_sigreturn)
>  	SAVE_SWITCH_STACK
>  	movel	%sp,%sp@-		  | switch_stack pointer
> @@ -123,9 +158,13 @@ ENTRY(ret_from_kernel_thread)
>  	| a3 contains the kernel thread payload, d7 - its argument
>  	movel	%d1,%sp@-
>  	jsr	schedule_tail
> -	movel	%d7,(%sp)
> +	addql	#4,%sp
> +	| kernel threads can be traced!
> +	SAVE_SWITCH_STACK
> +	movel	%d7,%sp@-
>  	jsr	%a3@
>  	addql	#4,%sp
> +	RESTORE_SWITCH_STACK
>  	jra	ret_from_exception
>  
>  #if defined(CONFIG_COLDFIRE) || !defined(CONFIG_MMU)
> diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c
> index da83cc8..298ac99 100644
> --- a/arch/m68k/kernel/process.c
> +++ b/arch/m68k/kernel/process.c
> @@ -138,6 +138,45 @@ asmlinkage int m68k_clone3(struct pt_regs *regs)
>  	return sys_clone3((struct clone_args __user *)regs->d1, regs->d2);
>  }
>  
> +/*
> + * Because extra registers are saved on the stack after the sys_exit()
> + * arguments, this C wrapper extracts them from pt_regs * and then calls the
> + * generic sys_exit() implementation.
> + */
> +asmlinkage int m68k_exit(struct pt_regs *regs)
> +{
> +	return sys_exit(regs->d1);
> +}
> +
> +/* Same for sys_exit_group ... */
> +asmlinkage int m68k_exit_group(struct pt_regs *regs)
> +{
> +	return sys_exit_group(regs->d1);
> +}
> +
> +/* Same for sys_exit_group ... */
> +asmlinkage int m68k_execve(struct pt_regs *regs)
> +{
> +	return sys_execve((const char __user *)regs->d1,
> +			(const char __user *const __user*)regs->d2,
> +			(const char __user *const __user*)regs->d3);
> +}
> +
> +/* Same for sys_exit_group ... */
> +asmlinkage int m68k_execveat(struct pt_regs *regs)
> +{
> +	return sys_execveat(regs->d1, (const char __user *)regs->d2,
> +			(const char __user *const __user*)regs->d3,
> +			(const char __user *const __user*)regs->d4,
> +			regs->d5);
> +}
> +
> +/* and for sys_io_uring_create */
> +asmlinkage int m68k_io_uring_setup(struct pt_regs *regs)
> +{
> +	return sys_io_uring_setup(regs->d1,(struct io_uring_params __user *)regs->d2);
> +}
> +
>  int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg,
>  		struct task_struct *p, unsigned long tls)
>  {
> diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
> index 0dd019d..4a1ba1d 100644
> --- a/arch/m68k/kernel/syscalls/syscall.tbl
> +++ b/arch/m68k/kernel/syscalls/syscall.tbl
> @@ -8,7 +8,7 @@
>  # The <abi> is always "common" for this file
>  #
>  0	common	restart_syscall			sys_restart_syscall
> -1	common	exit				sys_exit
> +1	common	exit				__sys_exit
>  2	common	fork				__sys_fork
>  3	common	read				sys_read
>  4	common	write				sys_write
> @@ -18,7 +18,7 @@
>  8	common	creat				sys_creat
>  9	common	link				sys_link
>  10	common	unlink				sys_unlink
> -11	common	execve				sys_execve
> +11	common	execve				__sys_execve
>  12	common	chdir				sys_chdir
>  13	common	time				sys_time32
>  14	common	mknod				sys_mknod
> @@ -254,7 +254,7 @@
>  244	common	io_submit			sys_io_submit
>  245	common	io_cancel			sys_io_cancel
>  246	common	fadvise64			sys_fadvise64
> -247	common	exit_group			sys_exit_group
> +247	common	exit_group			__sys_exit_group
>  248	common	lookup_dcookie			sys_lookup_dcookie
>  249	common	epoll_create			sys_epoll_create
>  250	common	epoll_ctl			sys_epoll_ctl
> @@ -362,7 +362,7 @@
>  352	common	getrandom			sys_getrandom
>  353	common	memfd_create			sys_memfd_create
>  354	common	bpf				sys_bpf
> -355	common	execveat			sys_execveat
> +355	common	execveat			__sys_execveat
>  356	common	socket				sys_socket
>  357	common	socketpair			sys_socketpair
>  358	common	bind				sys_bind
> @@ -424,7 +424,7 @@
>  422	common	futex_time64			sys_futex
>  423	common	sched_rr_get_interval_time64	sys_sched_rr_get_interval
>  424	common	pidfd_send_signal		sys_pidfd_send_signal
> -425	common	io_uring_setup			sys_io_uring_setup
> +425	common	io_uring_setup			__sys_io_uring_setup
>  426	common	io_uring_enter			sys_io_uring_enter
>  427	common	io_uring_register		sys_io_uring_register
>  428	common	open_tree			sys_open_tree

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2] m68k: save extra registers on more syscall entry points
  2021-06-18 18:39 ` Eric W. Biederman
@ 2021-06-18 19:06   ` Michael Schmitz
  0 siblings, 0 replies; 12+ messages in thread
From: Michael Schmitz @ 2021-06-18 19:06 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: geert, linux-arch, linux-m68k, torvalds, schwab

Hi Eric,

Am 19.06.2021 um 06:39 schrieb Eric W. Biederman:
> Michael Schmitz <schmitzmic@gmail.com> writes:
>
>> Multiple syscalls are liable to PTRACE_EVENT_* tracing and thus
>> require full user context saved on the kernel stack. We only
>> save those registers not preserved by C code currently.
>>
>> do_exit() calls ptrace_stop() which may require access to all
>> saved registers. Add code to save additional registers in the
>> switch_stack struct for exit and exit_group syscalls (similar
>> to what is already done for fork, vfork and clone3). According
>> to Eric's analysis, execve and execveat can be traced as well,
>> so have been given the same treatment.
>>
>> I'm not sure what to do about io_uring_setup - added code to
>> save full context on entry, and some more code to save/restore
>> additional registers on the switch stack around the kworker thread
>> call in ret_from_kernel_thread, but this may well be redundant.
>>
>> I'd need specific test cases to exercise io_uring_setup in
>> particular, to see whether stack offsets for pt_regs and the
>> switch stack have been messed up.
>>
>> Boot tested on ARAnym - earlier version including io_uring entry
>> save also tested on real hardware unter heavy IO load.
>
>
> Are the registers that SAVE_SWITCH_STACK saves on m68k caller saved
> registers?

Yes, see comments in arch/m68k/include/asm/entry.h. From my 
understanding of arch/alpha/kernel/entry.S, that's the same on alpha.

Cheers,

	Michael

> The fact that the registers are saved somewhere on the stack on alpha is
> what forces the registers to be saved before calling exit and exec.
>
> Eric
>
>
>> CC: Eric W. Biederman <ebiederm@xmission.com>
>> CC: Linus Torvalds <torvalds@linux-foundation.org>
>> CC: Andreas Schwab <schwab@linux-m68k.org>
>> Signed-off-by: Michael Schmitz <schmitzmic@gmail.com>
>>
>> --
>> Changes from v1:
>>
>> - added exec, execve and io_uring_setup syscalls
>> - save extra registers around kworker thread calls
>> ---
>>  arch/m68k/kernel/entry.S              | 41 ++++++++++++++++++++++++++++++++++-
>>  arch/m68k/kernel/process.c            | 39 +++++++++++++++++++++++++++++++++
>>  arch/m68k/kernel/syscalls/syscall.tbl | 10 ++++-----
>>  3 files changed, 84 insertions(+), 6 deletions(-)
>>
>> diff --git a/arch/m68k/kernel/entry.S b/arch/m68k/kernel/entry.S
>> index 9dd76fb..02bf2a1 100644
>> --- a/arch/m68k/kernel/entry.S
>> +++ b/arch/m68k/kernel/entry.S
>> @@ -76,6 +76,41 @@ ENTRY(__sys_clone3)
>>  	lea	%sp@(28),%sp
>>  	rts
>>
>> +ENTRY(__sys_exit)
>> +	SAVE_SWITCH_STACK
>> +	pea	%sp@(SWITCH_STACK_SIZE)
>> +	jbsr	m68k_exit
>> +	lea	%sp@(28),%sp
>> +	rts
>> +
>> +ENTRY(__sys_exit_group)
>> +	SAVE_SWITCH_STACK
>> +	pea	%sp@(SWITCH_STACK_SIZE)
>> +	jbsr	m68k_exit_group
>> +	lea	%sp@(28),%sp
>> +	rts
>> +
>> +ENTRY(__sys_execve)
>> +	SAVE_SWITCH_STACK
>> +	pea	%sp@(SWITCH_STACK_SIZE)
>> +	jbsr	m68k_execve
>> +	lea	%sp@(28),%sp
>> +	rts
>> +
>> +ENTRY(__sys_execveat)
>> +	SAVE_SWITCH_STACK
>> +	pea	%sp@(SWITCH_STACK_SIZE)
>> +	jbsr	m68k_execveat
>> +	lea	%sp@(28),%sp
>> +	rts
>> +
>> +ENTRY(__sys_io_uring_setup)
>> +	SAVE_SWITCH_STACK
>> +	pea	%sp@(SWITCH_STACK_SIZE)
>> +	jbsr	m68k_io_uring_setup
>> +	lea	%sp@(28),%sp
>> +	rts
>> +
>>  ENTRY(sys_sigreturn)
>>  	SAVE_SWITCH_STACK
>>  	movel	%sp,%sp@-		  | switch_stack pointer
>> @@ -123,9 +158,13 @@ ENTRY(ret_from_kernel_thread)
>>  	| a3 contains the kernel thread payload, d7 - its argument
>>  	movel	%d1,%sp@-
>>  	jsr	schedule_tail
>> -	movel	%d7,(%sp)
>> +	addql	#4,%sp
>> +	| kernel threads can be traced!
>> +	SAVE_SWITCH_STACK
>> +	movel	%d7,%sp@-
>>  	jsr	%a3@
>>  	addql	#4,%sp
>> +	RESTORE_SWITCH_STACK
>>  	jra	ret_from_exception
>>
>>  #if defined(CONFIG_COLDFIRE) || !defined(CONFIG_MMU)
>> diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c
>> index da83cc8..298ac99 100644
>> --- a/arch/m68k/kernel/process.c
>> +++ b/arch/m68k/kernel/process.c
>> @@ -138,6 +138,45 @@ asmlinkage int m68k_clone3(struct pt_regs *regs)
>>  	return sys_clone3((struct clone_args __user *)regs->d1, regs->d2);
>>  }
>>
>> +/*
>> + * Because extra registers are saved on the stack after the sys_exit()
>> + * arguments, this C wrapper extracts them from pt_regs * and then calls the
>> + * generic sys_exit() implementation.
>> + */
>> +asmlinkage int m68k_exit(struct pt_regs *regs)
>> +{
>> +	return sys_exit(regs->d1);
>> +}
>> +
>> +/* Same for sys_exit_group ... */
>> +asmlinkage int m68k_exit_group(struct pt_regs *regs)
>> +{
>> +	return sys_exit_group(regs->d1);
>> +}
>> +
>> +/* Same for sys_exit_group ... */
>> +asmlinkage int m68k_execve(struct pt_regs *regs)
>> +{
>> +	return sys_execve((const char __user *)regs->d1,
>> +			(const char __user *const __user*)regs->d2,
>> +			(const char __user *const __user*)regs->d3);
>> +}
>> +
>> +/* Same for sys_exit_group ... */
>> +asmlinkage int m68k_execveat(struct pt_regs *regs)
>> +{
>> +	return sys_execveat(regs->d1, (const char __user *)regs->d2,
>> +			(const char __user *const __user*)regs->d3,
>> +			(const char __user *const __user*)regs->d4,
>> +			regs->d5);
>> +}
>> +
>> +/* and for sys_io_uring_create */
>> +asmlinkage int m68k_io_uring_setup(struct pt_regs *regs)
>> +{
>> +	return sys_io_uring_setup(regs->d1,(struct io_uring_params __user *)regs->d2);
>> +}
>> +
>>  int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg,
>>  		struct task_struct *p, unsigned long tls)
>>  {
>> diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
>> index 0dd019d..4a1ba1d 100644
>> --- a/arch/m68k/kernel/syscalls/syscall.tbl
>> +++ b/arch/m68k/kernel/syscalls/syscall.tbl
>> @@ -8,7 +8,7 @@
>>  # The <abi> is always "common" for this file
>>  #
>>  0	common	restart_syscall			sys_restart_syscall
>> -1	common	exit				sys_exit
>> +1	common	exit				__sys_exit
>>  2	common	fork				__sys_fork
>>  3	common	read				sys_read
>>  4	common	write				sys_write
>> @@ -18,7 +18,7 @@
>>  8	common	creat				sys_creat
>>  9	common	link				sys_link
>>  10	common	unlink				sys_unlink
>> -11	common	execve				sys_execve
>> +11	common	execve				__sys_execve
>>  12	common	chdir				sys_chdir
>>  13	common	time				sys_time32
>>  14	common	mknod				sys_mknod
>> @@ -254,7 +254,7 @@
>>  244	common	io_submit			sys_io_submit
>>  245	common	io_cancel			sys_io_cancel
>>  246	common	fadvise64			sys_fadvise64
>> -247	common	exit_group			sys_exit_group
>> +247	common	exit_group			__sys_exit_group
>>  248	common	lookup_dcookie			sys_lookup_dcookie
>>  249	common	epoll_create			sys_epoll_create
>>  250	common	epoll_ctl			sys_epoll_ctl
>> @@ -362,7 +362,7 @@
>>  352	common	getrandom			sys_getrandom
>>  353	common	memfd_create			sys_memfd_create
>>  354	common	bpf				sys_bpf
>> -355	common	execveat			sys_execveat
>> +355	common	execveat			__sys_execveat
>>  356	common	socket				sys_socket
>>  357	common	socketpair			sys_socketpair
>>  358	common	bind				sys_bind
>> @@ -424,7 +424,7 @@
>>  422	common	futex_time64			sys_futex
>>  423	common	sched_rr_get_interval_time64	sys_sched_rr_get_interval
>>  424	common	pidfd_send_signal		sys_pidfd_send_signal
>> -425	common	io_uring_setup			sys_io_uring_setup
>> +425	common	io_uring_setup			__sys_io_uring_setup
>>  426	common	io_uring_enter			sys_io_uring_enter
>>  427	common	io_uring_register		sys_io_uring_register
>>  428	common	open_tree			sys_open_tree

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2] m68k: save extra registers on more syscall entry points
  2021-06-18 17:17 ` Linus Torvalds
@ 2021-06-18 22:34   ` Michael Schmitz
  2021-06-18 23:38     ` Linus Torvalds
  0 siblings, 1 reply; 12+ messages in thread
From: Michael Schmitz @ 2021-06-18 22:34 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Geert Uytterhoeven, linux-arch, linux-m68k, Eric W. Biederman,
	Andreas Schwab

Hi Linus,

Am 19.06.2021 um 05:17 schrieb Linus Torvalds:
> On Thu, Jun 17, 2021 at 6:27 PM Michael Schmitz <schmitzmic@gmail.com> wrote:
>>
>> I'd need specific test cases to exercise io_uring_setup in
>> particular, to see whether stack offsets for pt_regs and the
>> switch stack have been messed up.
>
> I don't think doing this for io_uring_setup() will help any - the
> problem is not in that system call thread itself, it's purely in the
> kernel thread that it then starts.
>
> And the fact that io_uring_setup() has the full stack frame won't then
> help that kernel thread, for exactly the same reason that was true on
> alpha: copy_thread() will actually _create_ the full stack, but when
> we switch to it (through switch_to() -> resume()), the resume code in
> arch/m68k/kernel/entry.S will switch to that stack, and then do
> RESTORE_SWITCH_STACK which will consume it again.

Now I see ... I had glossed over the point in resume() where the stack 
pointer was switched to the new task.

> So I think m68k should do the same thing as Eric's patch for alpha: do
> the full stack for exit and exit_group, and for kernel thread creation
> - or at least PF_IO_WORKER), do an extra stack frame on the kernel
> stack, so that even after resume() we'll still have another copy of
> the frame.

Is your patch to copy_thread() to add the extra stack frame still needed?

(Eric added switch stack save/restore around the worker thread call in 
ret_from_exception, which pushes back the missing stack contents as far 
as I understand ... )

There seems to be no expectation for kernel worker threads to have valid 
saved user context, just space on the stack for the tracer to read. I'll 
drop saving full context for io_uring_setup in v3.

> The alternative would be to do what x86 does: see __switch_to_asm().
> Instead of doing that normal kernel entry/exit stack (with
> SAVE_SWITCH_STACK and RESTORE_SWITCH_STACK), x86 has it's own very
> special "only for task switching" stack frame thing, and leaves the
> pt_regs etc entirely alone.
>
> Of course, that "only for task switching" is _kind_of_ what the whole
> SAVE_SWITCH_STACK is for - it's part of the name after all - but the
> difference is that on alpha and m68k, it's also (and primarily) the
> "full state" stack frame, used not just for task switching, but for
> signal handling state and for ptrace too.
>
> So in theory, it would be good to split this up:
>
>  (a) have the signal handling and ptrace stack be one thing (maybe
> rename the "SWITCH" part of the operations to something else, like
> "EXTRA" or "SIGNAL" or whatever)

We can rename the macros, but we also expose struct switch_stack in 
uapi/asm/ptrace.h - hard to rename that one.

>  (b) make a separate "for task switching only" stack frame, which is
> used by that switch_to() -> resume() sequence, and that copy_thread()
> has a "struct inactive_task_frame" thing for..

That's more than a bit beyond my m68k assembly level, sorry. Maybe 
Andreas can help if we decide to go that way?

Cheers,

	Michael

>
> That way, the pt_regs/extra_regs stack frame that copy_thread()
> creates wouldn't then be eaten up by the task switch.
>
> But while that sounds like the right thing to do, it would be a rather
> bigger change. I'm not entirely sure it's worth it.
>
> Eric, comments?
>
>                 Linus
>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2] m68k: save extra registers on more syscall entry points
  2021-06-18 22:34   ` Michael Schmitz
@ 2021-06-18 23:38     ` Linus Torvalds
  2021-06-18 23:59       ` Michael Schmitz
  2021-06-19  1:32       ` Michael Schmitz
  0 siblings, 2 replies; 12+ messages in thread
From: Linus Torvalds @ 2021-06-18 23:38 UTC (permalink / raw)
  To: Michael Schmitz
  Cc: Geert Uytterhoeven, linux-arch, linux-m68k, Eric W. Biederman,
	Andreas Schwab

[-- Attachment #1: Type: text/plain, Size: 654 bytes --]

On Fri, Jun 18, 2021 at 3:34 PM Michael Schmitz <schmitzmic@gmail.com> wrote:
>
> Is your patch to copy_thread() to add the extra stack frame still needed?

So it's been a long time since I did any m68k assembly, but I think
the m68k patch for the PF_IO_WORKER thread case should look something
like the attached.

Note: my only m68k work was ever on the 68008, and used the Motorola
syntax, not the odd Sun assembler syntax, so my m68k asm skills really
aren't good.

Put another way: I'd be surprised if the attached patch actually
works, but I think it's fairly close. I tried to add comments to
explain the code at least a bit.

Hmm?

         Linus

[-- Attachment #2: patch.diff --]
[-- Type: text/x-patch, Size: 2100 bytes --]

 arch/m68k/kernel/entry.S   | 10 ++++++++++
 arch/m68k/kernel/process.c | 14 +++++++++-----
 2 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/arch/m68k/kernel/entry.S b/arch/m68k/kernel/entry.S
index 9dd76fbb7c6b..499f14d79640 100644
--- a/arch/m68k/kernel/entry.S
+++ b/arch/m68k/kernel/entry.S
@@ -119,6 +119,15 @@ ENTRY(ret_from_fork)
 	addql	#4,%sp
 	jra	ret_from_exception
 
+	| A kernel thread will jump here directly from resume,
+	| with the stack containing the full register state
+	| (pt_regs and switch_stack).
+	|
+	| The argument will be in d7, and the kernel function
+	| to call will be in a3.
+	|
+	| If the kernel function returns, we want to return
+	| to user space - it has done a kernel_execve().
 ENTRY(ret_from_kernel_thread)
 	| a3 contains the kernel thread payload, d7 - its argument
 	movel	%d1,%sp@-
@@ -126,6 +135,7 @@ ENTRY(ret_from_kernel_thread)
 	movel	%d7,(%sp)
 	jsr	%a3@
 	addql	#4,%sp
+	RESTORE_SWITCH_STACK
 	jra	ret_from_exception
 
 #if defined(CONFIG_COLDFIRE) || !defined(CONFIG_MMU)
diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c
index da83cc83e791..0705f14871a3 100644
--- a/arch/m68k/kernel/process.c
+++ b/arch/m68k/kernel/process.c
@@ -158,13 +158,17 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg,
 	p->thread.fs = get_fs().seg;
 
 	if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
-		/* kernel thread */
-		memset(frame, 0, sizeof(struct fork_frame));
+		struct switch_stack *kstp = &frame->sw - 1;
+
+		/* kernel thread - a kernel-side switch-stack and the full user fork_frame */
+		memset(kstp, 0, sizeof(struct switch_stack) + sizeof(struct fork_frame));
+
 		frame->regs.sr = PS_S;
-		frame->sw.a3 = usp; /* function */
-		frame->sw.d7 = arg;
-		frame->sw.retpc = (unsigned long)ret_from_kernel_thread;
+		kstp->a3 = usp; /* function */
+		kstp->d7 = arg;
+		kstp->retpc = (unsigned long)ret_from_kernel_thread;
 		p->thread.usp = 0;
+		p->thread.ksp = (unsigned long)kstp;
 		return 0;
 	}
 	memcpy(frame, container_of(current_pt_regs(), struct fork_frame, regs),

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH v2] m68k: save extra registers on more syscall entry points
  2021-06-18 23:38     ` Linus Torvalds
@ 2021-06-18 23:59       ` Michael Schmitz
  2021-06-19  1:32       ` Michael Schmitz
  1 sibling, 0 replies; 12+ messages in thread
From: Michael Schmitz @ 2021-06-18 23:59 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Geert Uytterhoeven, linux-arch, linux-m68k, Eric W. Biederman,
	Andreas Schwab

Hi Linus,

Am 19.06.2021 um 11:38 schrieb Linus Torvalds:
> On Fri, Jun 18, 2021 at 3:34 PM Michael Schmitz <schmitzmic@gmail.com> wrote:
>>
>> Is your patch to copy_thread() to add the extra stack frame still needed?
>
> So it's been a long time since I did any m68k assembly, but I think
> the m68k patch for the PF_IO_WORKER thread case should look something
> like the attached.

OK, that answers my question, thanks!

>
> Note: my only m68k work was ever on the 68008, and used the Motorola
> syntax, not the odd Sun assembler syntax, so my m68k asm skills really
> aren't good.
>
> Put another way: I'd be surprised if the attached patch actually
> works, but I think it's fairly close. I tried to add comments to
> explain the code at least a bit.
>
> Hmm?

I'll give it a spin (on the emulator).

Cheers,

	Michael

>
>          Linus
>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2] m68k: save extra registers on more syscall entry points
  2021-06-18 23:38     ` Linus Torvalds
  2021-06-18 23:59       ` Michael Schmitz
@ 2021-06-19  1:32       ` Michael Schmitz
  2021-06-19  1:54         ` Linus Torvalds
  1 sibling, 1 reply; 12+ messages in thread
From: Michael Schmitz @ 2021-06-19  1:32 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Geert Uytterhoeven, linux-arch, linux-m68k, Eric W. Biederman,
	Andreas Schwab

Hi Linus,

Am 19.06.2021 um 11:38 schrieb Linus Torvalds:
> On Fri, Jun 18, 2021 at 3:34 PM Michael Schmitz <schmitzmic@gmail.com> wrote:
>>
>> Is your patch to copy_thread() to add the extra stack frame still needed?
>
> So it's been a long time since I did any m68k assembly, but I think
> the m68k patch for the PF_IO_WORKER thread case should look something
> like the attached.
>
> Note: my only m68k work was ever on the 68008, and used the Motorola
> syntax, not the odd Sun assembler syntax, so my m68k asm skills really
> aren't good.
>
> Put another way: I'd be surprised if the attached patch actually
> works, but I think it's fairly close. I tried to add comments to
> explain the code at least a bit.

That went well:

*** FORMAT ERROR ***   FORMAT=0
Current process id is 1
BAD KERNEL TRAP: 00000000
Modules linked in:
PC: [<00002af0>] resume_userspace+0x14/0x16
SR: 2204  SP: (ptrval)  a2: 00000000
d0: 00000000    d1: 00000000    d2: 00000000    d3: 00000000
d4: 00000000    d5: 00000000    a0: 00000000    a1: 00000000
Process init (pid: 1, task=(ptrval))
Frame format=0
Stack from 0081bffc:
         19bc0000
Call Trace:
Code: 1029 0007 660c 4cdf 073e 201f 588f dfdf <4e73> 254f 03ec e308 660a 
487a ffe0 60ff 002a f6ba 598f 48e7 031e 486f 001c 61ff
Disabling lock debugging due to kernel taint
Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b

Looks like the zeroed frame was restored where we'd have expected an 
actual save frame?

I'll next try and apply your solution to IO worker threads only ...

Cheers,

	Michael

>
> Hmm?
>
>          Linus
>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2] m68k: save extra registers on more syscall entry points
  2021-06-19  1:32       ` Michael Schmitz
@ 2021-06-19  1:54         ` Linus Torvalds
  2021-06-19  2:13           ` Linus Torvalds
  2021-06-19  2:17           ` Michael Schmitz
  0 siblings, 2 replies; 12+ messages in thread
From: Linus Torvalds @ 2021-06-19  1:54 UTC (permalink / raw)
  To: Michael Schmitz
  Cc: Geert Uytterhoeven, linux-arch, linux-m68k, Eric W. Biederman,
	Andreas Schwab

On Fri, Jun 18, 2021 at 6:32 PM Michael Schmitz <schmitzmic@gmail.com> wrote:
>
> *** FORMAT ERROR ***   FORMAT=0
> Current process id is 1
> BAD KERNEL TRAP: 00000000
> Modules linked in:
> PC: [<00002af0>] resume_userspace+0x14/0x16
> SR: 2204  SP: (ptrval)  a2: 00000000
> d0: 00000000    d1: 00000000    d2: 00000000    d3: 00000000
> d4: 00000000    d5: 00000000    a0: 00000000    a1: 00000000

Yeah, so that's presumably the rte that causes an exception due to
garbage on the stack.

The registers being zero at that point is actually expected, so that's
not much of a hint. But yeah, clearly I got some stack initialization
offset or something wrong there, and I don't know modern m68k nearly
well enough to even guess where I screwed up.

             Linus

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2] m68k: save extra registers on more syscall entry points
  2021-06-19  1:54         ` Linus Torvalds
@ 2021-06-19  2:13           ` Linus Torvalds
  2021-06-19  2:52             ` Michael Schmitz
  2021-06-19  2:17           ` Michael Schmitz
  1 sibling, 1 reply; 12+ messages in thread
From: Linus Torvalds @ 2021-06-19  2:13 UTC (permalink / raw)
  To: Michael Schmitz
  Cc: Geert Uytterhoeven, linux-arch, linux-m68k, Eric W. Biederman,
	Andreas Schwab

[-- Attachment #1: Type: text/plain, Size: 2068 bytes --]

On Fri, Jun 18, 2021 at 6:54 PM Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> The registers being zero at that point is actually expected, so that's
> not much of a hint. But yeah, clearly I got some stack initialization
> offset or something wrong there, and I don't know modern m68k nearly
> well enough to even guess where I screwed up.

Oh. I think I might have an idea.

In that ret_from_kernel_thread code, after it returns from thge kernel
thread, I did

        addql   #4,%sp
        RESTORE_SWITCH_STACK
        jra     ret_from_exception

where that "RESTORE_SWITCH_STACK" loads the user space registers from
the user-space switch stack.

BUT.

The switch stack also has that "retpc", and that one should just be popped.

So I think that code should read

        addql   #4,%sp
        RESTORE_SWITCH_STACK
        addql   #4,%sp
        jra     ret_from_exception

because we want to restore the switch stack registers (they'll all be
0) but we then want to get rid of retpc too before we jump to
ret_from_exception, which will do the RESTORE_ALL.

(The first 'addql' is to remove the argument we've pushed on the stack
earlier in ret_from_kernel_thread, the second addql is to remove
retpc).

 All the *normal* uses of RESTORE_SWITCH_STACK will just do "rts", but
that's because they were called from the shallower stack. The
ret_from_kernel_thread case is kind of special, because it had that
stack frame layout built up manually, rather than have a call to it.

In that sense, it's a bit more like the 'do_trace_entry/exit' code,
which builds that switch stack using

        subql   #4,%sp
        SAVE_SWITCH_STACK

and then undoes it using that same

        RESTORE_SWITCH_STACK
        addql   #4,%sp

sequence that I _should_ have done in ret_from_kernel_thread. (Also,
see ret_from_signal)

I've attached an updated patch just in case my incoherent ramblings
above didn't explain what I meant. It's the same as the previous
patch, it just has that one extra stack update there.

Does _this_ one perhaps work?

                 Linus

[-- Attachment #2: patch.diff --]
[-- Type: text/x-patch, Size: 2116 bytes --]

 arch/m68k/kernel/entry.S   | 11 +++++++++++
 arch/m68k/kernel/process.c | 14 +++++++++-----
 2 files changed, 20 insertions(+), 5 deletions(-)

diff --git a/arch/m68k/kernel/entry.S b/arch/m68k/kernel/entry.S
index 9dd76fbb7c6b..88fe0e1a3793 100644
--- a/arch/m68k/kernel/entry.S
+++ b/arch/m68k/kernel/entry.S
@@ -119,6 +119,15 @@ ENTRY(ret_from_fork)
 	addql	#4,%sp
 	jra	ret_from_exception
 
+	| A kernel thread will jump here directly from resume,
+	| with the stack containing the full register state
+	| (pt_regs and switch_stack).
+	|
+	| The argument will be in d7, and the kernel function
+	| to call will be in a3.
+	|
+	| If the kernel function returns, we want to return
+	| to user space - it has done a kernel_execve().
 ENTRY(ret_from_kernel_thread)
 	| a3 contains the kernel thread payload, d7 - its argument
 	movel	%d1,%sp@-
@@ -126,6 +135,8 @@ ENTRY(ret_from_kernel_thread)
 	movel	%d7,(%sp)
 	jsr	%a3@
 	addql	#4,%sp
+	RESTORE_SWITCH_STACK
+	addql	#4,%sp
 	jra	ret_from_exception
 
 #if defined(CONFIG_COLDFIRE) || !defined(CONFIG_MMU)
diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c
index da83cc83e791..0705f14871a3 100644
--- a/arch/m68k/kernel/process.c
+++ b/arch/m68k/kernel/process.c
@@ -158,13 +158,17 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg,
 	p->thread.fs = get_fs().seg;
 
 	if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
-		/* kernel thread */
-		memset(frame, 0, sizeof(struct fork_frame));
+		struct switch_stack *kstp = &frame->sw - 1;
+
+		/* kernel thread - a kernel-side switch-stack and the full user fork_frame */
+		memset(kstp, 0, sizeof(struct switch_stack) + sizeof(struct fork_frame));
+
 		frame->regs.sr = PS_S;
-		frame->sw.a3 = usp; /* function */
-		frame->sw.d7 = arg;
-		frame->sw.retpc = (unsigned long)ret_from_kernel_thread;
+		kstp->a3 = usp; /* function */
+		kstp->d7 = arg;
+		kstp->retpc = (unsigned long)ret_from_kernel_thread;
 		p->thread.usp = 0;
+		p->thread.ksp = (unsigned long)kstp;
 		return 0;
 	}
 	memcpy(frame, container_of(current_pt_regs(), struct fork_frame, regs),

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH v2] m68k: save extra registers on more syscall entry points
  2021-06-19  1:54         ` Linus Torvalds
  2021-06-19  2:13           ` Linus Torvalds
@ 2021-06-19  2:17           ` Michael Schmitz
  1 sibling, 0 replies; 12+ messages in thread
From: Michael Schmitz @ 2021-06-19  2:17 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Geert Uytterhoeven, linux-arch, linux-m68k, Eric W. Biederman,
	Andreas Schwab

Hi Linus,

Am 19.06.2021 um 13:54 schrieb Linus Torvalds:
> On Fri, Jun 18, 2021 at 6:32 PM Michael Schmitz <schmitzmic@gmail.com> wrote:
>>
>> *** FORMAT ERROR ***   FORMAT=0
>> Current process id is 1
>> BAD KERNEL TRAP: 00000000
>> Modules linked in:
>> PC: [<00002af0>] resume_userspace+0x14/0x16
>> SR: 2204  SP: (ptrval)  a2: 00000000
>> d0: 00000000    d1: 00000000    d2: 00000000    d3: 00000000
>> d4: 00000000    d5: 00000000    a0: 00000000    a1: 00000000
>
> Yeah, so that's presumably the rte that causes an exception due to
> garbage on the stack.
>
> The registers being zero at that point is actually expected, so that's
> not much of a hint. But yeah, clearly I got some stack initialization
> offset or something wrong there, and I don't know modern m68k nearly
> well enough to even guess where I screwed up.

It might have been me screwing up - I hand applied the patch on top of 
my last one and fat fingered one bit (forgot to remove the addql #4,sp@ 
I had added before the switch stack save).

With the patch correctly applied, I get this dump:

Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b
CPU: 0 PID: 1 Comm: init Not tainted 5.13.0-rc1-atari-fpuemu-exitfix+ #1126
Stack from 0081be08:
         0081be08 003363d8 003363d8 002aebaa 000000ff 00000001 0002fa64 
00818a90
         0081a000 0000000b 0081be70 00028610 0032df62 0000000b 0000000b 
0002e0d2
         0002fa64 00000001 001a978c 0000000b 0081bf30 0081daf8 0081bf44 
00000000
         00000000 00000000 0081ec40 00029458 0000000b 0081a007 00030504 
0000000b
         00000000 00000000 00000000 00000000 00818550 00000000 0081bf90 
0081bf30
         00000000 0081bf68 00030066 0081da30 000042c4 0081bf30 00000000 
00000000
Call Trace: [<002aebaa>] panic+0xc0/0x282
  [<0002fa64>] do_signal_stop+0x0/0x14a
  [<00028610>] do_exit+0x152/0x6f4
  [<0002e0d2>] recalc_sigpending+0x0/0x1e
  [<0002fa64>] do_signal_stop+0x0/0x14a
  [<001a978c>] memcpy+0x0/0x88
  [<00029458>] do_group_exit+0x40/0x7e
  [<00030504>] get_signal+0x22c/0x510
  [<00030066>] force_sig_info_to_task+0x7e/0x8a
  [<000042c4>] do_notify_resume+0x3c/0x484
  [<000302b2>] force_sig_fault_to_task+0x30/0x3c
  [<000302d2>] force_sig_fault+0x14/0x1a
  [<00005eb8>] send_fault_sig+0x24/0x86
  [<00002b14>] do_signal_return+0x10/0x1a
  [<00007008>] atari_reset+0x90/0xbc
  [<0000c000>] clr_mant+0x8/0x14

No registers dumped at all - no idea how that happened.

I'll try your latest patch next ... bear with me, got a 9-year old 
chewing my ear off to entertain in between tests.

Cheers,

	Michael


>
>              Linus
>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2] m68k: save extra registers on more syscall entry points
  2021-06-19  2:13           ` Linus Torvalds
@ 2021-06-19  2:52             ` Michael Schmitz
  0 siblings, 0 replies; 12+ messages in thread
From: Michael Schmitz @ 2021-06-19  2:52 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Geert Uytterhoeven, linux-arch, linux-m68k, Eric W. Biederman,
	Andreas Schwab

Hi Linus,

that one boots OK now, thanks (applied on top of mine but that should 
not matter). I'll run a test on the actual hardware once the previous 
one is done.

Cheers,

	Michael


Am 19.06.2021 um 14:13 schrieb Linus Torvalds:
> On Fri, Jun 18, 2021 at 6:54 PM Linus Torvalds
> <torvalds@linux-foundation.org> wrote:
>>
>> The registers being zero at that point is actually expected, so that's
>> not much of a hint. But yeah, clearly I got some stack initialization
>> offset or something wrong there, and I don't know modern m68k nearly
>> well enough to even guess where I screwed up.
>
> Oh. I think I might have an idea.
>
> In that ret_from_kernel_thread code, after it returns from thge kernel
> thread, I did
>
>         addql   #4,%sp
>         RESTORE_SWITCH_STACK
>         jra     ret_from_exception
>
> where that "RESTORE_SWITCH_STACK" loads the user space registers from
> the user-space switch stack.
>
> BUT.
>
> The switch stack also has that "retpc", and that one should just be popped.
>
> So I think that code should read
>
>         addql   #4,%sp
>         RESTORE_SWITCH_STACK
>         addql   #4,%sp
>         jra     ret_from_exception
>
> because we want to restore the switch stack registers (they'll all be
> 0) but we then want to get rid of retpc too before we jump to
> ret_from_exception, which will do the RESTORE_ALL.
>
> (The first 'addql' is to remove the argument we've pushed on the stack
> earlier in ret_from_kernel_thread, the second addql is to remove
> retpc).
>
>  All the *normal* uses of RESTORE_SWITCH_STACK will just do "rts", but
> that's because they were called from the shallower stack. The
> ret_from_kernel_thread case is kind of special, because it had that
> stack frame layout built up manually, rather than have a call to it.
>
> In that sense, it's a bit more like the 'do_trace_entry/exit' code,
> which builds that switch stack using
>
>         subql   #4,%sp
>         SAVE_SWITCH_STACK
>
> and then undoes it using that same
>
>         RESTORE_SWITCH_STACK
>         addql   #4,%sp
>
> sequence that I _should_ have done in ret_from_kernel_thread. (Also,
> see ret_from_signal)
>
> I've attached an updated patch just in case my incoherent ramblings
> above didn't explain what I meant. It's the same as the previous
> patch, it just has that one extra stack update there.
>
> Does _this_ one perhaps work?
>
>                  Linus
>

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2021-06-19  2:53 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-06-18  1:27 [PATCH v2] m68k: save extra registers on more syscall entry points Michael Schmitz
2021-06-18 17:17 ` Linus Torvalds
2021-06-18 22:34   ` Michael Schmitz
2021-06-18 23:38     ` Linus Torvalds
2021-06-18 23:59       ` Michael Schmitz
2021-06-19  1:32       ` Michael Schmitz
2021-06-19  1:54         ` Linus Torvalds
2021-06-19  2:13           ` Linus Torvalds
2021-06-19  2:52             ` Michael Schmitz
2021-06-19  2:17           ` Michael Schmitz
2021-06-18 18:39 ` Eric W. Biederman
2021-06-18 19:06   ` Michael Schmitz

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.