* [PATCHv2 1/4] x86_64: cleanup THREAD_INFO(reg,offset) macro
2015-01-18 11:45 [PATCHv2 0/4] x86, entry: some cleanup and simplification Alexander van Heukelum
@ 2015-01-18 11:45 ` Alexander van Heukelum
2015-01-21 13:40 ` Denys Vlasenko
2015-01-18 11:45 ` [PATCHv2 2/4] x86_64: embrace KERNEL_STACK_OFFSET Alexander van Heukelum
` (3 subsequent siblings)
4 siblings, 1 reply; 18+ messages in thread
From: Alexander van Heukelum @ 2015-01-18 11:45 UTC (permalink / raw)
To: Andy Lutomirski, x86, linux-kernel
Cc: Frederic Weisbecker, Oleg Nesterov, Borislav Petkov, Rik van Riel
The macro THREAD_INFO(reg,offset) is used in assembly to compute the
offset between the user ptregs and the thread_info struct. Change
the macro and all its uses so that offset is given as the current
top of stack in the pt_regs frame. The generated code is identical.
Signed-off-by: Alexander van Heukelum <heukelum@fastmail.fm>
---
arch/x86/ia32/ia32entry.S | 30 +++++++++++++++---------------
arch/x86/include/asm/calling.h | 1 +
arch/x86/include/asm/thread_info.h | 2 +-
arch/x86/kernel/entry_64.S | 4 ++--
4 files changed, 19 insertions(+), 18 deletions(-)
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 156ebca..1c74f39 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -135,7 +135,7 @@ ENTRY(ia32_sysenter_target)
CFI_REL_OFFSET rsp,0
pushfq_cfi
/*CFI_REL_OFFSET rflags,0*/
- movl TI_sysenter_return+THREAD_INFO(%rsp,3*8-KERNEL_STACK_OFFSET),%r10d
+ movl TI_sysenter_return+THREAD_INFO(%rsp,EFLAGS),%r10d
CFI_REGISTER rip,r10
pushq_cfi $__USER32_CS
/*CFI_REL_OFFSET cs,0*/
@@ -161,8 +161,8 @@ ENTRY(ia32_sysenter_target)
jnz sysenter_fix_flags
sysenter_flags_fixed:
- orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
- testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,ARGOFFSET)
+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,ARGOFFSET)
CFI_REMEMBER_STATE
jnz sysenter_tracesys
cmpq $(IA32_NR_syscalls-1),%rax
@@ -174,10 +174,10 @@ sysenter_dispatch:
movq %rax,RAX-ARGOFFSET(%rsp)
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,ARGOFFSET)
jnz sysexit_audit
sysexit_from_sys_call:
- andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,ARGOFFSET)
/* clear IF, that popfq doesn't enable interrupts early */
andl $~0x200,EFLAGS-ARGOFFSET(%rsp)
movl RIP-ARGOFFSET(%rsp),%edx /* User %eip */
@@ -216,7 +216,7 @@ sysexit_from_sys_call:
.endm
.macro auditsys_exit exit
- testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,ARGOFFSET)
jnz ia32_ret_from_sys_call
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
@@ -231,7 +231,7 @@ sysexit_from_sys_call:
movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- testl %edi,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ testl %edi,TI_flags+THREAD_INFO(%rsp,ARGOFFSET)
jz \exit
CLEAR_RREGS -ARGOFFSET
jmp int_with_check
@@ -253,7 +253,7 @@ sysenter_fix_flags:
sysenter_tracesys:
#ifdef CONFIG_AUDITSYSCALL
- testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,ARGOFFSET)
jz sysenter_auditsys
#endif
SAVE_REST
@@ -324,8 +324,8 @@ ENTRY(ia32_cstar_target)
1: movl (%r8),%r9d
_ASM_EXTABLE(1b,ia32_badarg)
ASM_CLAC
- orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
- testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,ARGOFFSET)
+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,ARGOFFSET)
CFI_REMEMBER_STATE
jnz cstar_tracesys
cmpq $IA32_NR_syscalls-1,%rax
@@ -337,10 +337,10 @@ cstar_dispatch:
movq %rax,RAX-ARGOFFSET(%rsp)
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,ARGOFFSET)
jnz sysretl_audit
sysretl_from_sys_call:
- andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,ARGOFFSET)
RESTORE_ARGS 0,-ARG_SKIP,0,0,0
movl RIP-ARGOFFSET(%rsp),%ecx
CFI_REGISTER rip,rcx
@@ -368,7 +368,7 @@ sysretl_audit:
cstar_tracesys:
#ifdef CONFIG_AUDITSYSCALL
- testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,ARGOFFSET)
jz cstar_auditsys
#endif
xchgl %r9d,%ebp
@@ -434,8 +434,8 @@ ENTRY(ia32_syscall)
/* note the registers are not zero extended to the sf.
this could be a problem. */
SAVE_ARGS 0,1,0
- orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
- testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,ARGOFFSET)
+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,ARGOFFSET)
jnz ia32_tracesys
cmpq $(IA32_NR_syscalls-1),%rax
ja ia32_badsys
diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h
index 1f1297b..16ab13d 100644
--- a/arch/x86/include/asm/calling.h
+++ b/arch/x86/include/asm/calling.h
@@ -81,6 +81,7 @@ For 32-bit we have the following conventions - kernel is built with
#define EFLAGS 144
#define RSP 152
#define SS 160
+#define PTREGS_SIZE 168
#define ARGOFFSET R11
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index e82e95a..471037d 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -190,7 +190,7 @@ static inline unsigned long current_stack_pointer(void)
* Same if PER_CPU_VAR(kernel_stack) is, perhaps with some offset, already in
* a certain register (to be used in assembler memory operands).
*/
-#define THREAD_INFO(reg, off) KERNEL_STACK_OFFSET+(off)-THREAD_SIZE(reg)
+#define THREAD_INFO(reg, off) PTREGS_SIZE-(off)-THREAD_SIZE(reg)
#endif
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index db13655..9f9ca20 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -343,7 +343,7 @@ GLOBAL(system_call_after_swapgs)
movq_cfi rax,(ORIG_RAX-ARGOFFSET)
movq %rcx,RIP-ARGOFFSET(%rsp)
CFI_REL_OFFSET rip,RIP-ARGOFFSET
- testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,ARGOFFSET)
jnz tracesys
system_call_fastpath:
#if __SYSCALL_MASK == ~0
@@ -361,7 +361,7 @@ system_call_fastpath:
* Has incomplete stack frame and undefined top of stack.
*/
ret_from_sys_call:
- testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,ARGOFFSET)
jnz int_ret_from_sys_call_fixup /* Go the the slow path */
LOCKDEP_SYS_EXIT
--
2.1.0
^ permalink raw reply related [flat|nested] 18+ messages in thread
* Re: [PATCHv2 1/4] x86_64: cleanup THREAD_INFO(reg,offset) macro
2015-01-18 11:45 ` [PATCHv2 1/4] x86_64: cleanup THREAD_INFO(reg,offset) macro Alexander van Heukelum
@ 2015-01-21 13:40 ` Denys Vlasenko
2015-01-21 16:20 ` Alexander van Heukelum
0 siblings, 1 reply; 18+ messages in thread
From: Denys Vlasenko @ 2015-01-21 13:40 UTC (permalink / raw)
To: Alexander van Heukelum
Cc: Andy Lutomirski, X86 ML, Linux Kernel Mailing List,
Frederic Weisbecker, Oleg Nesterov, Borislav Petkov,
Rik van Riel
On Sun, Jan 18, 2015 at 12:45 PM, Alexander van Heukelum
<heukelum@fastmail.fm> wrote:
> The macro THREAD_INFO(reg,offset) is used in assembly to compute the
> offset between the user ptregs and the thread_info struct. Change
> the macro and all its uses so that offset is given as the current
> top of stack in the pt_regs frame. The generated code is identical.
What is the purpose of doing this?
I don't mean to say that it's pointless, but it is also not obvious
why it's better than the code before patch.
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCHv2 1/4] x86_64: cleanup THREAD_INFO(reg,offset) macro
2015-01-21 13:40 ` Denys Vlasenko
@ 2015-01-21 16:20 ` Alexander van Heukelum
2015-01-21 18:04 ` Borislav Petkov
0 siblings, 1 reply; 18+ messages in thread
From: Alexander van Heukelum @ 2015-01-21 16:20 UTC (permalink / raw)
To: Denys Vlasenko
Cc: Andy Lutomirski, X86 ML, Linux Kernel Mailing List,
Frederic Weisbecker, Oleg Nesterov, Borislav Petkov,
Rik van Riel
On Wed, Jan 21, 2015, at 14:40, Denys Vlasenko wrote:
> On Sun, Jan 18, 2015 at 12:45 PM, Alexander van Heukelum
> <heukelum@fastmail.fm> wrote:
> > The macro THREAD_INFO(reg,offset) is used in assembly to compute the
> > offset between the user ptregs and the thread_info struct. Change
> > the macro and all its uses so that offset is given as the current
> > top of stack in the pt_regs frame. The generated code is identical.
>
> What is the purpose of doing this?
>
> I don't mean to say that it's pointless, but it is also not obvious
> why it's better than the code before patch.
Just because I had a hard time understanding what the offset
parameter means. Turns out Borislav did another variant in his
patchset. Quoting him from the commit message which made
that the change:
> Semi-mysterious expressions THREAD_INFO(%rsp,RIP) - "why RIP??"
> are now replaced by more logical THREAD_INFO(%rsp,SIZEOF_PTREGS) -
> "rsp is SIZEOF_PTREGS bytes below the top, calculate
> thread_info's address using that information"
Borislav expresses "offset" as an offset from the stack's page
boundary,while I chose the offset from the start of struct
pt_regs on the stack.
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCHv2 1/4] x86_64: cleanup THREAD_INFO(reg,offset) macro
2015-01-21 16:20 ` Alexander van Heukelum
@ 2015-01-21 18:04 ` Borislav Petkov
2015-01-21 18:48 ` Alexander van Heukelum
0 siblings, 1 reply; 18+ messages in thread
From: Borislav Petkov @ 2015-01-21 18:04 UTC (permalink / raw)
To: Alexander van Heukelum
Cc: Denys Vlasenko, Andy Lutomirski, X86 ML,
Linux Kernel Mailing List, Frederic Weisbecker, Oleg Nesterov,
Rik van Riel
On Wed, Jan 21, 2015 at 05:20:09PM +0100, Alexander van Heukelum wrote:
> On Wed, Jan 21, 2015, at 14:40, Denys Vlasenko wrote:
> > On Sun, Jan 18, 2015 at 12:45 PM, Alexander van Heukelum
> > <heukelum@fastmail.fm> wrote:
> > > The macro THREAD_INFO(reg,offset) is used in assembly to compute the
> > > offset between the user ptregs and the thread_info struct. Change
> > > the macro and all its uses so that offset is given as the current
> > > top of stack in the pt_regs frame. The generated code is identical.
> >
> > What is the purpose of doing this?
> >
> > I don't mean to say that it's pointless, but it is also not obvious
> > why it's better than the code before patch.
>
> Just because I had a hard time understanding what the offset
> parameter means. Turns out Borislav did another variant in his
> patchset. Quoting him from the commit message which made
> that the change:
> > Semi-mysterious expressions THREAD_INFO(%rsp,RIP) - "why RIP??"
> > are now replaced by more logical THREAD_INFO(%rsp,SIZEOF_PTREGS) -
> > "rsp is SIZEOF_PTREGS bytes below the top, calculate
> > thread_info's address using that information"
>
> Borislav expresses "offset" as an offset from the stack's page
> boundary,while I chose the offset from the start of struct
> pt_regs on the stack.
I've never done any such thing :-)
s/Borislav/Denys/g.
--
Regards/Gruss,
Boris.
ECO tip #101: Trim your mails when you reply.
--
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCHv2 1/4] x86_64: cleanup THREAD_INFO(reg,offset) macro
2015-01-21 18:04 ` Borislav Petkov
@ 2015-01-21 18:48 ` Alexander van Heukelum
0 siblings, 0 replies; 18+ messages in thread
From: Alexander van Heukelum @ 2015-01-21 18:48 UTC (permalink / raw)
To: Borislav Petkov
Cc: Denys Vlasenko, Andy Lutomirski, X86 ML,
Linux Kernel Mailing List, Frederic Weisbecker, Oleg Nesterov,
Rik van Riel
On Wed, Jan 21, 2015, at 19:04, Borislav Petkov wrote:
> On Wed, Jan 21, 2015 at 05:20:09PM +0100, Alexander van Heukelum wrote:
> > On Wed, Jan 21, 2015, at 14:40, Denys Vlasenko wrote:
> > > On Sun, Jan 18, 2015 at 12:45 PM, Alexander van Heukelum
> > > <heukelum@fastmail.fm> wrote:
> > > > The macro THREAD_INFO(reg,offset) is used in assembly to compute the
> > > > offset between the user ptregs and the thread_info struct. Change
> > > > the macro and all its uses so that offset is given as the current
> > > > top of stack in the pt_regs frame. The generated code is identical.
> > >
> > > What is the purpose of doing this?
> > >
> > > I don't mean to say that it's pointless, but it is also not obvious
> > > why it's better than the code before patch.
> >
> > Just because I had a hard time understanding what the offset
> > parameter means. Turns out Borislav did another variant in his
> > patchset. Quoting him from the commit message which made
> > that the change:
> > > Semi-mysterious expressions THREAD_INFO(%rsp,RIP) - "why RIP??"
> > > are now replaced by more logical THREAD_INFO(%rsp,SIZEOF_PTREGS) -
> > > "rsp is SIZEOF_PTREGS bytes below the top, calculate
> > > thread_info's address using that information"
> >
> > Borislav expresses "offset" as an offset from the stack's page
> > boundary,while I chose the offset from the start of struct
> > pt_regs on the stack.
>
> I've never done any such thing :-)
>
> s/Borislav/Denys/g.
twice ... and stop using third person too... Oops.
Err. sorry :-/.
> --
> Regards/Gruss,
> Boris.
>
> ECO tip #101: Trim your mails when you reply.
> --
^ permalink raw reply [flat|nested] 18+ messages in thread
* [PATCHv2 2/4] x86_64: embrace KERNEL_STACK_OFFSET
2015-01-18 11:45 [PATCHv2 0/4] x86, entry: some cleanup and simplification Alexander van Heukelum
2015-01-18 11:45 ` [PATCHv2 1/4] x86_64: cleanup THREAD_INFO(reg,offset) macro Alexander van Heukelum
@ 2015-01-18 11:45 ` Alexander van Heukelum
2015-01-21 13:44 ` Denys Vlasenko
2015-01-18 11:45 ` [PATCHv2 3/4] i386: clean up KERNEL_STACK_OFFSET Alexander van Heukelum
` (2 subsequent siblings)
4 siblings, 1 reply; 18+ messages in thread
From: Alexander van Heukelum @ 2015-01-18 11:45 UTC (permalink / raw)
To: Andy Lutomirski, x86, linux-kernel
Cc: Frederic Weisbecker, Oleg Nesterov, Borislav Petkov, Rik van Riel
KERNEL_STACK_OFFSET is the offset from the top of the kernel stack
page to the value of the kernel_stack percpu variable. This patch
changes KERNEL_STACK_OFFSET to configure a reserved space of 16
bytes above the user ptregs frame. KERNEL_STACK_OFFSET must be
set to a multiple of 16 bytes due to the automatic stack alignment
of interrupts, traps, and exceptions on x86_64.
Also change task_pt_regs to be independant of the thread's current
sp0 setting, like i386, and use it to initialize thread.sp0 in
copy_thread.
Signed-off-by: Alexander van Heukelum <heukelum@fastmail.fm>
---
arch/x86/ia32/ia32entry.S | 3 +--
arch/x86/include/asm/processor.h | 11 ++++++++---
arch/x86/include/asm/thread_info.h | 13 ++++++++++++-
arch/x86/kernel/entry_64.S | 2 +-
arch/x86/kernel/process_64.c | 5 ++---
5 files changed, 24 insertions(+), 10 deletions(-)
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 1c74f39..4c6c5d9 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -122,7 +122,6 @@ ENTRY(ia32_sysenter_target)
CFI_REGISTER rsp,rbp
SWAPGS_UNSAFE_STACK
movq PER_CPU_VAR(kernel_stack), %rsp
- addq $(KERNEL_STACK_OFFSET),%rsp
/*
* No need to follow this irqs on/off section: the syscall
* disabled irqs, here we enable it straight after entry:
@@ -304,7 +303,7 @@ ENTRY(ia32_cstar_target)
* disabled irqs and here we enable it straight after entry:
*/
ENABLE_INTERRUPTS(CLBR_NONE)
- SAVE_ARGS 8,0,0
+ SAVE_ARGS 6*8,0,0 /* skip: hardware stackframe and orig_rax */
movl %eax,%eax /* zero extension */
movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
movq %rcx,RIP-ARGOFFSET(%rsp)
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index a092a0c..97117d1 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -919,11 +919,13 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk);
#define STACK_TOP_MAX TASK_SIZE_MAX
#define INIT_THREAD { \
- .sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
+ .sp0 = (unsigned long)&init_stack + \
+ sizeof(init_stack) - KERNEL_STACK_OFFSET \
}
#define INIT_TSS { \
- .x86_tss.sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
+ .x86_tss.sp0 = (unsigned long)&init_stack + \
+ sizeof(init_stack) - KERNEL_STACK_OFFSET \
}
/*
@@ -932,7 +934,10 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk);
*/
#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.sp - 8))
-#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1)
+#define task_pt_regs(task) \
+ ((struct pt_regs *)((unsigned long)task_stack_page(task) + \
+ THREAD_SIZE - KERNEL_STACK_OFFSET) - 1)
+
extern unsigned long KSTK_ESP(struct task_struct *task);
/*
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 471037d..9f0c47f 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -149,7 +149,18 @@ struct thread_info {
#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
#define STACK_WARN (THREAD_SIZE/8)
+
+/*
+ * Amount of reserved space between the top of the kernel stack page and the
+ * user ptregs frame.
+ * On x86_64, KERNEL_STACK_OFFSET must be set to a multiple of 16 bytes due
+ * to its automatic stack alignment for interrupts, traps, and exceptions.
+ */
+#ifdef CONFIG_X86_32
#define KERNEL_STACK_OFFSET (5*(BITS_PER_LONG/8))
+#else
+#define KERNEL_STACK_OFFSET (2*(BITS_PER_LONG/8))
+#endif
/*
* macros/functions for gaining access to the thread information structure
@@ -190,7 +201,7 @@ static inline unsigned long current_stack_pointer(void)
* Same if PER_CPU_VAR(kernel_stack) is, perhaps with some offset, already in
* a certain register (to be used in assembler memory operands).
*/
-#define THREAD_INFO(reg, off) PTREGS_SIZE-(off)-THREAD_SIZE(reg)
+#define THREAD_INFO(reg, off) PTREGS_SIZE+KERNEL_STACK_OFFSET-(off)-THREAD_SIZE(reg)
#endif
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 9f9ca20..6b95c2f 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -339,7 +339,7 @@ GLOBAL(system_call_after_swapgs)
* and short:
*/
ENABLE_INTERRUPTS(CLBR_NONE)
- SAVE_ARGS 8, 0, rax_enosys=1
+ SAVE_ARGS 6*8, 0, rax_enosys=1 /* skip: hardware stackframe and orig_rax */
movq_cfi rax,(ORIG_RAX-ARGOFFSET)
movq %rcx,RIP-ARGOFFSET(%rsp)
CFI_REL_OFFSET rip,RIP-ARGOFFSET
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 5a2c029..d579ebf 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -155,12 +155,11 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
unsigned long arg, struct task_struct *p)
{
int err;
- struct pt_regs *childregs;
+ struct pt_regs *childregs = task_pt_regs(p);
struct task_struct *me = current;
- p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE;
- childregs = task_pt_regs(p);
p->thread.sp = (unsigned long) childregs;
+ p->thread.sp0 = (unsigned long) (childregs + 1);
p->thread.usersp = me->thread.usersp;
set_tsk_thread_flag(p, TIF_FORK);
p->thread.io_bitmap_ptr = NULL;
--
2.1.0
^ permalink raw reply related [flat|nested] 18+ messages in thread
* Re: [PATCHv2 2/4] x86_64: embrace KERNEL_STACK_OFFSET
2015-01-18 11:45 ` [PATCHv2 2/4] x86_64: embrace KERNEL_STACK_OFFSET Alexander van Heukelum
@ 2015-01-21 13:44 ` Denys Vlasenko
2015-01-21 16:29 ` Alexander van Heukelum
0 siblings, 1 reply; 18+ messages in thread
From: Denys Vlasenko @ 2015-01-21 13:44 UTC (permalink / raw)
To: Alexander van Heukelum
Cc: Andy Lutomirski, X86 ML, Linux Kernel Mailing List,
Frederic Weisbecker, Oleg Nesterov, Borislav Petkov,
Rik van Riel
On Sun, Jan 18, 2015 at 12:45 PM, Alexander van Heukelum
<heukelum@fastmail.fm> wrote:
> KERNEL_STACK_OFFSET is the offset from the top of the kernel stack
> page to the value of the kernel_stack percpu variable. This patch
> changes KERNEL_STACK_OFFSET to configure a reserved space of 16
> bytes above the user ptregs frame. KERNEL_STACK_OFFSET must be
> set to a multiple of 16 bytes due to the automatic stack alignment
> of interrupts, traps, and exceptions on x86_64.
I propose to set kernel_stack percpu variable to point
to the top of kernel stack (obvious, isn't it?)
and eliminate KERNEL_STACK_OFFSET altogether.
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCHv2 2/4] x86_64: embrace KERNEL_STACK_OFFSET
2015-01-21 13:44 ` Denys Vlasenko
@ 2015-01-21 16:29 ` Alexander van Heukelum
2015-01-23 0:53 ` Denys Vlasenko
0 siblings, 1 reply; 18+ messages in thread
From: Alexander van Heukelum @ 2015-01-21 16:29 UTC (permalink / raw)
To: Denys Vlasenko
Cc: Andy Lutomirski, X86 ML, Linux Kernel Mailing List,
Frederic Weisbecker, Oleg Nesterov, Borislav Petkov,
Rik van Riel
On Wed, Jan 21, 2015, at 14:44, Denys Vlasenko wrote:
> On Sun, Jan 18, 2015 at 12:45 PM, Alexander van Heukelum
> <heukelum@fastmail.fm> wrote:
> > KERNEL_STACK_OFFSET is the offset from the top of the kernel stack
> > page to the value of the kernel_stack percpu variable. This patch
> > changes KERNEL_STACK_OFFSET to configure a reserved space of 16
> > bytes above the user ptregs frame. KERNEL_STACK_OFFSET must be
> > set to a multiple of 16 bytes due to the automatic stack alignment
> > of interrupts, traps, and exceptions on x86_64.
>
> I propose to set kernel_stack percpu variable to point
> to the top of kernel stack (obvious, isn't it?)
> and eliminate KERNEL_STACK_OFFSET altogether.
By "top of kernel stack", do you mean the page boundary or the
top of struct pt_regs on the kernel stack? (is it really that obvious?)
I think Borislav did the latter for x86_64 in his patchset.
Eliminating KERNEL_STACK_OFFSET was what I did in my first
attempt, and I broke i386 by not thinking through what x86 common
code was really doing :).
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCHv2 2/4] x86_64: embrace KERNEL_STACK_OFFSET
2015-01-21 16:29 ` Alexander van Heukelum
@ 2015-01-23 0:53 ` Denys Vlasenko
0 siblings, 0 replies; 18+ messages in thread
From: Denys Vlasenko @ 2015-01-23 0:53 UTC (permalink / raw)
To: Alexander van Heukelum
Cc: Andy Lutomirski, X86 ML, Linux Kernel Mailing List,
Frederic Weisbecker, Oleg Nesterov, Borislav Petkov,
Rik van Riel
On Wed, Jan 21, 2015 at 5:29 PM, Alexander van Heukelum
<heukelum@fastmail.fm> wrote:
> On Wed, Jan 21, 2015, at 14:44, Denys Vlasenko wrote:
>> On Sun, Jan 18, 2015 at 12:45 PM, Alexander van Heukelum
>> <heukelum@fastmail.fm> wrote:
>> > KERNEL_STACK_OFFSET is the offset from the top of the kernel stack
>> > page to the value of the kernel_stack percpu variable. This patch
>> > changes KERNEL_STACK_OFFSET to configure a reserved space of 16
>> > bytes above the user ptregs frame. KERNEL_STACK_OFFSET must be
>> > set to a multiple of 16 bytes due to the automatic stack alignment
>> > of interrupts, traps, and exceptions on x86_64.
>>
>> I propose to set kernel_stack percpu variable to point
>> to the top of kernel stack (obvious, isn't it?)
>> and eliminate KERNEL_STACK_OFFSET altogether.
>
> By "top of kernel stack", do you mean the page boundary or the
> top of struct pt_regs on the kernel stack? (is it really that obvious?)
> I think Borislav did the latter for x86_64 in his patchset.
Page boundary.
kernel_stack is currently initialized as follows:
this_cpu_write(kernel_stack,
(unsigned long)task_stack_page(next_p) +
THREAD_SIZE - KERNEL_STACK_OFFSET);
i.e. it points KERNEL_STACK_OFFSET bytes below top-of-stack,
which is two pages above task_struct.
Why do we have KERNEL_STACK_OFFSET?
The original idea was that on SYSCALL instruction entry, which
does not create iret stack, we can eliminate one "sub $5*8,%rsp"
instruction. This idea currently does not work, because we
have such instruction anyway (it allocates pr_regs). Nothing is saved there.
And here, in 32-bit compat code:
ENTRY(ia32_sysenter_target)
CFI_STARTPROC32 simple
CFI_SIGNAL_FRAME
CFI_DEF_CFA rsp,0
CFI_REGISTER rsp,rbp
SWAPGS_UNSAFE_STACK
movq PER_CPU_VAR(kernel_stack), %rsp
addq $(KERNEL_STACK_OFFSET),%rsp
we even need to _undo_ the "KERNEL_STACK_OFFSET optimization"
(last insn).
My patch "[PATCH 09/11] x86: get rid of KERNEL_STACK_OFFSET"
simply drops the KERNEL_STACK_OFFSET thing.
^ permalink raw reply [flat|nested] 18+ messages in thread
* [PATCHv2 3/4] i386: clean up KERNEL_STACK_OFFSET
2015-01-18 11:45 [PATCHv2 0/4] x86, entry: some cleanup and simplification Alexander van Heukelum
2015-01-18 11:45 ` [PATCHv2 1/4] x86_64: cleanup THREAD_INFO(reg,offset) macro Alexander van Heukelum
2015-01-18 11:45 ` [PATCHv2 2/4] x86_64: embrace KERNEL_STACK_OFFSET Alexander van Heukelum
@ 2015-01-18 11:45 ` Alexander van Heukelum
2015-01-18 11:45 ` [PATCHv2 4/4] x86_64, entry: Create IRET-compatible stack frame at syscall entry Alexander van Heukelum
2015-01-18 12:05 ` [PATCHv2 0/4] x86, entry: some cleanup and simplification Borislav Petkov
4 siblings, 0 replies; 18+ messages in thread
From: Alexander van Heukelum @ 2015-01-18 11:45 UTC (permalink / raw)
To: Andy Lutomirski, x86, linux-kernel
Cc: Frederic Weisbecker, Oleg Nesterov, Borislav Petkov, Rik van Riel
On i386, 8 bytes are reserved above the user ptregs frame. The area is
unused, but "necessary to guarantee that the entire "struct pt_regs" is
accessible even if the CPU haven't stored the SS/ESP registers on the
stack (interrupt gate does not save these registers when switching to
the same priv ring)."
Use KERNEL_STACK_OFFSET to make the size of this area configurable and
remove the difference between the sp0 setting in the tss and the percpu
variable kernel_stack.
For i386, KERNEL_STACK_OFFSET must be at least 8 bytes for the reason
mentioned above and must be a multiple of 4 bytes, the minimal stack
alignment.
Signed-off-by: Alexander van Heukelum <heukelum@fastmail.fm>
---
arch/x86/include/asm/processor.h | 32 +++++++-------------------------
arch/x86/include/asm/thread_info.h | 10 ++++++----
arch/x86/kernel/entry_32.S | 5 +++--
3 files changed, 16 insertions(+), 31 deletions(-)
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 97117d1..f424e5f 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -842,7 +842,8 @@ static inline void spin_lock_prefetch(const void *x)
#define STACK_TOP_MAX STACK_TOP
#define INIT_THREAD { \
- .sp0 = sizeof(init_stack) + (long)&init_stack, \
+ .sp0 = sizeof(init_stack) + (long)&init_stack \
+ - KERNEL_STACK_OFFSET, \
.vm86_info = NULL, \
.sysenter_cs = __KERNEL_CS, \
.io_bitmap_ptr = NULL, \
@@ -856,7 +857,8 @@ static inline void spin_lock_prefetch(const void *x)
*/
#define INIT_TSS { \
.x86_tss = { \
- .sp0 = sizeof(init_stack) + (long)&init_stack, \
+ .sp0 = sizeof(init_stack) + (long)&init_stack \
+ - KERNEL_STACK_OFFSET, \
.ss0 = __KERNEL_DS, \
.ss1 = __KERNEL_CS, \
.io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \
@@ -866,29 +868,9 @@ static inline void spin_lock_prefetch(const void *x)
extern unsigned long thread_saved_pc(struct task_struct *tsk);
-#define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long))
-#define KSTK_TOP(info) \
-({ \
- unsigned long *__ptr = (unsigned long *)(info); \
- (unsigned long)(&__ptr[THREAD_SIZE_LONGS]); \
-})
-
-/*
- * The below -8 is to reserve 8 bytes on top of the ring0 stack.
- * This is necessary to guarantee that the entire "struct pt_regs"
- * is accessible even if the CPU haven't stored the SS/ESP registers
- * on the stack (interrupt gate does not save these registers
- * when switching to the same priv ring).
- * Therefore beware: accessing the ss/esp fields of the
- * "struct pt_regs" is possible, but they may contain the
- * completely wrong values.
- */
-#define task_pt_regs(task) \
-({ \
- struct pt_regs *__regs__; \
- __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \
- __regs__ - 1; \
-})
+#define task_pt_regs(task) \
+ ((struct pt_regs *)((unsigned long)task_stack_page(task) + \
+ THREAD_SIZE - KERNEL_STACK_OFFSET) - 1)
#define KSTK_ESP(task) (task_pt_regs(task)->sp)
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 9f0c47f..36b8a10 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -153,14 +153,16 @@ struct thread_info {
/*
* Amount of reserved space between the top of the kernel stack page and the
* user ptregs frame.
+ * On i386, this is necessary to guarantee that the entire "struct pt_regs"
+ * is accessible even if the CPU hasn't stored the SS/ESP registers on the
+ * stack (an interrupt gate does not save these registers when switching to
+ * the same priv ring). Therefore beware: accessing the ss/esp fields of the
+ * "struct pt_regs" is possible, but they may contain the completely wrong
+ * values.
* On x86_64, KERNEL_STACK_OFFSET must be set to a multiple of 16 bytes due
* to its automatic stack alignment for interrupts, traps, and exceptions.
*/
-#ifdef CONFIG_X86_32
-#define KERNEL_STACK_OFFSET (5*(BITS_PER_LONG/8))
-#else
#define KERNEL_STACK_OFFSET (2*(BITS_PER_LONG/8))
-#endif
/*
* macros/functions for gaining access to the thread information structure
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 000d419..e94b994 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -396,9 +396,10 @@ sysenter_past_esp:
/*
* Push current_thread_info()->sysenter_return to the stack.
* A tiny bit of offset fixup is necessary - 4*4 means the 4 words
- * pushed above; +8 corresponds to copy_thread's esp0 setting.
+ * pushed above; KERNEL_STACK_OFFSET corresponds to copy_thread's
+ * esp0 setting.
*/
- pushl_cfi ((TI_sysenter_return)-THREAD_SIZE+8+4*4)(%esp)
+ pushl_cfi ((TI_sysenter_return)-THREAD_SIZE+KERNEL_STACK_OFFSET+4*4)(%esp)
CFI_REL_OFFSET eip, 0
pushl_cfi %eax
--
2.1.0
^ permalink raw reply related [flat|nested] 18+ messages in thread
* [PATCHv2 4/4] x86_64, entry: Create IRET-compatible stack frame at syscall entry
2015-01-18 11:45 [PATCHv2 0/4] x86, entry: some cleanup and simplification Alexander van Heukelum
` (2 preceding siblings ...)
2015-01-18 11:45 ` [PATCHv2 3/4] i386: clean up KERNEL_STACK_OFFSET Alexander van Heukelum
@ 2015-01-18 11:45 ` Alexander van Heukelum
2015-01-18 16:38 ` Andy Lutomirski
2015-01-18 12:05 ` [PATCHv2 0/4] x86, entry: some cleanup and simplification Borislav Petkov
4 siblings, 1 reply; 18+ messages in thread
From: Alexander van Heukelum @ 2015-01-18 11:45 UTC (permalink / raw)
To: Andy Lutomirski, x86, linux-kernel
Cc: Frederic Weisbecker, Oleg Nesterov, Borislav Petkov, Rik van Riel
Create an IRET-compatible top of stack at syscall entry and use this
information to return to user mode in the sysret path. This removes
the need for the FIXUP_TOP_OF_STACK and RESTORE_TOP_OF_STACK macros.
Signed-off-by: Alexander van Heukelum <heukelum@fastmail.fm>
---
arch/x86/kernel/entry_64.S | 75 +++++++++++++---------------------------------
1 file changed, 21 insertions(+), 54 deletions(-)
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 6b95c2f..c4cb8f1 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -33,8 +33,6 @@
* - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS.
* Gives a full stack frame.
* - ENTRY/END Define functions in the symbol table.
- * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack
- * frame that is otherwise undefined after a SYSCALL
* - TRACE_IRQ_* - Trace hard interrupt state for lock debugging.
* - idtentry - Define exception entry points.
*/
@@ -130,33 +128,6 @@ ENDPROC(native_usergs_sysret64)
#endif
/*
- * C code is not supposed to know about undefined top of stack. Every time
- * a C function with an pt_regs argument is called from the SYSCALL based
- * fast path FIXUP_TOP_OF_STACK is needed.
- * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
- * manipulation.
- */
-
- /* %rsp:at FRAMEEND */
- .macro FIXUP_TOP_OF_STACK tmp offset=0
- movq PER_CPU_VAR(old_rsp),\tmp
- movq \tmp,RSP+\offset(%rsp)
- movq $__USER_DS,SS+\offset(%rsp)
- movq $__USER_CS,CS+\offset(%rsp)
- movq RIP+\offset(%rsp),\tmp /* get rip */
- movq \tmp,RCX+\offset(%rsp) /* copy it to rcx as sysret would do */
- movq R11+\offset(%rsp),\tmp /* get eflags */
- movq \tmp,EFLAGS+\offset(%rsp)
- .endm
-
- .macro RESTORE_TOP_OF_STACK tmp offset=0
- movq RSP+\offset(%rsp),\tmp
- movq \tmp,PER_CPU_VAR(old_rsp)
- movq EFLAGS+\offset(%rsp),\tmp
- movq \tmp,R11+\offset(%rsp)
- .endm
-
-/*
* initial frame state for interrupts (and exceptions without error code)
*/
.macro EMPTY_FRAME start=1 offset=0
@@ -272,7 +243,6 @@ ENTRY(ret_from_fork)
testl $_TIF_IA32, TI_flags(%rcx) # 32-bit compat task needs IRET
jnz int_ret_from_sys_call
- RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET
jmp ret_from_sys_call # go to the SYSRET fastpath
1:
@@ -339,10 +309,24 @@ GLOBAL(system_call_after_swapgs)
* and short:
*/
ENABLE_INTERRUPTS(CLBR_NONE)
- SAVE_ARGS 6*8, 0, rax_enosys=1 /* skip: hardware stackframe and orig_rax */
+ /*
+ * Save user mode rsp (temporarily saved above in old_rsp),
+ * rflags (%r11), rip (%rcx) and segments (fixed values) on
+ * the stack as a regular interrupt frame.
+ */
+ pushq_cfi $__USER_DS
+ /* CFI_REL_OFFSET ss, 0 */
+ pushq_cfi PER_CPU_VAR(old_rsp)
+ CFI_REL_OFFSET rsp, 0
+ pushq_cfi %r11 /* %r11 clobbered (userspace %rflags) */
+ /* CFI_REL_OFFSET rflags, 0 */
+ pushq_cfi $__USER_CS
+ /* CFI_REL_OFFSET cs, 0 */
+ pushq_cfi %rcx /* %rcx clobbered (userspace %rip) */
+ CFI_REL_OFFSET rip, 0
+
+ SAVE_ARGS 8, rax_enosys=1
movq_cfi rax,(ORIG_RAX-ARGOFFSET)
- movq %rcx,RIP-ARGOFFSET(%rsp)
- CFI_REL_OFFSET rip,RIP-ARGOFFSET
testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,ARGOFFSET)
jnz tracesys
system_call_fastpath:
@@ -362,7 +346,7 @@ system_call_fastpath:
*/
ret_from_sys_call:
testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,ARGOFFSET)
- jnz int_ret_from_sys_call_fixup /* Go the the slow path */
+ jnz int_ret_from_sys_call /* Go the the slow path */
LOCKDEP_SYS_EXIT
DISABLE_INTERRUPTS(CLBR_NONE)
@@ -372,19 +356,16 @@ ret_from_sys_call:
* sysretq will re-enable interrupts:
*/
TRACE_IRQS_ON
+ RESTORE_ARGS addskip=-ARG_SKIP, rstor_rcx=0, rstor_r11=0
movq RIP-ARGOFFSET(%rsp),%rcx
CFI_REGISTER rip,rcx
- RESTORE_ARGS 1,-ARG_SKIP,0
+ mov EFLAGS-ARGOFFSET(%rsp), %r11
/*CFI_REGISTER rflags,r11*/
- movq PER_CPU_VAR(old_rsp), %rsp
+ mov RSP-ARGOFFSET(%rsp), %rsp
USERGS_SYSRET64
CFI_RESTORE_STATE
-int_ret_from_sys_call_fixup:
- FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
- jmp int_ret_from_sys_call
-
/* Do syscall tracing */
tracesys:
leaq -REST_SKIP(%rsp), %rdi
@@ -397,7 +378,6 @@ tracesys:
tracesys_phase2:
SAVE_REST
- FIXUP_TOP_OF_STACK %rdi
movq %rsp, %rdi
movq $AUDIT_ARCH_X86_64, %rsi
movq %rax,%rdx
@@ -493,10 +473,8 @@ ENTRY(stub_\func)
PARTIAL_FRAME 0
SAVE_REST
pushq %r11 /* put it back on stack */
- FIXUP_TOP_OF_STACK %r11, 8
DEFAULT_FRAME 0 8 /* offset 8: return address */
call sys_\func
- RESTORE_TOP_OF_STACK %r11, 8
ret $REST_SKIP /* pop extended registers */
CFI_ENDPROC
END(stub_\func)
@@ -506,9 +484,7 @@ END(stub_\func)
ENTRY(\label)
CFI_STARTPROC
PARTIAL_FRAME 0 8 /* offset 8: return address */
- FIXUP_TOP_OF_STACK %r11, 8-ARGOFFSET
call \func
- RESTORE_TOP_OF_STACK %r11, 8-ARGOFFSET
ret
CFI_ENDPROC
END(\label)
@@ -524,7 +500,6 @@ ENTRY(stub_execve)
addq $8, %rsp
PARTIAL_FRAME 0
SAVE_REST
- FIXUP_TOP_OF_STACK %r11
call sys_execve
movq %rax,RAX(%rsp)
RESTORE_REST
@@ -537,9 +512,7 @@ ENTRY(stub_execveat)
addq $8, %rsp
PARTIAL_FRAME 0
SAVE_REST
- FIXUP_TOP_OF_STACK %r11
call sys_execveat
- RESTORE_TOP_OF_STACK %r11
movq %rax,RAX(%rsp)
RESTORE_REST
jmp int_ret_from_sys_call
@@ -555,7 +528,6 @@ ENTRY(stub_rt_sigreturn)
addq $8, %rsp
PARTIAL_FRAME 0
SAVE_REST
- FIXUP_TOP_OF_STACK %r11
call sys_rt_sigreturn
movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
RESTORE_REST
@@ -569,7 +541,6 @@ ENTRY(stub_x32_rt_sigreturn)
addq $8, %rsp
PARTIAL_FRAME 0
SAVE_REST
- FIXUP_TOP_OF_STACK %r11
call sys32_x32_rt_sigreturn
movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
RESTORE_REST
@@ -582,9 +553,7 @@ ENTRY(stub_x32_execve)
addq $8, %rsp
PARTIAL_FRAME 0
SAVE_REST
- FIXUP_TOP_OF_STACK %r11
call compat_sys_execve
- RESTORE_TOP_OF_STACK %r11
movq %rax,RAX(%rsp)
RESTORE_REST
jmp int_ret_from_sys_call
@@ -596,9 +565,7 @@ ENTRY(stub_x32_execveat)
addq $8, %rsp
PARTIAL_FRAME 0
SAVE_REST
- FIXUP_TOP_OF_STACK %r11
call compat_sys_execveat
- RESTORE_TOP_OF_STACK %r11
movq %rax,RAX(%rsp)
RESTORE_REST
jmp int_ret_from_sys_call
--
2.1.0
^ permalink raw reply related [flat|nested] 18+ messages in thread
* Re: [PATCHv2 4/4] x86_64, entry: Create IRET-compatible stack frame at syscall entry
2015-01-18 11:45 ` [PATCHv2 4/4] x86_64, entry: Create IRET-compatible stack frame at syscall entry Alexander van Heukelum
@ 2015-01-18 16:38 ` Andy Lutomirski
2015-01-18 17:22 ` Alexander van Heukelum
0 siblings, 1 reply; 18+ messages in thread
From: Andy Lutomirski @ 2015-01-18 16:38 UTC (permalink / raw)
To: Alexander van Heukelum
Cc: X86 ML, linux-kernel, Frederic Weisbecker, Oleg Nesterov,
Borislav Petkov, Rik van Riel
On Sun, Jan 18, 2015 at 3:45 AM, Alexander van Heukelum
<heukelum@fastmail.fm> wrote:
> Create an IRET-compatible top of stack at syscall entry and use this
> information to return to user mode in the sysret path. This removes
> the need for the FIXUP_TOP_OF_STACK and RESTORE_TOP_OF_STACK macros.
Since I have limited bandwidth, I'd like to tackle these one at a time.
I like the idea of this patch, but it has some issues.
First, it needs to be benchmarked. The syscall fast path entry code
is *very* hot in some workloads, and it needs to be fast.
Second, I think you're really making three changes here.
a) You're putting rsp where it belongs -- it's in pt_regs instead of
being magically shoved into a combination of per-cpu variables and
extra arch state (thread->usersp). This ideally consists of (AFAICS)
two tiny asm changes: one extra mov (most likely cache-hot) on entry
and a change of where you're reading from when you reload rsp on exit.
The former change could easily add a cycle (or zero cycles, or lots of
cycles -- hardware can be complicated, and I have no idea how well
store forwarding works on gs-relative accesses). The latter change is
probably a speedup -- we'd be reading from pt_regs (almost certainly
hot or at least easily detected by the hardware prefetcher) instead of
a random percpu variable on exit.
*However*, this change enables the removal of all the usersp crap when
context switching, and all of the old_rsp references need to be
audited, and (having added yet another of them a week or two ago) I
know that you missed at least one and probably three or four :) Also,
removing the usersp crap could easily speed up context switches by a
cache line or so.
Can you do that and split out just the old_rsp, usersp, and rsp part
as its own patch?
b) You're putting the saved flags into the EFLAGS pt_regs slot, which
seems to me to be an unambiguous win -- it removes two instructions
from RESTORE_TOP_OF_STACK, and it adds nothing whatsoever (except to
the extent that you continue to initialize R11 on entry instead of in
FIXUP_TOP_OF_STACK).
(a) and (b) alone should be enough to eliminate RESTORE_TOP_OF_STACK.
c) You're initializing the rest of the "top of stack" (cs, ss, and
rcx) unconditionally. This is simpler, but I'm not sure it's
worthwhile -- we still lazily save the caller-saved regs, and
FIXUP_TOP_OF_STACK fits right in. It also may have a performance
impact.
I think that (a) and (b) are clear wins (a is a really nice cleanup
and I bet it's a speedup, too, and b seems to be better in all
respects). (c) is much less clearly a win to me.
Would you be willing to send split-out patches along with benchmarks?
--Andy
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCHv2 4/4] x86_64, entry: Create IRET-compatible stack frame at syscall entry
2015-01-18 16:38 ` Andy Lutomirski
@ 2015-01-18 17:22 ` Alexander van Heukelum
0 siblings, 0 replies; 18+ messages in thread
From: Alexander van Heukelum @ 2015-01-18 17:22 UTC (permalink / raw)
To: Andy Lutomirski
Cc: X86 ML, linux-kernel, Frederic Weisbecker, Oleg Nesterov,
Borislav Petkov, Rik van Riel
On Sun, Jan 18, 2015, at 17:38, Andy Lutomirski wrote:
> On Sun, Jan 18, 2015 at 3:45 AM, Alexander van Heukelum
> <heukelum@fastmail.fm> wrote:
> > Create an IRET-compatible top of stack at syscall entry and use this
> > information to return to user mode in the sysret path. This removes
> > the need for the FIXUP_TOP_OF_STACK and RESTORE_TOP_OF_STACK macros.
>
> Since I have limited bandwidth, I'd like to tackle these one at a time.
>
> I like the idea of this patch, but it has some issues.
>
> First, it needs to be benchmarked. The syscall fast path entry code
> is *very* hot in some workloads, and it needs to be fast.
Yeah. I didn't even try to benchmark it. What do you propose to use in this case?
> Second, I think you're really making three changes here.
>
> a) You're putting rsp where it belongs -- it's in pt_regs instead of
> being magically shoved into a combination of per-cpu variables and
> extra arch state (thread->usersp). This ideally consists of (AFAICS)
> two tiny asm changes: one extra mov (most likely cache-hot) on entry
> and a change of where you're reading from when you reload rsp on exit.
> The former change could easily add a cycle (or zero cycles, or lots of
> cycles -- hardware can be complicated, and I have no idea how well
> store forwarding works on gs-relative accesses). The latter change is
> probably a speedup -- we'd be reading from pt_regs (almost certainly
> hot or at least easily detected by the hardware prefetcher) instead of
> a random percpu variable on exit.
>
> *However*, this change enables the removal of all the usersp crap when
> context switching, and all of the old_rsp references need to be
> audited, and (having added yet another of them a week or two ago) I
> know that you missed at least one and probably three or four :) Also,
> removing the usersp crap could easily speed up context switches by a
> cache line or so.
Yes. I missed that part. I'll look into it. But nothing seems to blow up :)
> Can you do that and split out just the old_rsp, usersp, and rsp part
> as its own patch?
>
> b) You're putting the saved flags into the EFLAGS pt_regs slot, which
> seems to me to be an unambiguous win -- it removes two instructions
> from RESTORE_TOP_OF_STACK, and it adds nothing whatsoever (except to
> the extent that you continue to initialize R11 on entry instead of in
> FIXUP_TOP_OF_STACK).
>
> (a) and (b) alone should be enough to eliminate RESTORE_TOP_OF_STACK.
>
> c) You're initializing the rest of the "top of stack" (cs, ss, and
> rcx) unconditionally. This is simpler, but I'm not sure it's
> worthwhile -- we still lazily save the caller-saved regs, and
> FIXUP_TOP_OF_STACK fits right in. It also may have a performance
> impact.
>
> I think that (a) and (b) are clear wins (a is a really nice cleanup
> and I bet it's a speedup, too, and b seems to be better in all
> respects). (c) is much less clearly a win to me.
>
> Would you be willing to send split-out patches along with benchmarks?
Timing will depend highly on amounts of spare time, but I will give it
a shot.
Thanks for your valuable input!
Greetings,
Alexander
> --Andy
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCHv2 0/4] x86, entry: some cleanup and simplification...
2015-01-18 11:45 [PATCHv2 0/4] x86, entry: some cleanup and simplification Alexander van Heukelum
` (3 preceding siblings ...)
2015-01-18 11:45 ` [PATCHv2 4/4] x86_64, entry: Create IRET-compatible stack frame at syscall entry Alexander van Heukelum
@ 2015-01-18 12:05 ` Borislav Petkov
2015-01-18 15:47 ` Alexander van Heukelum
4 siblings, 1 reply; 18+ messages in thread
From: Borislav Petkov @ 2015-01-18 12:05 UTC (permalink / raw)
To: Alexander van Heukelum
Cc: Andy Lutomirski, x86, linux-kernel, Frederic Weisbecker,
Oleg Nesterov, Rik van Riel, Denys Vlasenko
Hi,
On Sun, Jan 18, 2015 at 12:45:16PM +0100, Alexander van Heukelum wrote:
> Hi Andy,
>
> The last patchset did not compile on i386. Please ignore it. This one
> should be better. Instead of removing KERNEL_STACK_OFFSET, it is now
> used consistently on both i386 and x86_64.
>
> Boot tested using qemu (using klibc for userspace)
> - x86_64, 32-bit userspace, core2duo (sysenter32)
> - x86_64, 32-bit userspace, phenom (syscall32)
> - x86_64, 32-bit userspace, vdso=0 (int 0x80)
> - x86_64, 64-bit userspace
> - i386, pentium3 (sysenter)
> - i386, athlon (syscall)
> - i386, vdso=0 (int 0x80)
>
> They were tested on top of 22f2aa4a0361707a5cfb1de9d45260b39965dead
> (x86/entry-devel in your tree) and this kernel is now running on my
> laptop.
btw, you might wanna sync with Denys who's doing cleanups in that area too:
https://lkml.kernel.org/r/1421272101-16847-1-git-send-email-dvlasenk@redhat.com
and touching some of the stuff you're changing too.
Thanks.
--
Regards/Gruss,
Boris.
ECO tip #101: Trim your mails when you reply.
--
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCHv2 0/4] x86, entry: some cleanup and simplification...
2015-01-18 12:05 ` [PATCHv2 0/4] x86, entry: some cleanup and simplification Borislav Petkov
@ 2015-01-18 15:47 ` Alexander van Heukelum
2015-01-21 13:26 ` Denys Vlasenko
0 siblings, 1 reply; 18+ messages in thread
From: Alexander van Heukelum @ 2015-01-18 15:47 UTC (permalink / raw)
To: Borislav Petkov
Cc: Andy Lutomirski, x86, linux-kernel, Frederic Weisbecker,
Oleg Nesterov, Rik van Riel, Denys Vlasenko
On Sun, Jan 18, 2015, at 13:05, Borislav Petkov wrote:
> Hi,
>
> btw, you might wanna sync with Denys who's doing cleanups in that area too:
>
> https://lkml.kernel.org/r/1421272101-16847-1-git-send-email-dvlasenk@redhat.com
>
> and touching some of the stuff you're changing too.
Thanks. FWIW, the change touches the same area in code, but is orthogonal in concept.
Greetings,
Alexander
> Thanks.
>
> --
> Regards/Gruss,
> Boris.
>
> ECO tip #101: Trim your mails when you reply.
> --
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCHv2 0/4] x86, entry: some cleanup and simplification...
2015-01-18 15:47 ` Alexander van Heukelum
@ 2015-01-21 13:26 ` Denys Vlasenko
2015-01-21 15:51 ` Alexander van Heukelum
0 siblings, 1 reply; 18+ messages in thread
From: Denys Vlasenko @ 2015-01-21 13:26 UTC (permalink / raw)
To: Alexander van Heukelum
Cc: Borislav Petkov, Andy Lutomirski, X86 ML,
Linux Kernel Mailing List, Frederic Weisbecker, Oleg Nesterov,
Rik van Riel, Denys Vlasenko
On Sun, Jan 18, 2015 at 4:47 PM, Alexander van Heukelum
<heukelum@fastmail.fm> wrote:
> On Sun, Jan 18, 2015, at 13:05, Borislav Petkov wrote:
>> Hi,
>>
>> btw, you might wanna sync with Denys who's doing cleanups in that area too:
>>
>> https://lkml.kernel.org/r/1421272101-16847-1-git-send-email-dvlasenk@redhat.com
>>
>> and touching some of the stuff you're changing too.
>
> Thanks. FWIW, the change touches the same area in code, but is orthogonal in concept.
My patches eventually eliminate both ARGOFFSET and KERNEL_STACK_OFFSET.
For example, eventually TI_flags tests look like this after all my patches:
testl $const, TI_flags+THREAD_INFO(%rsp,SIZEOF_PTREGS)
Your patches massage the same defines in a different way than mine do.
We clearly have a conflict here.
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCHv2 0/4] x86, entry: some cleanup and simplification...
2015-01-21 13:26 ` Denys Vlasenko
@ 2015-01-21 15:51 ` Alexander van Heukelum
0 siblings, 0 replies; 18+ messages in thread
From: Alexander van Heukelum @ 2015-01-21 15:51 UTC (permalink / raw)
To: Denys Vlasenko
Cc: Borislav Petkov, Andy Lutomirski, X86 ML,
Linux Kernel Mailing List, Frederic Weisbecker, Oleg Nesterov,
Rik van Riel, Denys Vlasenko
On Wed, Jan 21, 2015, at 14:26, Denys Vlasenko wrote:
> On Sun, Jan 18, 2015 at 4:47 PM, Alexander van Heukelum
> <heukelum@fastmail.fm> wrote:
> > On Sun, Jan 18, 2015, at 13:05, Borislav Petkov wrote:
> >> Hi,
> >>
> >> btw, you might wanna sync with Denys who's doing cleanups in that area too:
> >>
> >> https://lkml.kernel.org/r/1421272101-16847-1-git-send-email-dvlasenk@redhat.com
> >>
> >> and touching some of the stuff you're changing too.
> >
> > Thanks. FWIW, the change touches the same area in code, but is orthogonal in concept.
>
> My patches eventually eliminate both ARGOFFSET and KERNEL_STACK_OFFSET.
> For example, eventually TI_flags tests look like this after all my patches:
>
> testl $const, TI_flags+THREAD_INFO(%rsp,SIZEOF_PTREGS)
>
> Your patches massage the same defines in a different way than mine do.
> We clearly have a conflict here.
Indeed. I missed that, because Borislav's link only pointed to a single
change and I failed to notice the "[01/11]" in the title.
^ permalink raw reply [flat|nested] 18+ messages in thread