x86: Remove force_iret()
diff mbox series

Message ID 20191219115812.102620-1-brgerst@gmail.com
State Accepted
Commit 2b10906f2d25515bba58070b8183babc89063597
Headers show
Series
  • x86: Remove force_iret()
Related show

Commit Message

Brian Gerst Dec. 19, 2019, 11:58 a.m. UTC
force_iret() was originally intended to prevent the return to user mode with
the SYSRET or SYSEXIT instructions, in cases where the register state could
have been changed to be incompatible with those instructions.  The entry code
has been significantly reworked since then, and register state is validated
before SYSRET or SYSEXIT are used.  force_iret() no longer serves its original
purpose and can be eliminated.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
---
 arch/x86/ia32/ia32_signal.c        |  2 --
 arch/x86/include/asm/ptrace.h      | 16 ----------------
 arch/x86/include/asm/thread_info.h |  9 ---------
 arch/x86/kernel/process_32.c       |  1 -
 arch/x86/kernel/process_64.c       |  1 -
 arch/x86/kernel/signal.c           |  2 --
 arch/x86/kernel/vm86_32.c          |  1 -
 7 files changed, 32 deletions(-)

Comments

Andy Lutomirski Dec. 20, 2019, 1:49 a.m. UTC | #1
On Thu, Dec 19, 2019 at 3:58 AM Brian Gerst <brgerst@gmail.com> wrote:
>
> force_iret() was originally intended to prevent the return to user mode with
> the SYSRET or SYSEXIT instructions, in cases where the register state could
> have been changed to be incompatible with those instructions.

It's more than that.  Before the big syscall rework, we didn't restore
the caller-saved regs.  See:

commit 21d375b6b34ff511a507de27bf316b3dde6938d9
Author: Andy Lutomirski <luto@kernel.org>
Date:   Sun Jan 28 10:38:49 2018 -0800

    x86/entry/64: Remove the SYSCALL64 fast path

So if you changed r12, for example, the change would get lost.

 The entry code
> has been significantly reworked since then, and register state is validated
> before SYSRET or SYSEXIT are used.  force_iret() no longer serves its original
> purpose and can be eliminated.
>
> Signed-off-by: Brian Gerst <brgerst@gmail.com>
> ---
>  arch/x86/ia32/ia32_signal.c        |  2 --
>  arch/x86/include/asm/ptrace.h      | 16 ----------------
>  arch/x86/include/asm/thread_info.h |  9 ---------
>  arch/x86/kernel/process_32.c       |  1 -
>  arch/x86/kernel/process_64.c       |  1 -
>  arch/x86/kernel/signal.c           |  2 --
>  arch/x86/kernel/vm86_32.c          |  1 -
>  7 files changed, 32 deletions(-)
>
> diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
> index 30416d7f19d4..a3aefe9b9401 100644
> --- a/arch/x86/ia32/ia32_signal.c
> +++ b/arch/x86/ia32/ia32_signal.c
> @@ -114,8 +114,6 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
>
>         err |= fpu__restore_sig(buf, 1);
>
> -       force_iret();
> -
>         return err;
>  }
>
> diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
> index 5057a8ed100b..78897a8da01f 100644
> --- a/arch/x86/include/asm/ptrace.h
> +++ b/arch/x86/include/asm/ptrace.h
> @@ -339,22 +339,6 @@ static inline unsigned long regs_get_kernel_argument(struct pt_regs *regs,
>
>  #define ARCH_HAS_USER_SINGLE_STEP_REPORT
>
> -/*
> - * When hitting ptrace_stop(), we cannot return using SYSRET because
> - * that does not restore the full CPU state, only a minimal set.  The
> - * ptracer can change arbitrary register values, which is usually okay
> - * because the usual ptrace stops run off the signal delivery path which
> - * forces IRET; however, ptrace_event() stops happen in arbitrary places
> - * in the kernel and don't force IRET path.
> - *
> - * So force IRET path after a ptrace stop.
> - */
> -#define arch_ptrace_stop_needed(code, info)                            \
> -({                                                                     \
> -       force_iret();                                                   \
> -       false;                                                          \
> -})
> -
>  struct user_desc;
>  extern int do_get_thread_area(struct task_struct *p, int idx,
>                               struct user_desc __user *info);
> diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
> index d779366ce3f8..cf4327986e98 100644
> --- a/arch/x86/include/asm/thread_info.h
> +++ b/arch/x86/include/asm/thread_info.h
> @@ -239,15 +239,6 @@ static inline int arch_within_stack_frames(const void * const stack,
>                            current_thread_info()->status & TS_COMPAT)
>  #endif
>
> -/*
> - * Force syscall return via IRET by making it look as if there was
> - * some work pending. IRET is our most capable (but slowest) syscall
> - * return path, which is able to restore modified SS, CS and certain
> - * EFLAGS values that other (fast) syscall return instructions
> - * are not able to restore properly.
> - */
> -#define force_iret() set_thread_flag(TIF_NOTIFY_RESUME)
> -
>  extern void arch_task_cache_init(void);
>  extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src);
>  extern void arch_release_task_struct(struct task_struct *tsk);
> diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
> index 323499f48858..5052ced43373 100644
> --- a/arch/x86/kernel/process_32.c
> +++ b/arch/x86/kernel/process_32.c
> @@ -124,7 +124,6 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
>         regs->ip                = new_ip;
>         regs->sp                = new_sp;
>         regs->flags             = X86_EFLAGS_IF;
> -       force_iret();
>  }
>  EXPORT_SYMBOL_GPL(start_thread);
>
> diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
> index 506d66830d4d..ffd497804dbc 100644
> --- a/arch/x86/kernel/process_64.c
> +++ b/arch/x86/kernel/process_64.c
> @@ -394,7 +394,6 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip,
>         regs->cs                = _cs;
>         regs->ss                = _ss;
>         regs->flags             = X86_EFLAGS_IF;
> -       force_iret();
>  }
>
>  void
> diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
> index 8eb7193e158d..8a29573851a3 100644
> --- a/arch/x86/kernel/signal.c
> +++ b/arch/x86/kernel/signal.c
> @@ -151,8 +151,6 @@ static int restore_sigcontext(struct pt_regs *regs,
>
>         err |= fpu__restore_sig(buf, IS_ENABLED(CONFIG_X86_32));
>
> -       force_iret();
> -
>         return err;
>  }
>
> diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
> index a76c12b38e92..91d55454e702 100644
> --- a/arch/x86/kernel/vm86_32.c
> +++ b/arch/x86/kernel/vm86_32.c
> @@ -381,7 +381,6 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
>                 mark_screen_rdonly(tsk->mm);
>
>         memcpy((struct kernel_vm86_regs *)regs, &vm86regs, sizeof(vm86regs));
> -       force_iret();
>         return regs->ax;
>  }
>
> --
> 2.23.0
>
Brian Gerst Dec. 20, 2019, 3:48 a.m. UTC | #2
On Thu, Dec 19, 2019 at 8:50 PM Andy Lutomirski <luto@kernel.org> wrote:
>
> On Thu, Dec 19, 2019 at 3:58 AM Brian Gerst <brgerst@gmail.com> wrote:
> >
> > force_iret() was originally intended to prevent the return to user mode with
> > the SYSRET or SYSEXIT instructions, in cases where the register state could
> > have been changed to be incompatible with those instructions.
>
> It's more than that.  Before the big syscall rework, we didn't restore
> the caller-saved regs.  See:
>
> commit 21d375b6b34ff511a507de27bf316b3dde6938d9
> Author: Andy Lutomirski <luto@kernel.org>
> Date:   Sun Jan 28 10:38:49 2018 -0800
>
>     x86/entry/64: Remove the SYSCALL64 fast path
>
> So if you changed r12, for example, the change would get lost.

force_iret() specifically dealt with changes to CS, SS and EFLAGS.
Saving and restoring the extra registers was a different problem
although it affected the same functions like ptrace, signals, and
exec.

--
Brian Gerst
David Laight Dec. 20, 2019, 10:10 a.m. UTC | #3
From: Brian Gerst
> Sent: 20 December 2019 03:48
> On Thu, Dec 19, 2019 at 8:50 PM Andy Lutomirski <luto@kernel.org> wrote:
> >
> > On Thu, Dec 19, 2019 at 3:58 AM Brian Gerst <brgerst@gmail.com> wrote:
> > >
> > > force_iret() was originally intended to prevent the return to user mode with
> > > the SYSRET or SYSEXIT instructions, in cases where the register state could
> > > have been changed to be incompatible with those instructions.
> >
> > It's more than that.  Before the big syscall rework, we didn't restore
> > the caller-saved regs.  See:
> >
> > commit 21d375b6b34ff511a507de27bf316b3dde6938d9
> > Author: Andy Lutomirski <luto@kernel.org>
> > Date:   Sun Jan 28 10:38:49 2018 -0800
> >
> >     x86/entry/64: Remove the SYSCALL64 fast path
> >
> > So if you changed r12, for example, the change would get lost.
> 
> force_iret() specifically dealt with changes to CS, SS and EFLAGS.
> Saving and restoring the extra registers was a different problem
> although it affected the same functions like ptrace, signals, and
> exec.

Is it ever possible for any of the segment registers to refer to the LDT
and for another thread to invalidate the entries 'very late' ?

So even though the values were valid when changed, they are
invalid during the 'return to user' sequence.

I remember writing a signal handler that 'corrupted' all the
segment registers (etc) and fixing the NetBSD kernel to handle
all the faults restoring the segment registers and IRET faulting
in kernel (IIRC invalid user %SS or %CS).
(IRET can also fault in user space, but that is a normal fault.)

Is it actually cheaper to properly validate the segment registers,
or take the 'hit' of the slightly slower IRET path and get the cpu
to do it for you?

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
Andy Lutomirski Dec. 20, 2019, 10:30 a.m. UTC | #4
> On Dec 20, 2019, at 6:10 PM, David Laight <David.Laight@aculab.com> wrote:
> 
> From: Brian Gerst
>> Sent: 20 December 2019 03:48
>>> On Thu, Dec 19, 2019 at 8:50 PM Andy Lutomirski <luto@kernel.org> wrote:
>>> 
>>> On Thu, Dec 19, 2019 at 3:58 AM Brian Gerst <brgerst@gmail.com> wrote:
>>>> 
>>>> force_iret() was originally intended to prevent the return to user mode with
>>>> the SYSRET or SYSEXIT instructions, in cases where the register state could
>>>> have been changed to be incompatible with those instructions.
>>> 
>>> It's more than that.  Before the big syscall rework, we didn't restore
>>> the caller-saved regs.  See:
>>> 
>>> commit 21d375b6b34ff511a507de27bf316b3dde6938d9
>>> Author: Andy Lutomirski <luto@kernel.org>
>>> Date:   Sun Jan 28 10:38:49 2018 -0800
>>> 
>>>    x86/entry/64: Remove the SYSCALL64 fast path
>>> 
>>> So if you changed r12, for example, the change would get lost.
>> 
>> force_iret() specifically dealt with changes to CS, SS and EFLAGS.
>> Saving and restoring the extra registers was a different problem
>> although it affected the same functions like ptrace, signals, and
>> exec.
> 
> Is it ever possible for any of the segment registers to refer to the LDT
> and for another thread to invalidate the entries 'very late' ?

Not in newer kernels, because the actual LDT is never modified.  Instead, LDT changes create a whole new LDT and propagate it with an IPI.

But the IRET path can fail due to changes to the selectors while in the kernel, due to sigreturn or ptrace.  We have delightful selftests for this.

> 
> So even though the values were valid when changed, they are
> invalid during the 'return to user' sequence.
> 
> I remember writing a signal handler that 'corrupted' all the
> segment registers (etc) and fixing the NetBSD kernel to handle
> all the faults restoring the segment registers and IRET faulting
> in kernel (IIRC invalid user %SS or %CS).
> (IRET can also fault in user space, but that is a normal fault.)

Did you remember to test the #NP case?  Many kernels forgot that this was possible :)

> 
> Is it actually cheaper to properly validate the segment registers,
> or take the 'hit' of the slightly slower IRET path and get the cpu
> to do it for you?
> 
> 

The validation we’re talking about is for SYSRET, not IRET.  It has its own set of nasty conditions involving EFLAGS, R11, RIP, and RCX.  Fortunately no segments are involved. The algorithm is, roughly:

if (okay for SYSRET) {
  SYSRET (and assume it can’t fail)
} else {
  if (need ESPFIX)
   Horrible hacks;
  IRET;
}

And we handle #GP, #SS, #NP and #DF from IRET. And we have selftests for all of this. And no one runs the bloody selftests on 32-bit kernels, resulting in truly awful bugs.

We can’t handle #GP from SYSRET. Thanks, Intel.

(AMD gets this more right. SYSRET is still a turd, but it can’t fault. Intel handles RIP canonical checks differently from AMD, and SYSRET will #GP if RCX is noncanonical.  The result was privilege escalation on basically every OS when this was noticed.)
David Laight Dec. 20, 2019, 10:59 a.m. UTC | #5
From: Andy Lutomirski
> Sent: 20 December 2019 10:30
> > On Dec 20, 2019, at 6:10 PM, David Laight <David.Laight@aculab.com> wrote:
> >
> > From: Brian Gerst
> >> Sent: 20 December 2019 03:48
> >>> On Thu, Dec 19, 2019 at 8:50 PM Andy Lutomirski <luto@kernel.org> wrote:
> >>>
> >>> On Thu, Dec 19, 2019 at 3:58 AM Brian Gerst <brgerst@gmail.com> wrote:
> >>>>
> >>>> force_iret() was originally intended to prevent the return to user mode with
> >>>> the SYSRET or SYSEXIT instructions, in cases where the register state could
> >>>> have been changed to be incompatible with those instructions.
> >>>
> >>> It's more than that.  Before the big syscall rework, we didn't restore
> >>> the caller-saved regs.  See:
> >>>
> >>> commit 21d375b6b34ff511a507de27bf316b3dde6938d9
> >>> Author: Andy Lutomirski <luto@kernel.org>
> >>> Date:   Sun Jan 28 10:38:49 2018 -0800
> >>>
> >>>    x86/entry/64: Remove the SYSCALL64 fast path
> >>>
> >>> So if you changed r12, for example, the change would get lost.
> >>
> >> force_iret() specifically dealt with changes to CS, SS and EFLAGS.
> >> Saving and restoring the extra registers was a different problem
> >> although it affected the same functions like ptrace, signals, and
> >> exec.
> >
> > Is it ever possible for any of the segment registers to refer to the LDT
> > and for another thread to invalidate the entries 'very late' ?
> 
> Not in newer kernels, because the actual LDT is never modified.
> Instead, LDT changes create a whole new LDT and propagate it with an IPI.

Can the IPI be disabled through the SYSRET path?
Once in user space, the IPI will interrupt the process and, I presume, it will
pick up the new LDT on 'return to user'.
But if the IPI happens between the LDT being set and SYSRET it will (presumably)
remain 'pending' until the next system call?
Which could be long enough for one thread to have passed a pointer across giving
an unexpected SEGV (or maybe worse, failing to give an expected one).

> But the IRET path can fail due to changes to the selectors while in the kernel, due to sigreturn or ptrace.  We have delightful selftests
> for this.
> 
> >
> > So even though the values were valid when changed, they are
> > invalid during the 'return to user' sequence.
> >
> > I remember writing a signal handler that 'corrupted' all the
> > segment registers (etc) and fixing the NetBSD kernel to handle
> > all the faults restoring the segment registers and IRET faulting
> > in kernel (IIRC invalid user %SS or %CS).
> > (IRET can also fault in user space, but that is a normal fault.)
> 
> Did you remember to test the #NP case?  Many kernels forgot that this was possible :)

That might have been why I was fixing it.
I certainly tested the cases where loading the user segment registers faulted in kernel
(after loading the user-GS) and where IRET faulted in kernel.
I fixed up the stack in the interrupt entry code to make it all appear to be a fault
in user-space (deleting one of the trap frames).
This also stops repeated faults getting further and further down the kernel stack.

> > Is it actually cheaper to properly validate the segment registers,
> > or take the 'hit' of the slightly slower IRET path and get the cpu
> > to do it for you?
> 
> The validation we’re talking about is for SYSRET, not IRET.  It has its own set of nasty conditions involving EFLAGS, R11, RIP, and RCX.
> Fortunately no segments are involved. The algorithm is, roughly:
> 
> if (okay for SYSRET) {
>   SYSRET (and assume it can’t fail)
> } else {
>   if (need ESPFIX)
>    Horrible hacks;
>   IRET;
> }

Ok, I was part worried you were forcing 'okay for SYSRET' to 1.
 
> And we handle #GP, #SS, #NP and #DF from IRET. And we have selftests for all of this.

Yes, it would be best if IRET only ever faulted in user-space.

> And no one runs the bloody selftests on 32-bit kernels, resulting in truly awful bugs.

I suspect I didn't try hard enough to get FS/GS and FSBASE/GSBASE correctly restored.
(Given that the segment descriptors may not contain the required values.)

> We can’t handle #GP from SYSRET. Thanks, Intel.
> 
> (AMD gets this more right. SYSRET is still a turd, but it can’t fault. Intel handles RIP canonical checks differently from AMD, and SYSRET
> will #GP if RCX is noncanonical.  The result was privilege escalation on basically every OS when this was noticed.)

Yes, all the 'fast system call' instructions were badly thought out.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
Brian Gerst Dec. 20, 2019, 12:18 p.m. UTC | #6
On Fri, Dec 20, 2019 at 5:10 AM David Laight <David.Laight@aculab.com> wrote:
>
> From: Brian Gerst
> > Sent: 20 December 2019 03:48
> > On Thu, Dec 19, 2019 at 8:50 PM Andy Lutomirski <luto@kernel.org> wrote:
> > >
> > > On Thu, Dec 19, 2019 at 3:58 AM Brian Gerst <brgerst@gmail.com> wrote:
> > > >
> > > > force_iret() was originally intended to prevent the return to user mode with
> > > > the SYSRET or SYSEXIT instructions, in cases where the register state could
> > > > have been changed to be incompatible with those instructions.
> > >
> > > It's more than that.  Before the big syscall rework, we didn't restore
> > > the caller-saved regs.  See:
> > >
> > > commit 21d375b6b34ff511a507de27bf316b3dde6938d9
> > > Author: Andy Lutomirski <luto@kernel.org>
> > > Date:   Sun Jan 28 10:38:49 2018 -0800
> > >
> > >     x86/entry/64: Remove the SYSCALL64 fast path
> > >
> > > So if you changed r12, for example, the change would get lost.
> >
> > force_iret() specifically dealt with changes to CS, SS and EFLAGS.
> > Saving and restoring the extra registers was a different problem
> > although it affected the same functions like ptrace, signals, and
> > exec.
>
> Is it ever possible for any of the segment registers to refer to the LDT
> and for another thread to invalidate the entries 'very late' ?
> So even though the values were valid when changed, they are
> invalid during the 'return to user' sequence.

Not in the SYSRET case, where the kernel requires that CS and SS are
static segments in the GDT.  Any userspace context that uses LDT
segments for CS/SS must return with IRET.  There is fault handling for
IRET (fixup_bad_iret()) for this case.

> I remember writing a signal handler that 'corrupted' all the
> segment registers (etc) and fixing the NetBSD kernel to handle
> all the faults restoring the segment registers and IRET faulting
> in kernel (IIRC invalid user %SS or %CS).
> (IRET can also fault in user space, but that is a normal fault.)
>
> Is it actually cheaper to properly validate the segment registers,
> or take the 'hit' of the slightly slower IRET path and get the cpu
> to do it for you?

SYSRET is faster because it avoids segment table lookups and
permission checks for CS and SS.  It simply sets the selectors to
values set in an MSR and the attributes (base, limit, etc.) to fixed
values.  It is up to the OS to make sure the actual segment
descriptors in memory match those default attributes.

--
Brian Gerst
David Laight Dec. 20, 2019, 12:35 p.m. UTC | #7
From: Brian Gerst
> Sent: 20 December 2019 12:18
...
> > Is it ever possible for any of the segment registers to refer to the LDT
> > and for another thread to invalidate the entries 'very late' ?
> > So even though the values were valid when changed, they are
> > invalid during the 'return to user' sequence.
> 
> Not in the SYSRET case, where the kernel requires that CS and SS are
> static segments in the GDT.  Any userspace context that uses LDT
> segments for CS/SS must return with IRET.  There is fault handling for
> IRET (fixup_bad_iret()) for this case.

Ok - It is a long time since i looked at these 'syscall' instructions.

...
> > Is it actually cheaper to properly validate the segment registers,
> > or take the 'hit' of the slightly slower IRET path and get the cpu
> > to do it for you?
> 
> SYSRET is faster because it avoids segment table lookups and
> permission checks for CS and SS.  It simply sets the selectors to
> values set in an MSR and the attributes (base, limit, etc.) to fixed
> values.  It is up to the OS to make sure the actual segment
> descriptors in memory match those default attributes.

I wonder how much difference that make when 'page table separation'
is used?

I guess the loading of ds/es/fs/gs can fault - but that it no harder
to handle than in the IRET case.

	David

Anyway, off until the new year now.

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
Oleg Nesterov Dec. 20, 2019, 7:35 p.m. UTC | #8
On 12/19, Brian Gerst wrote:
>
> force_iret() was originally intended to prevent the return to user mode with
> the SYSRET or SYSEXIT instructions, in cases where the register state could
> have been changed to be incompatible with those instructions.  The entry code
> has been significantly reworked since then, and register state is validated
> before SYSRET or SYSEXIT are used.  force_iret() no longer serves its original
> purpose and can be eliminated.

Plus iiuc today force_iret() == set_thread_flag(TIF_NOTIFY_RESUME) simply has
no effect on asm paths.

Acked-by: Oleg Nesterov <oleg@redhat.com>
Andy Lutomirski Dec. 20, 2019, 9:20 p.m. UTC | #9
> On Dec 20, 2019, at 6:59 PM, David Laight <David.Laight@aculab.com> wrote:
> 
> From: Andy Lutomirski
>> Sent: 20 December 2019 10:30
>>>> On Dec 20, 2019, at 6:10 PM, David Laight <David.Laight@aculab.com> wrote:
>>> 
>>> From: Brian Gerst
>>>> Sent: 20 December 2019 03:48
>>>>> On Thu, Dec 19, 2019 at 8:50 PM Andy Lutomirski <luto@kernel.org> wrote:
>>>>> 
>>>>> On Thu, Dec 19, 2019 at 3:58 AM Brian Gerst <brgerst@gmail.com> wrote:
>>>>>> 
>>>>>> force_iret() was originally intended to prevent the return to user mode with
>>>>>> the SYSRET or SYSEXIT instructions, in cases where the register state could
>>>>>> have been changed to be incompatible with those instructions.
>>>>> 
>>>>> It's more than that.  Before the big syscall rework, we didn't restore
>>>>> the caller-saved regs.  See:
>>>>> 
>>>>> commit 21d375b6b34ff511a507de27bf316b3dde6938d9
>>>>> Author: Andy Lutomirski <luto@kernel.org>
>>>>> Date:   Sun Jan 28 10:38:49 2018 -0800
>>>>> 
>>>>>   x86/entry/64: Remove the SYSCALL64 fast path
>>>>> 
>>>>> So if you changed r12, for example, the change would get lost.
>>>> 
>>>> force_iret() specifically dealt with changes to CS, SS and EFLAGS.
>>>> Saving and restoring the extra registers was a different problem
>>>> although it affected the same functions like ptrace, signals, and
>>>> exec.
>>> 
>>> Is it ever possible for any of the segment registers to refer to the LDT
>>> and for another thread to invalidate the entries 'very late' ?
>> 
>> Not in newer kernels, because the actual LDT is never modified.
>> Instead, LDT changes create a whole new LDT and propagate it with an IPI.
> 
> Can the IPI be disabled through the SYSRET path?

There’s a whole dance in prepare_exit_to_usermode().  We turn off interrupts, then check for pending work (which does not include this IPI, but includes plenty of other nasty things), and we keep interrupts off until we are in user mode.

> Once in user space, the IPI will interrupt the process and, I presume, it will
> pick up the new LDT on 'return to user'.

The new LDT is picked up in the IPI callback.

> But if the IPI happens between the LDT being set and SYSRET it will (presumably)
> remain 'pending' until the next system call?
> Which could be long enough for one thread to have passed a pointer across giving
> an unexpected SEGV (or maybe worse, failing to give an expected one).

modify_ldt() won’t return until all threads have the new LDT.

Patch
diff mbox series

diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index 30416d7f19d4..a3aefe9b9401 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -114,8 +114,6 @@  static int ia32_restore_sigcontext(struct pt_regs *regs,
 
 	err |= fpu__restore_sig(buf, 1);
 
-	force_iret();
-
 	return err;
 }
 
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 5057a8ed100b..78897a8da01f 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -339,22 +339,6 @@  static inline unsigned long regs_get_kernel_argument(struct pt_regs *regs,
 
 #define ARCH_HAS_USER_SINGLE_STEP_REPORT
 
-/*
- * When hitting ptrace_stop(), we cannot return using SYSRET because
- * that does not restore the full CPU state, only a minimal set.  The
- * ptracer can change arbitrary register values, which is usually okay
- * because the usual ptrace stops run off the signal delivery path which
- * forces IRET; however, ptrace_event() stops happen in arbitrary places
- * in the kernel and don't force IRET path.
- *
- * So force IRET path after a ptrace stop.
- */
-#define arch_ptrace_stop_needed(code, info)				\
-({									\
-	force_iret();							\
-	false;								\
-})
-
 struct user_desc;
 extern int do_get_thread_area(struct task_struct *p, int idx,
 			      struct user_desc __user *info);
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index d779366ce3f8..cf4327986e98 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -239,15 +239,6 @@  static inline int arch_within_stack_frames(const void * const stack,
 			   current_thread_info()->status & TS_COMPAT)
 #endif
 
-/*
- * Force syscall return via IRET by making it look as if there was
- * some work pending. IRET is our most capable (but slowest) syscall
- * return path, which is able to restore modified SS, CS and certain
- * EFLAGS values that other (fast) syscall return instructions
- * are not able to restore properly.
- */
-#define force_iret() set_thread_flag(TIF_NOTIFY_RESUME)
-
 extern void arch_task_cache_init(void);
 extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src);
 extern void arch_release_task_struct(struct task_struct *tsk);
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 323499f48858..5052ced43373 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -124,7 +124,6 @@  start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
 	regs->ip		= new_ip;
 	regs->sp		= new_sp;
 	regs->flags		= X86_EFLAGS_IF;
-	force_iret();
 }
 EXPORT_SYMBOL_GPL(start_thread);
 
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 506d66830d4d..ffd497804dbc 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -394,7 +394,6 @@  start_thread_common(struct pt_regs *regs, unsigned long new_ip,
 	regs->cs		= _cs;
 	regs->ss		= _ss;
 	regs->flags		= X86_EFLAGS_IF;
-	force_iret();
 }
 
 void
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 8eb7193e158d..8a29573851a3 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -151,8 +151,6 @@  static int restore_sigcontext(struct pt_regs *regs,
 
 	err |= fpu__restore_sig(buf, IS_ENABLED(CONFIG_X86_32));
 
-	force_iret();
-
 	return err;
 }
 
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index a76c12b38e92..91d55454e702 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -381,7 +381,6 @@  static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
 		mark_screen_rdonly(tsk->mm);
 
 	memcpy((struct kernel_vm86_regs *)regs, &vm86regs, sizeof(vm86regs));
-	force_iret();
 	return regs->ax;
 }