All of lore.kernel.org
 help / color / mirror / Atom feed
From: Andy Lutomirski <luto@kernel.org>
To: Dave Hansen <dave.hansen@intel.com>
Cc: moritz.lipp@iaik.tugraz.at,
	Daniel Gruss <daniel.gruss@iaik.tugraz.at>,
	michael.schwarz@iaik.tugraz.at,
	richard.fellner@student.tugraz.at,
	Andrew Lutomirski <luto@kernel.org>,
	Linus Torvalds <torvalds@linux-foundation.org>,
	Kees Cook <keescook@google.com>, Hugh Dickins <hughd@google.com>,
	Ingo Molnar <mingo@kernel.org>, X86 ML <x86@kernel.org>,
	LKML <linux-kernel@vger.kernel.org>,
	Borislav Petkov <bp@alien8.de>
Subject: Re: [RFC][PATCH] x86, kaiser: do not require mapping process kernel stacks
Date: Sat, 4 Nov 2017 04:24:47 -0700	[thread overview]
Message-ID: <CALCETrXiM1kVJ_bkBqyX2GbB+raA6XOK-BgO5n+aJMXMxeUYUg@mail.gmail.com> (raw)
In-Reply-To: <52d3fdde-7611-f766-eb2e-9dd1dde61760@intel.com>

On Fri, Nov 3, 2017 at 3:03 PM, Dave Hansen <dave.hansen@intel.com> wrote:
>
> With the KAISER code that I posted a few days ago, we map and unmap
> each of the kernel stacks when they are created.  That's slow and
> it is also the single largest thing still mapped into the user
> address space.
>
> This patch is on top of Andy's new trampoline stack code[1] plus
> the previous KAISER patches I posted.  I figured I'd post this now
> so folks can take a look before I go re-spin the whole thing.
>
> Andy, I think the do_double_fault() handling is arguably buggy
> with your trampoline stacks: it points the iret frame to the task
> stack instead of the trampoline stack.  It survives that normally,
> but it breaks with KAISER of course.

That is, indeed, a bug.  Will fix.

>
> The other oddity in this is the SWITCH_TO_KERNEL_CR3 in the
> entry_SYSCALL_64 path.  We do not have a stack to use or any
> regs to clobber, so we have to fall back to a per-cpu area to
> stash a register.  I'm open to any better ideas.

I think we have no choice but to use *two* words of temporary storage,
one for rsp and one for whatever scratch reg we use.  Unless we're
sneaky and use rsp as our scratch reg.  *snicker*

>
> 1. https://git.kernel.org/pub/scm/linux/kernel/git/luto/linux.git/log/?h=x86/entry_consolidation
>
> ---
>
>  b/arch/x86/entry/calling.h      |   15 +++++++++++++
>  b/arch/x86/entry/entry_64.S     |    8 ++++--
>  b/arch/x86/include/asm/kaiser.h |    2 -
>  b/arch/x86/kernel/traps.c       |   46 +++++++++++++++++++++++++++++++++-------
>  b/arch/x86/mm/kaiser.c          |   12 +---------
>  b/include/linux/kaiser.h        |    5 ----
>  b/kernel/fork.c                 |    5 ----
>  7 files changed, 60 insertions(+), 33 deletions(-)
>
> diff -puN arch/x86/mm/kaiser.c~kaiser-no-process-stacks arch/x86/mm/kaiser.c
> --- a/arch/x86/mm/kaiser.c~kaiser-no-process-stacks     2017-11-02 11:07:35.624523990 -0700
> +++ b/arch/x86/mm/kaiser.c      2017-11-03 14:22:34.878872556 -0700
> @@ -30,6 +30,8 @@
>  #include <asm/tlbflush.h>
>  #include <asm/desc.h>
>  +DEFINE_PER_CPU_USER_MAPPED(unsigned long, syscall_r11_tmp);
> +
>  /*
>   * At runtime, the only things we map are some things for CPU
>   * hotplug, and stacks for new processes.  No two CPUs will ever
> @@ -251,16 +253,6 @@ int kaiser_add_user_map(const void *__st
>         return 0;
>  }
>  -/*
> - * The stack mapping is called in generic code and can't use
> - * __PAGE_KERNEL
> - */
> -int kaiser_map_stack(struct task_struct *tsk)
> -{
> -       return kaiser_add_mapping((unsigned long)tsk->stack, THREAD_SIZE,
> -                                 __PAGE_KERNEL);
> -}
> -
>  int kaiser_add_user_map_ptrs(const void *__start_addr,
>                              const void *__end_addr,
>                              unsigned long flags)
> diff -puN kernel/fork.c~kaiser-no-process-stacks kernel/fork.c
> --- a/kernel/fork.c~kaiser-no-process-stacks    2017-11-02 11:08:33.103722857 -0700
> +++ b/kernel/fork.c     2017-11-03 14:19:35.720775925 -0700
> @@ -248,8 +248,6 @@ static unsigned long *alloc_thread_stack
>   static inline void free_thread_stack(struct task_struct *tsk)
>  {
> -       kaiser_remove_mapping((unsigned long)tsk->stack, THREAD_SIZE);
> -
>  #ifdef CONFIG_VMAP_STACK
>         if (task_stack_vm_area(tsk)) {
>                 int i;
> @@ -539,9 +537,6 @@ static struct task_struct *dup_task_stru
>          * functions again.
>          */
>         tsk->stack = stack;
> -       err = kaiser_map_stack(tsk);
> -       if (err)
> -               goto free_stack;
>  #ifdef CONFIG_VMAP_STACK
>         tsk->stack_vm_area = stack_vm_area;
>  #endif
> diff -puN arch/x86/entry/entry_64.S~kaiser-no-process-stacks arch/x86/entry/entry_64.S
> --- a/arch/x86/entry/entry_64.S~kaiser-no-process-stacks        2017-11-02 11:31:30.904900271 -0700
> +++ b/arch/x86/entry/entry_64.S 2017-11-03 11:20:26.825735849 -0700
> @@ -145,6 +145,9 @@ ENTRY(entry_SYSCALL_64)
>         swapgs
>         movq    %rsp, PER_CPU_VAR(rsp_scratch)
> +
> +       SWITCH_TO_KERNEL_CR3_SYSCALL
> +
>         movq    PER_CPU_VAR(cpu_current_top_of_stack), %rsp
>         /* Construct struct pt_regs on stack */
> @@ -169,8 +172,6 @@ GLOBAL(entry_SYSCALL_64_after_hwframe)
>         /* NB: right here, all regs except r11 are live. */
>  -      SWITCH_TO_KERNEL_CR3 scratch_reg=%r11
> -
>         /* Must wait until we have the kernel CR3 to call C functions: */
>         TRACE_IRQS_OFF
>  @@ -1270,6 +1271,7 @@ ENTRY(error_entry)
>          * gsbase and proceed.  We'll fix up the exception and land in
>          * .Lgs_change's error handler with kernel gsbase.
>          */
> +       SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
>         SWAPGS
>         jmp .Lerror_entry_done
>  @@ -1379,6 +1381,7 @@ ENTRY(nmi)
>         swapgs
>         cld
> +       SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx
>         movq    %rsp, %rdx
>         movq    PER_CPU_VAR(cpu_current_top_of_stack), %rsp
>         UNWIND_HINT_IRET_REGS base=%rdx offset=8
> @@ -1407,7 +1410,6 @@ ENTRY(nmi)
>         UNWIND_HINT_REGS
>         ENCODE_FRAME_POINTER
>  -      SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
>         /*
>          * At this point we no longer need to worry about stack damage
>          * due to nesting -- we're on the normal thread stack and we're
> diff -puN arch/x86/entry/calling.h~kaiser-no-process-stacks arch/x86/entry/calling.h
> --- a/arch/x86/entry/calling.h~kaiser-no-process-stacks 2017-11-02 12:19:37.999595414 -0700
> +++ b/arch/x86/entry/calling.h  2017-11-02 12:24:14.784007027 -0700
> @@ -3,6 +3,7 @@
>  #include <asm/cpufeatures.h>
>  #include <asm/page_types.h>
>  #include <asm/pgtable_types.h>
> +#include <asm/percpu.h>
>   /*
>  @@ -214,6 +215,18 @@ For 32-bit we have the following convent
>         mov     \scratch_reg, %cr3
>  .endm
>  +/*
> + * At syscall entry, we do not have a register to clobber and
> + * do not yet have a stack.  Even if we did, we no longer map
> + * the process stack, so it does us no good.  Just use some
> + * percpu space to stash r11.
> + */
> +.macro SWITCH_TO_KERNEL_CR3_SYSCALL
> +       movq    %r11, PER_CPU_VAR(syscall_r11_tmp)
> +       SWITCH_TO_KERNEL_CR3 scratch_reg=%r11
> +       movq    PER_CPU_VAR(syscall_r11_tmp), %r11
> +.endm
> +
>  .macro SWITCH_TO_USER_CR3 scratch_reg:req
>         mov     %cr3, \scratch_reg
>         ADJUST_USER_CR3 \scratch_reg
> @@ -254,6 +267,8 @@ For 32-bit we have the following convent
>   .macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
>  .endm
> +.macro SWITCH_TO_KERNEL_CR3_SYSCALL
> +.endm
>  .macro SWITCH_TO_USER_CR3 scratch_reg:req
>  .endm
>  .macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
> diff -puN arch/x86/kernel/traps.c~kaiser-no-process-stacks arch/x86/kernel/traps.c
> --- a/arch/x86/kernel/traps.c~kaiser-no-process-stacks  2017-11-02 12:46:27.978440257 -0700
> +++ b/arch/x86/kernel/traps.c   2017-11-03 14:34:21.658546483 -0700
> @@ -329,6 +329,43 @@ __visible void __noreturn handle_stack_o
>  }
>  #endif
>  +/*
> + * This "fakes" a #GP from userspace upon returning (iret'ing)
> + * from this double fault.
> + */
> +void setup_fake_gp_at_iret(struct pt_regs *regs)
> +{
> +       unsigned long *new_stack_top = (unsigned long *)
> +               (this_cpu_read(cpu_tss.x86_tss.ist[0]) - 0x1500);
> +
> +       /*
> +        * Set up a stack just like the hardware would for a #GP.
> +        *
> +        * This format is an "iret frame", plus the error code
> +        * that the hardware puts on the stack for us for
> +        * exceptions.  (see struct pt_regs).
> +        */
> +       new_stack_top[-1] = regs->ss;
> +       new_stack_top[-2] = regs->sp;
> +       new_stack_top[-3] = regs->flags;
> +       new_stack_top[-4] = regs->cs;
> +       new_stack_top[-5] = regs->ip;
> +       new_stack_top[-6] = 0;  /* faked #GP error code */
> +
> +       /*
> +        * 'regs' points to the "iret frame" for *this*
> +        * exception, *not* the #GP we are faking.  Here,
> +        * we are telling 'iret' to jump to general_protection
> +        * when returning from this double fault.
> +        */
> +       regs->ip = (unsigned long)general_protection;
> +       /*
> +        * Make iret move the stack to the "fake #GP" stack
> +        * we created above.
> +        */
> +       regs->sp = (unsigned long)&new_stack_top[-6];
> +}
> +
>  #ifdef CONFIG_X86_64
>  /* Runs on IST stack */
>  dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
> @@ -354,14 +391,7 @@ dotraplinkage void do_double_fault(struc
>                 regs->cs == __KERNEL_CS &&
>                 regs->ip == (unsigned long)native_irq_return_iret)
>         {
> -               struct pt_regs *normal_regs = task_pt_regs(current);
> -
> -               /* Fake a #GP(0) from userspace. */
> -               memmove(&normal_regs->ip, (void *)regs->sp, 5*8);
> -               normal_regs->orig_ax = 0;  /* Missing (lost) #GP error code */
> -               regs->ip = (unsigned long)general_protection;
> -               regs->sp = (unsigned long)&normal_regs->orig_ax;
> -
> +               setup_fake_gp_at_iret(regs);
>                 return;
>         }
>  #endif
> diff -puN arch/x86/include/asm/kaiser.h~kaiser-no-process-stacks arch/x86/include/asm/kaiser.h
> --- a/arch/x86/include/asm/kaiser.h~kaiser-no-process-stacks    2017-11-03 14:22:37.594995310 -0700
> +++ b/arch/x86/include/asm/kaiser.h     2017-11-03 14:22:40.848142333 -0700
> @@ -33,8 +33,6 @@
>  extern int kaiser_add_mapping(unsigned long addr, unsigned long size,
>                               unsigned long flags);
>  -extern int kaiser_map_stack(struct task_struct *tsk);
> -
>  /**
>   *  kaiser_remove_mapping - remove a kernel mapping from the userpage tables
>   *  @addr: the start address of the range
> diff -puN include/linux/kaiser.h~kaiser-no-process-stacks include/linux/kaiser.h
> --- a/include/linux/kaiser.h~kaiser-no-process-stacks   2017-11-03 14:22:37.654998020 -0700
> +++ b/include/linux/kaiser.h    2017-11-03 14:22:48.646494777 -0700
> @@ -11,11 +11,6 @@
>   * disabled.
>   */
>  -static inline int kaiser_map_stack(struct task_struct *tsk)
> -{
> -       return 0;
> -}
> -
>  static inline void kaiser_init(void)
>  {
>  }
> _

      reply	other threads:[~2017-11-04 11:25 UTC|newest]

Thread overview: 2+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-11-03 22:03 [RFC][PATCH] x86, kaiser: do not require mapping process kernel stacks Dave Hansen
2017-11-04 11:24 ` Andy Lutomirski [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=CALCETrXiM1kVJ_bkBqyX2GbB+raA6XOK-BgO5n+aJMXMxeUYUg@mail.gmail.com \
    --to=luto@kernel.org \
    --cc=bp@alien8.de \
    --cc=daniel.gruss@iaik.tugraz.at \
    --cc=dave.hansen@intel.com \
    --cc=hughd@google.com \
    --cc=keescook@google.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=michael.schwarz@iaik.tugraz.at \
    --cc=mingo@kernel.org \
    --cc=moritz.lipp@iaik.tugraz.at \
    --cc=richard.fellner@student.tugraz.at \
    --cc=torvalds@linux-foundation.org \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.