From mboxrd@z Thu Jan 1 00:00:00 1970
Return-Path:
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
	id S1753129AbcHOLzH (ORCPT );
	Mon, 15 Aug 2016 07:55:07 -0400
Received: from mail-wm0-f66.google.com ([74.125.82.66]:36830 "EHLO
	mail-wm0-f66.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
	with ESMTP id S1752957AbcHOLzF (ORCPT );
	Mon, 15 Aug 2016 07:55:05 -0400
Date: Mon, 15 Aug 2016 13:55:02 +0200
From: Michal Hocko
To: Andy Lutomirski
Cc: x86@kernel.org, Borislav Petkov, linux-kernel@vger.kernel.org,
	Brian Gerst, Oleg Nesterov
Subject: Re: [PATCH v6 1/3] fork: Add generic vmalloced stack support
Message-ID: <20160815115502.GC3360@dhcp22.suse.cz>
References: <14c07d4fd173a5b117f51e8b939f9f4323e39899.1470907718.git.luto@kernel.org>
MIME-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
In-Reply-To: <14c07d4fd173a5b117f51e8b939f9f4323e39899.1470907718.git.luto@kernel.org>
User-Agent: Mutt/1.6.0 (2016-04-01)
Sender: linux-kernel-owner@vger.kernel.org
List-ID:
X-Mailing-List: linux-kernel@vger.kernel.org

On Thu 11-08-16 02:35:21, Andy Lutomirski wrote:
> If CONFIG_VMAP_STACK is selected, kernel stacks are allocated with
> vmalloc_node.
>
> grsecurity has had a similar feature (called
> GRKERNSEC_KSTACKOVERFLOW) for a long time.
>
> Cc: Oleg Nesterov
> Signed-off-by: Andy Lutomirski

Looks good to me. FWIW
Acked-by: Michal Hocko

> ---
>  arch/Kconfig                        | 34 +++++++++++++
>  arch/ia64/include/asm/thread_info.h |  2 +-
>  include/linux/sched.h               | 15 ++++++
>  kernel/fork.c                       | 96 +++++++++++++++++++++++++++--------
>  4 files changed, 126 insertions(+), 21 deletions(-)
>
> diff --git a/arch/Kconfig b/arch/Kconfig
> index bd8056b5b246..2a7f5b62e716 100644
> --- a/arch/Kconfig
> +++ b/arch/Kconfig
> @@ -698,4 +698,38 @@ config ARCH_NO_COHERENT_DMA_MMAP
>  config CPU_NO_EFFICIENT_FFS
>  	def_bool n
>
> +config HAVE_ARCH_VMAP_STACK
> +	def_bool n
> +	help
> +	  An arch should select this symbol if it can support kernel stacks
> +	  in vmalloc space. This means:
> +
> +	  - vmalloc space must be large enough to hold many kernel stacks.
> +	    This may rule out many 32-bit architectures.
> +
> +	  - Stacks in vmalloc space need to work reliably. For example, if
> +	    vmap page tables are created on demand, either this mechanism
> +	    needs to work while the stack points to a virtual address with
> +	    unpopulated page tables or arch code (switch_to and switch_mm,
> +	    most likely) needs to ensure that the stack's page table entries
> +	    are populated before running on a possibly unpopulated stack.
> +
> +	  - If the stack overflows into a guard page, something reasonable
> +	    should happen. The definition of "reasonable" is flexible, but
> +	    instantly rebooting without logging anything would be unfriendly.
> +
> +config VMAP_STACK
> +	default y
> +	bool "Use a virtually-mapped stack"
> +	depends on HAVE_ARCH_VMAP_STACK && !KASAN
> +	---help---
> +	  Enable this if you want to use virtually-mapped kernel stacks
> +	  with guard pages. This causes kernel stack overflows to be
> +	  caught immediately rather than causing difficult-to-diagnose
> +	  corruption.
> +
> +	  This is presently incompatible with KASAN because KASAN expects
> +	  the stack to map directly to the KASAN shadow map using a formula
> +	  that is incorrect if the stack is in vmalloc space.
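
As an aside on the KASAN dependency, since it tends to raise questions:
KASAN translates every address to its shadow with a fixed linear formula.
The helper in mm/kasan looks essentially like the sketch below; the scale
shift and offset are per-arch configuration constants, so treat this as
illustrative rather than exact:

	/*
	 * Sketch of KASAN's address-to-shadow translation: shift the
	 * address right by the shadow scale and add a constant offset.
	 */
	static inline void *kasan_mem_to_shadow(const void *addr)
	{
		return (void *)((unsigned long)addr >> KASAN_SHADOW_SCALE_SHIFT)
			+ KASAN_SHADOW_OFFSET;
	}

Stack instrumentation needs usable shadow memory behind the result, which
is only guaranteed for the stack placements KASAN was sized for, not for a
stack sitting in vmalloc space; hence the !KASAN dependency above.
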
> +
>  source "kernel/gcov/Kconfig"
> diff --git a/arch/ia64/include/asm/thread_info.h b/arch/ia64/include/asm/thread_info.h
> index 29bd59790d6c..c7026429816b 100644
> --- a/arch/ia64/include/asm/thread_info.h
> +++ b/arch/ia64/include/asm/thread_info.h
> @@ -56,7 +56,7 @@ struct thread_info {
>  #define alloc_thread_stack_node(tsk, node)	((unsigned long *) 0)
>  #define task_thread_info(tsk)	((struct thread_info *) 0)
>  #endif
> -#define free_thread_stack(ti)	/* nothing */
> +#define free_thread_stack(tsk)	/* nothing */
>  #define task_stack_page(tsk)	((void *)(tsk))
>
>  #define __HAVE_THREAD_FUNCTIONS
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 62c68e513e39..20f9f47bcfd0 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1923,6 +1923,9 @@ struct task_struct {
>  #ifdef CONFIG_MMU
>  	struct task_struct *oom_reaper_list;
>  #endif
> +#ifdef CONFIG_VMAP_STACK
> +	struct vm_struct *stack_vm_area;
> +#endif
>  /* CPU-specific state of this task */
>  	struct thread_struct thread;
>  /*
> @@ -1939,6 +1942,18 @@ extern int arch_task_struct_size __read_mostly;
>  # define arch_task_struct_size (sizeof(struct task_struct))
>  #endif
>
> +#ifdef CONFIG_VMAP_STACK
> +static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
> +{
> +	return t->stack_vm_area;
> +}
> +#else
> +static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
> +{
> +	return NULL;
> +}
> +#endif
> +
>  /* Future-safe accessor for struct task_struct's cpus_allowed. */
>  #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
>
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 52e725d4a866..05f7ef796fb4 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -158,19 +158,39 @@ void __weak arch_release_thread_stack(unsigned long *stack)
>   * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
>   * kmemcache based allocator.
>   */
> -# if THREAD_SIZE >= PAGE_SIZE
> -static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
> -						  int node)
> +# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)
> +static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
>  {
> +#ifdef CONFIG_VMAP_STACK
> +	void *stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE,
> +					   VMALLOC_START, VMALLOC_END,
> +					   THREADINFO_GFP | __GFP_HIGHMEM,
> +					   PAGE_KERNEL,
> +					   0, node,
> +					   __builtin_return_address(0));
> +
> +	/*
> +	 * We can't call find_vm_area() in interrupt context, and
> +	 * free_thread_stack can be called in interrupt context, so cache
> +	 * the vm_struct.
> +	 */
> +	if (stack)
> +		tsk->stack_vm_area = find_vm_area(stack);
> +	return stack;
> +#else
>  	struct page *page = alloc_pages_node(node, THREADINFO_GFP,
>  					     THREAD_SIZE_ORDER);
>
>  	return page ? page_address(page) : NULL;
> +#endif
>  }
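
The payoff of the vmalloc path above is the guard page that vmap
allocations get by default: running off the end of the stack faults
immediately instead of silently corrupting whatever happens to sit next
to it. A deliberately broken, purely illustrative snippet (eat_stack is
hypothetical; only try something like this in a scratch VM):

	/*
	 * Hypothetical overflow test: unbounded recursion with a frame
	 * the compiler cannot elide. With CONFIG_VMAP_STACK this traps
	 * on the guard page with a clean oops; without it, it tramples
	 * the memory below the stack.
	 */
	static noinline void eat_stack(void)
	{
		volatile char frame[128];

		frame[0] = 1;
		eat_stack();
		frame[0] = 2;	/* keep the frame live: no tail call */
	}
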
>
> -static inline void free_thread_stack(unsigned long *stack)
> +static inline void free_thread_stack(struct task_struct *tsk)
>  {
> -	__free_pages(virt_to_page(stack), THREAD_SIZE_ORDER);
> +	if (task_stack_vm_area(tsk))
> +		vfree(tsk->stack);
> +	else
> +		__free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER);
>  }
>  # else
>  static struct kmem_cache *thread_stack_cache;
> @@ -181,9 +201,9 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
>  	return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
>  }
>
> -static void free_thread_stack(unsigned long *stack)
> +static void free_thread_stack(struct task_struct *tsk)
>  {
> -	kmem_cache_free(thread_stack_cache, stack);
> +	kmem_cache_free(thread_stack_cache, tsk->stack);
>  }
>
>  void thread_stack_cache_init(void)
> @@ -213,24 +233,47 @@ struct kmem_cache *vm_area_cachep;
>  /* SLAB cache for mm_struct structures (tsk->mm) */
>  static struct kmem_cache *mm_cachep;
>
> -static void account_kernel_stack(unsigned long *stack, int account)
> +static void account_kernel_stack(struct task_struct *tsk, int account)
>  {
> -	/* All stack pages are in the same zone and belong to the same memcg. */
> -	struct page *first_page = virt_to_page(stack);
> +	void *stack = task_stack_page(tsk);
> +	struct vm_struct *vm = task_stack_vm_area(tsk);
> +
> +	BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
> +
> +	if (vm) {
> +		int i;
> +
> +		BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
> +
> +		for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
> +			mod_zone_page_state(page_zone(vm->pages[i]),
> +					    NR_KERNEL_STACK_KB,
> +					    PAGE_SIZE / 1024 * account);
> +		}
>
> -	mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
> -			    THREAD_SIZE / 1024 * account);
> +		/* All stack pages belong to the same memcg. */
> +		memcg_kmem_update_page_stat(vm->pages[0], MEMCG_KERNEL_STACK_KB,
> +					    account * (THREAD_SIZE / 1024));
> +	} else {
> +		/*
> +		 * All stack pages are in the same zone and belong to the
> +		 * same memcg.
> +		 */
> +		struct page *first_page = virt_to_page(stack);
> +
> +		mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
> +				    THREAD_SIZE / 1024 * account);
>
> -	memcg_kmem_update_page_stat(
> -		first_page, MEMCG_KERNEL_STACK_KB,
> -		account * (THREAD_SIZE / 1024));
> +		memcg_kmem_update_page_stat(first_page, MEMCG_KERNEL_STACK_KB,
> +					    account * (THREAD_SIZE / 1024));
> +	}
>  }
>
>  void free_task(struct task_struct *tsk)
>  {
> -	account_kernel_stack(tsk->stack, -1);
> +	account_kernel_stack(tsk, -1);
>  	arch_release_thread_stack(tsk->stack);
> -	free_thread_stack(tsk->stack);
> +	free_thread_stack(tsk);
>  	rt_mutex_debug_task_free(tsk);
>  	ftrace_graph_exit_task(tsk);
>  	put_seccomp_filter(tsk);
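
One ordering detail in the next hunk deserves a remark: the generic
arch_dup_task_struct() is a whole-structure copy, so it clobbers
tsk->stack (and now tsk->stack_vm_area) with orig's values. For
reference, the default __weak implementation in kernel/fork.c is
essentially:

	/* Default implementation; architectures may override it. */
	int __weak arch_dup_task_struct(struct task_struct *dst,
					struct task_struct *src)
	{
		*dst = *src;
		return 0;
	}

That is why the patch re-initializes both fields immediately after the
call and before the err check, so free_thread_stack(tsk) on the error
path already sees the right stack.
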
> @@ -342,6 +385,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
>  {
>  	struct task_struct *tsk;
>  	unsigned long *stack;
> +	struct vm_struct *stack_vm_area;
>  	int err;
>
>  	if (node == NUMA_NO_NODE)
> @@ -354,11 +398,23 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
>  	if (!stack)
>  		goto free_tsk;
>
> +	stack_vm_area = task_stack_vm_area(tsk);
> +
>  	err = arch_dup_task_struct(tsk, orig);
> +
> +	/*
> +	 * arch_dup_task_struct clobbers the stack-related fields. Make
> +	 * sure they're properly initialized before using any stack-related
> +	 * functions again.
> +	 */
> +	tsk->stack = stack;
> +#ifdef CONFIG_VMAP_STACK
> +	tsk->stack_vm_area = stack_vm_area;
> +#endif
> +
>  	if (err)
>  		goto free_stack;
>
> -	tsk->stack = stack;
>  #ifdef CONFIG_SECCOMP
>  	/*
>  	 * We must handle setting up seccomp filters once we're under
> @@ -390,14 +446,14 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
>  	tsk->task_frag.page = NULL;
>  	tsk->wake_q.next = NULL;
>
> -	account_kernel_stack(stack, 1);
> +	account_kernel_stack(tsk, 1);
>
>  	kcov_task_init(tsk);
>
>  	return tsk;
>
> free_stack:
> -	free_thread_stack(stack);
> +	free_thread_stack(tsk);
> free_tsk:
>  	free_task_struct(tsk);
>  	return NULL;
> --
> 2.7.4

--
Michal Hocko
SUSE Labs