From mboxrd@z Thu Jan 1 00:00:00 1970
Return-Path:
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
	id S1753129AbcHOLzH (ORCPT );
	Mon, 15 Aug 2016 07:55:07 -0400
Received: from mail-wm0-f66.google.com ([74.125.82.66]:36830 "EHLO
	mail-wm0-f66.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
	with ESMTP id S1752957AbcHOLzF (ORCPT );
	Mon, 15 Aug 2016 07:55:05 -0400
Date: Mon, 15 Aug 2016 13:55:02 +0200
From: Michal Hocko
To: Andy Lutomirski
Cc: x86@kernel.org, Borislav Petkov, linux-kernel@vger.kernel.org,
	Brian Gerst, Oleg Nesterov
Subject: Re: [PATCH v6 1/3] fork: Add generic vmalloced stack support
Message-ID: <20160815115502.GC3360@dhcp22.suse.cz>
References: <14c07d4fd173a5b117f51e8b939f9f4323e39899.1470907718.git.luto@kernel.org>
MIME-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
In-Reply-To: <14c07d4fd173a5b117f51e8b939f9f4323e39899.1470907718.git.luto@kernel.org>
User-Agent: Mutt/1.6.0 (2016-04-01)
Sender: linux-kernel-owner@vger.kernel.org
List-ID:
X-Mailing-List: linux-kernel@vger.kernel.org

On Thu 11-08-16 02:35:21, Andy Lutomirski wrote:
> If CONFIG_VMAP_STACK is selected, kernel stacks are allocated with
> vmalloc_node.
>
> grsecurity has had a similar feature (called
> GRKERNSEC_KSTACKOVERFLOW) for a long time.
>
> Cc: Oleg Nesterov
> Signed-off-by: Andy Lutomirski

Looks good to me. FWIW
Acked-by: Michal Hocko

> ---
>  arch/Kconfig                        | 34 +++++++++++++
>  arch/ia64/include/asm/thread_info.h |  2 +-
>  include/linux/sched.h               | 15 ++++++
>  kernel/fork.c                       | 96 +++++++++++++++++++++++++++--------
>  4 files changed, 126 insertions(+), 21 deletions(-)
>
> diff --git a/arch/Kconfig b/arch/Kconfig
> index bd8056b5b246..2a7f5b62e716 100644
> --- a/arch/Kconfig
> +++ b/arch/Kconfig
> @@ -698,4 +698,38 @@ config ARCH_NO_COHERENT_DMA_MMAP
>  config CPU_NO_EFFICIENT_FFS
>  	def_bool n
>
> +config HAVE_ARCH_VMAP_STACK
> +	def_bool n
> +	help
> +	  An arch should select this symbol if it can support kernel stacks
> +	  in vmalloc space. This means:
> +
> +	  - vmalloc space must be large enough to hold many kernel stacks.
> +	    This may rule out many 32-bit architectures.
> +
> +	  - Stacks in vmalloc space need to work reliably. For example, if
> +	    vmap page tables are created on demand, either this mechanism
> +	    needs to work while the stack points to a virtual address with
> +	    unpopulated page tables or arch code (switch_to and switch_mm,
> +	    most likely) needs to ensure that the stack's page table entries
> +	    are populated before running on a possibly unpopulated stack.
> +
> +	  - If the stack overflows into a guard page, something reasonable
> +	    should happen. The definition of "reasonable" is flexible, but
> +	    instantly rebooting without logging anything would be unfriendly.
> +
> +config VMAP_STACK
> +	default y
> +	bool "Use a virtually-mapped stack"
> +	depends on HAVE_ARCH_VMAP_STACK && !KASAN
> +	---help---
> +	  Enable this if you want to use virtually-mapped kernel stacks
> +	  with guard pages. This causes kernel stack overflows to be
> +	  caught immediately rather than causing difficult-to-diagnose
> +	  corruption.
> +
> +	  This is presently incompatible with KASAN because KASAN expects
> +	  the stack to map directly to the KASAN shadow map using a formula
> +	  that is incorrect if the stack is in vmalloc space.
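
As an aside on the KASAN dependency, since it tends to raise questions:
KASAN translates every address to its shadow with a fixed linear formula.
The helper in mm/kasan looks essentially like the sketch below; the scale
shift and offset are per-arch configuration constants, so treat this as
illustrative rather than exact:

	/*
	 * Sketch of KASAN's address-to-shadow translation: shift the
	 * address right by the shadow scale and add a constant offset.
	 */
	static inline void *kasan_mem_to_shadow(const void *addr)
	{
		return (void *)((unsigned long)addr >> KASAN_SHADOW_SCALE_SHIFT)
			+ KASAN_SHADOW_OFFSET;
	}

Stack instrumentation needs usable shadow memory behind the result, which
is only guaranteed for the stack placements KASAN was sized for, not for a
stack sitting in vmalloc space; hence the !KASAN dependency above.
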
> +
>  source "kernel/gcov/Kconfig"
> diff --git a/arch/ia64/include/asm/thread_info.h b/arch/ia64/include/asm/thread_info.h
> index 29bd59790d6c..c7026429816b 100644
> --- a/arch/ia64/include/asm/thread_info.h
> +++ b/arch/ia64/include/asm/thread_info.h
> @@ -56,7 +56,7 @@ struct thread_info {
>  #define alloc_thread_stack_node(tsk, node)	((unsigned long *) 0)
>  #define task_thread_info(tsk)	((struct thread_info *) 0)
>  #endif
> -#define free_thread_stack(ti)	/* nothing */
> +#define free_thread_stack(tsk)	/* nothing */
>  #define task_stack_page(tsk)	((void *)(tsk))
>
>  #define __HAVE_THREAD_FUNCTIONS
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 62c68e513e39..20f9f47bcfd0 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1923,6 +1923,9 @@ struct task_struct {
>  #ifdef CONFIG_MMU
>  	struct task_struct *oom_reaper_list;
>  #endif
> +#ifdef CONFIG_VMAP_STACK
> +	struct vm_struct *stack_vm_area;
> +#endif
>  /* CPU-specific state of this task */
>  	struct thread_struct thread;
>  /*
> @@ -1939,6 +1942,18 @@ extern int arch_task_struct_size __read_mostly;
>  # define arch_task_struct_size (sizeof(struct task_struct))
>  #endif
>
> +#ifdef CONFIG_VMAP_STACK
> +static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
> +{
> +	return t->stack_vm_area;
> +}
> +#else
> +static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
> +{
> +	return NULL;
> +}
> +#endif
> +
>  /* Future-safe accessor for struct task_struct's cpus_allowed. */
>  #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
>
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 52e725d4a866..05f7ef796fb4 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -158,19 +158,39 @@ void __weak arch_release_thread_stack(unsigned long *stack)
>   * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
>   * kmemcache based allocator.
>   */
> -# if THREAD_SIZE >= PAGE_SIZE
> -static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
> -						  int node)
> +# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)
> +static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
>  {
> +#ifdef CONFIG_VMAP_STACK
> +	void *stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE,
> +					   VMALLOC_START, VMALLOC_END,
> +					   THREADINFO_GFP | __GFP_HIGHMEM,
> +					   PAGE_KERNEL,
> +					   0, node,
> +					   __builtin_return_address(0));
> +
> +	/*
> +	 * We can't call find_vm_area() in interrupt context, and
> +	 * free_thread_stack can be called in interrupt context, so cache
> +	 * the vm_struct.
> +	 */
> +	if (stack)
> +		tsk->stack_vm_area = find_vm_area(stack);
> +	return stack;
> +#else
>  	struct page *page = alloc_pages_node(node, THREADINFO_GFP,
>  					     THREAD_SIZE_ORDER);
>
>  	return page ? page_address(page) : NULL;
> +#endif
>  }
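
The payoff of the vmalloc path above is the guard page that vmap
allocations get by default: running off the end of the stack faults
immediately instead of silently corrupting whatever happens to sit next
to it. A deliberately broken, purely illustrative snippet (eat_stack is
hypothetical; only try something like this in a scratch VM):

	/*
	 * Hypothetical overflow test: unbounded recursion with a frame
	 * the compiler cannot elide. With CONFIG_VMAP_STACK this traps
	 * on the guard page with a clean oops; without it, it tramples
	 * the memory below the stack.
	 */
	static noinline void eat_stack(void)
	{
		volatile char frame[128];

		frame[0] = 1;
		eat_stack();
		frame[0] = 2;	/* keep the frame live: no tail call */
	}
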
>
> -static inline void free_thread_stack(unsigned long *stack)
> +static inline void free_thread_stack(struct task_struct *tsk)
>  {
> -	__free_pages(virt_to_page(stack), THREAD_SIZE_ORDER);
> +	if (task_stack_vm_area(tsk))
> +		vfree(tsk->stack);
> +	else
> +		__free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER);
>  }
>  # else
>  static struct kmem_cache *thread_stack_cache;
> @@ -181,9 +201,9 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
>  	return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
>  }
>
> -static void free_thread_stack(unsigned long *stack)
> +static void free_thread_stack(struct task_struct *tsk)
>  {
> -	kmem_cache_free(thread_stack_cache, stack);
> +	kmem_cache_free(thread_stack_cache, tsk->stack);
>  }
>
>  void thread_stack_cache_init(void)
> @@ -213,24 +233,47 @@ struct kmem_cache *vm_area_cachep;
>  /* SLAB cache for mm_struct structures (tsk->mm) */
>  static struct kmem_cache *mm_cachep;
>
> -static void account_kernel_stack(unsigned long *stack, int account)
> +static void account_kernel_stack(struct task_struct *tsk, int account)
>  {
> -	/* All stack pages are in the same zone and belong to the same memcg. */
> -	struct page *first_page = virt_to_page(stack);
> +	void *stack = task_stack_page(tsk);
> +	struct vm_struct *vm = task_stack_vm_area(tsk);
> +
> +	BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
> +
> +	if (vm) {
> +		int i;
> +
> +		BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
> +
> +		for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
> +			mod_zone_page_state(page_zone(vm->pages[i]),
> +					    NR_KERNEL_STACK_KB,
> +					    PAGE_SIZE / 1024 * account);
> +		}
>
> -	mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
> -			    THREAD_SIZE / 1024 * account);
> +		/* All stack pages belong to the same memcg. */
> +		memcg_kmem_update_page_stat(vm->pages[0], MEMCG_KERNEL_STACK_KB,
> +					    account * (THREAD_SIZE / 1024));
> +	} else {
> +		/*
> +		 * All stack pages are in the same zone and belong to the
> +		 * same memcg.
> +		 */
> +		struct page *first_page = virt_to_page(stack);
> +
> +		mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
> +				    THREAD_SIZE / 1024 * account);
>
> -	memcg_kmem_update_page_stat(
> -		first_page, MEMCG_KERNEL_STACK_KB,
> -		account * (THREAD_SIZE / 1024));
> +		memcg_kmem_update_page_stat(first_page, MEMCG_KERNEL_STACK_KB,
> +					    account * (THREAD_SIZE / 1024));
> +	}
>  }
>
>  void free_task(struct task_struct *tsk)
>  {
> -	account_kernel_stack(tsk->stack, -1);
> +	account_kernel_stack(tsk, -1);
>  	arch_release_thread_stack(tsk->stack);
> -	free_thread_stack(tsk->stack);
> +	free_thread_stack(tsk);
>  	rt_mutex_debug_task_free(tsk);
>  	ftrace_graph_exit_task(tsk);
>  	put_seccomp_filter(tsk);
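
One ordering detail in the next hunk deserves a remark: the generic
arch_dup_task_struct() is a whole-structure copy, so it clobbers
tsk->stack (and now tsk->stack_vm_area) with orig's values. For
reference, the default __weak implementation in kernel/fork.c is
essentially:

	/* Default implementation; architectures may override it. */
	int __weak arch_dup_task_struct(struct task_struct *dst,
					struct task_struct *src)
	{
		*dst = *src;
		return 0;
	}

That is why the patch re-initializes both fields immediately after the
call and before the err check, so free_thread_stack(tsk) on the error
path already sees the right stack.
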
> @@ -342,6 +385,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
>  {
>  	struct task_struct *tsk;
>  	unsigned long *stack;
> +	struct vm_struct *stack_vm_area;
>  	int err;
>
>  	if (node == NUMA_NO_NODE)
> @@ -354,11 +398,23 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
>  	if (!stack)
>  		goto free_tsk;
>
> +	stack_vm_area = task_stack_vm_area(tsk);
> +
>  	err = arch_dup_task_struct(tsk, orig);
> +
> +	/*
> +	 * arch_dup_task_struct clobbers the stack-related fields. Make
> +	 * sure they're properly initialized before using any stack-related
> +	 * functions again.
> +	 */
> +	tsk->stack = stack;
> +#ifdef CONFIG_VMAP_STACK
> +	tsk->stack_vm_area = stack_vm_area;
> +#endif
> +
>  	if (err)
>  		goto free_stack;
>
> -	tsk->stack = stack;
>  #ifdef CONFIG_SECCOMP
>  	/*
>  	 * We must handle setting up seccomp filters once we're under
> @@ -390,14 +446,14 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
>  	tsk->task_frag.page = NULL;
>  	tsk->wake_q.next = NULL;
>
> -	account_kernel_stack(stack, 1);
> +	account_kernel_stack(tsk, 1);
>
>  	kcov_task_init(tsk);
>
>  	return tsk;
>
> free_stack:
> -	free_thread_stack(stack);
> +	free_thread_stack(tsk);
> free_tsk:
>  	free_task_struct(tsk);
>  	return NULL;
> --
> 2.7.4

--
Michal Hocko
SUSE Labs