linux-kernel.vger.kernel.org archive mirror
* [PATCH] mm: Enable MAP_UNINITIALIZED for archs with mmu
@ 2012-01-18 18:51 Arun Sharma
  2012-01-19  2:42 ` KAMEZAWA Hiroyuki
  2012-02-23  7:45 ` Balbir Singh
  0 siblings, 2 replies; 14+ messages in thread
From: Arun Sharma @ 2012-01-18 18:51 UTC (permalink / raw)
  To: linux-kernel; +Cc: Arun Sharma, linux-mm, Balbir Singh, KAMEZAWA Hiroyuki, akpm

This enables malloc optimizations where we might
madvise(..,MADV_DONTNEED) a page only to fault it
back at a different virtual address.

To ensure that we don't leak sensitive data to
unprivileged processes, we enable this optimization
only for pages that are reused within a memory
cgroup.

The idea is to make this opt-in both at the mmap()
level and cgroup level so the default behavior is
unchanged after the patch.

TODO: Ask for a VM_UNINITIALIZED bit
TODO: Implement a cgroup level opt-in flag
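
For illustration only (not part of this patch): a userspace allocator
would opt in roughly as in the sketch below, assuming MAP_UNINITIALIZED
is visible in the mmap() flags (define it by hand if the installed
headers do not expose it):

#include <stddef.h>
#include <sys/mman.h>

#ifndef MAP_UNINITIALIZED
#define MAP_UNINITIALIZED 0x4000000	/* matches asm-generic/mman-common.h */
#endif

/* Grow an arena; pages may come back non-zeroed only if the kernel,
 * the cgroup and this mapping have all opted in. */
static void *arena_grow(size_t len)
{
	return mmap(NULL, len, PROT_READ | PROT_WRITE,
		    MAP_PRIVATE | MAP_ANONYMOUS | MAP_UNINITIALIZED, -1, 0);
}

/* Return unused pages; a later fault may reuse them without clearing. */
static void arena_release(void *addr, size_t len)
{
	madvise(addr, len, MADV_DONTNEED);
}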

To: linux-kernel@vger.kernel.org
Cc: linux-mm@kvack.org
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: akpm@linux-foundation.org
Signed-off-by: Arun Sharma <asharma@fb.com>
---
 include/asm-generic/mman-common.h |    6 +-----
 include/linux/highmem.h           |    6 ++++++
 include/linux/mm.h                |    2 ++
 include/linux/mman.h              |    1 +
 include/linux/page_cgroup.h       |   29 +++++++++++++++++++++++++++++
 init/Kconfig                      |    2 +-
 mm/mempolicy.c                    |   29 +++++++++++++++++++++++------
 7 files changed, 63 insertions(+), 12 deletions(-)

diff --git a/include/asm-generic/mman-common.h b/include/asm-generic/mman-common.h
index 787abbb..71e079f 100644
--- a/include/asm-generic/mman-common.h
+++ b/include/asm-generic/mman-common.h
@@ -19,11 +19,7 @@
 #define MAP_TYPE	0x0f		/* Mask for type of mapping */
 #define MAP_FIXED	0x10		/* Interpret addr exactly */
 #define MAP_ANONYMOUS	0x20		/* don't use a file */
-#ifdef CONFIG_MMAP_ALLOW_UNINITIALIZED
-# define MAP_UNINITIALIZED 0x4000000	/* For anonymous mmap, memory could be uninitialized */
-#else
-# define MAP_UNINITIALIZED 0x0		/* Don't support this flag */
-#endif
+#define MAP_UNINITIALIZED 0x4000000	/* For anonymous mmap, memory could be uninitialized */
 
 #define MS_ASYNC	1		/* sync memory asynchronously */
 #define MS_INVALIDATE	2		/* invalidate the caches */
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 3a93f73..caae922 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -4,6 +4,7 @@
 #include <linux/fs.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
+#include <linux/page_cgroup.h>
 #include <linux/uaccess.h>
 #include <linux/hardirq.h>
 
@@ -156,6 +157,11 @@ __alloc_zeroed_user_highpage(gfp_t movableflags,
 	struct page *page = alloc_page_vma(GFP_HIGHUSER | movableflags,
 			vma, vaddr);
 
+#ifdef CONFIG_MMAP_ALLOW_UNINITIALIZED
+	if (!page_needs_clearing(page, vma))
+		return page;
+#endif
+
 	if (page)
 		clear_user_highpage(page, vaddr);
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 4baadd1..c6bab01 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -118,6 +118,8 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_SAO		0x20000000	/* Strong Access Ordering (powerpc) */
 #define VM_PFN_AT_MMAP	0x40000000	/* PFNMAP vma that is fully mapped at mmap time */
 #define VM_MERGEABLE	0x80000000	/* KSM may merge identical pages */
+#define VM_UNINITIALIZED VM_SAO		/* Steal a powerpc bit for now, since we're out
+					   of bits for 32 bit archs */
 
 /* Bits set in the VMA until the stack is in its final location */
 #define VM_STACK_INCOMPLETE_SETUP	(VM_RAND_READ | VM_SEQ_READ)
diff --git a/include/linux/mman.h b/include/linux/mman.h
index 8b74e9b..9bef6c9 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -87,6 +87,7 @@ calc_vm_flag_bits(unsigned long flags)
 	return _calc_vm_trans(flags, MAP_GROWSDOWN,  VM_GROWSDOWN ) |
 	       _calc_vm_trans(flags, MAP_DENYWRITE,  VM_DENYWRITE ) |
 	       _calc_vm_trans(flags, MAP_EXECUTABLE, VM_EXECUTABLE) |
+	       _calc_vm_trans(flags, MAP_UNINITIALIZED, VM_UNINITIALIZED) |
 	       _calc_vm_trans(flags, MAP_LOCKED,     VM_LOCKED    );
 }
 #endif /* __KERNEL__ */
diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
index 961ecc7..e959869 100644
--- a/include/linux/page_cgroup.h
+++ b/include/linux/page_cgroup.h
@@ -155,6 +155,17 @@ static inline unsigned long page_cgroup_array_id(struct page_cgroup *pc)
 	return (pc->flags >> PCG_ARRAYID_SHIFT) & PCG_ARRAYID_MASK;
 }
 
+static int mm_match_cgroup(const struct mm_struct *mm,
+			   const struct mem_cgroup *cgroup);
+static inline bool page_seen_by_cgroup(struct page *page,
+				       const struct mm_struct *mm)
+{
+	struct page_cgroup *pcg = lookup_page_cgroup(page);
+	if (pcg == NULL)
+		return false;
+	return mm_match_cgroup(mm, pcg->mem_cgroup);
+}
+
 #else /* CONFIG_CGROUP_MEM_RES_CTLR */
 struct page_cgroup;
 
@@ -175,8 +186,26 @@ static inline void __init page_cgroup_init_flatmem(void)
 {
 }
 
+static inline bool page_seen_by_cgroup(struct page *page,
+				       const struct mm_struct *mm)
+{
+	return false;
+}
+
 #endif /* CONFIG_CGROUP_MEM_RES_CTLR */
 
+static inline bool vma_requests_uninitialized(struct vm_area_struct *vma)
+{
+	return vma && !vma->vm_file && vma->vm_flags & VM_UNINITIALIZED;
+}
+
+static inline bool page_needs_clearing(struct page *page,
+				       struct vm_area_struct *vma)
+{
+	return !(vma_requests_uninitialized(vma)
+		&& page_seen_by_cgroup(page, vma->vm_mm));
+}
+
 #include <linux/swap.h>
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
diff --git a/init/Kconfig b/init/Kconfig
index 43298f9..428e047 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1259,7 +1259,7 @@ endchoice
 
 config MMAP_ALLOW_UNINITIALIZED
 	bool "Allow mmapped anonymous memory to be uninitialized"
-	depends on EXPERT && !MMU
+	depends on EXPERT
 	default n
 	help
 	  Normally, and according to the Linux spec, anonymous memory obtained
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index c3fdbcb..7c9ab68 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -90,6 +90,7 @@
 #include <linux/syscalls.h>
 #include <linux/ctype.h>
 #include <linux/mm_inline.h>
+#include <linux/page_cgroup.h>
 
 #include <asm/tlbflush.h>
 #include <asm/uaccess.h>
@@ -1847,6 +1848,11 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
 	struct zonelist *zl;
 	struct page *page;
 
+#ifdef CONFIG_MMAP_ALLOW_UNINITIALIZED
+	if (vma_requests_uninitialized(vma))
+		gfp &= ~__GFP_ZERO;
+#endif
+
 	get_mems_allowed();
 	if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
 		unsigned nid;
@@ -1854,25 +1860,36 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
 		mpol_cond_put(pol);
 		page = alloc_page_interleave(gfp, order, nid);
-		put_mems_allowed();
-		return page;
+		goto out;
 	}
 	zl = policy_zonelist(gfp, pol, node);
 	if (unlikely(mpol_needs_cond_ref(pol))) {
 		/*
 		 * slow path: ref counted shared policy
 		 */
-		struct page *page =  __alloc_pages_nodemask(gfp, order,
-						zl, policy_nodemask(gfp, pol));
+		page =  __alloc_pages_nodemask(gfp, order,
+					       zl, policy_nodemask(gfp, pol));
 		__mpol_put(pol);
-		put_mems_allowed();
-		return page;
+		goto out;
 	}
+
 	/*
 	 * fast path:  default or task policy
 	 */
 	page = __alloc_pages_nodemask(gfp, order, zl,
 				      policy_nodemask(gfp, pol));
+
+out:
+#ifdef CONFIG_MMAP_ALLOW_UNINITIALIZED
+	if (page_needs_clearing(page, vma)) {
+		int i;
+		for (i = 0; i < (1 << order); i++) {
+			void *kaddr = kmap_atomic(page + i, KM_USER0);
+			clear_page(kaddr);
+			kunmap_atomic(kaddr, KM_USER0);
+		}
+	}
+#endif
 	put_mems_allowed();
 	return page;
 }
-- 
1.7.4


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH] mm: Enable MAP_UNINITIALIZED for archs with mmu
  2012-01-18 18:51 [PATCH] mm: Enable MAP_UNINITIALIZED for archs with mmu Arun Sharma
@ 2012-01-19  2:42 ` KAMEZAWA Hiroyuki
  2012-01-24  0:54   ` Arun Sharma
  2012-02-23  7:45 ` Balbir Singh
  1 sibling, 1 reply; 14+ messages in thread
From: KAMEZAWA Hiroyuki @ 2012-01-19  2:42 UTC (permalink / raw)
  To: Arun Sharma; +Cc: linux-kernel, linux-mm, Balbir Singh, akpm

On Wed, 18 Jan 2012 10:51:02 -0800
Arun Sharma <asharma@fb.com> wrote:

> This enables malloc optimizations where we might
> madvise(..,MADV_DONTNEED) a page only to fault it
> back at a different virtual address.
> 
> To ensure that we don't leak sensitive data to
> unprivileged processes, we enable this optimization
> only for pages that are reused within a memory
> cgroup.
> 
> The idea is to make this opt-in both at the mmap()
> level and cgroup level so the default behavior is
> unchanged after the patch.
> 
> TODO: Ask for a VM_UNINITIALIZED bit
> TODO: Implement a cgroup level opt-in flag
> 

Hmm, then,
1. a new task that jumps into this cgroup can see any uncleared data...
2. if a memcg pointer is reused, the information will be leaked.
3. If VM_UNINITIALIZED is set, the process can see any data which
   was freed by another process which doesn't know about VM_UNINITIALIZED at all.

4. The process will be able to see file cache data which it has no
   access rights to, if it was accessed by the memcg once.

3 & 4 seem too dangerous.

Isn't it better to have this as per-task rather than per-memcg?
And just allow reuse of pages that the task itself has freed?


Thanks,
-Kame



^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] mm: Enable MAP_UNINITIALIZED for archs with mmu
  2012-01-19  2:42 ` KAMEZAWA Hiroyuki
@ 2012-01-24  0:54   ` Arun Sharma
  2012-01-24  3:07     ` KAMEZAWA Hiroyuki
  0 siblings, 1 reply; 14+ messages in thread
From: Arun Sharma @ 2012-01-24  0:54 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: linux-kernel, linux-mm, Balbir Singh, akpm

On 1/18/12 6:42 PM, KAMEZAWA Hiroyuki wrote:
>
> Hmm, then,
> 1. a new task that jumps into this cgroup can see any uncleared data...
> 2. if a memcg pointer is reused, the information will be leaked.

You're suggesting mm_match_cgroup() is good enough for accounting 
purposes, but not usable for cases where it's important to get the 
equality right?

> 3. If VM_UNINITIALIZED is set, the process can see any data which
>     was freed by another process which doesn't know about VM_UNINITIALIZED at all.
>
> 4. The process will be able to see file cache data which it has no
>     access rights to, if it was accessed by the memcg once.
>
> 3 & 4 seem too dangerous.

Yes - these are the risks that I'm hoping we can document, so the 
cgroups admin can avoid opting-in if not everything running in the 
cgroup is trusted.

>
> Isn't it better to have this as per-task rather than per-memcg?
> And just allow reuse of pages that the task itself has freed?
>

I'm worrying that the additional complexity of maintaining a per-task 
page list would be a problem. It might slow down workloads that 
alloc/free a lot because of the added code. It'll probably touch the 
kswapd as well (for reclaiming pages from the per-task free lists under 
low mem conditions).

Did you have some implementation ideas which would not have the problems 
above?

  -Arun

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] mm: Enable MAP_UNINITIALIZED for archs with mmu
  2012-01-24  0:54   ` Arun Sharma
@ 2012-01-24  3:07     ` KAMEZAWA Hiroyuki
  2012-01-25  1:45       ` Arun Sharma
  0 siblings, 1 reply; 14+ messages in thread
From: KAMEZAWA Hiroyuki @ 2012-01-24  3:07 UTC (permalink / raw)
  To: Arun Sharma; +Cc: linux-kernel, linux-mm, Balbir Singh, akpm

On Mon, 23 Jan 2012 16:54:22 -0800
Arun Sharma <asharma@fb.com> wrote:

> On 1/18/12 6:42 PM, KAMEZAWA Hiroyuki wrote:
> >
> > Hmm, then,
> > 1. a new task that jumps into this cgroup can see any uncleared data...
> > 2. if a memcg pointer is reused, the information will be leaked.
> 
> You're suggesting mm_match_cgroup() is good enough for accounting 
> purposes, but not usable for cases where it's important to get the 
> equality right?
> 

I think there is no 100% solution for detecting reuse of an object.



> > 3. If VM_UNINITIALIZED is set, the process can see any data which
> >     was freed by another process which doesn't know about VM_UNINITIALIZED at all.
> >
> > 4. The process will be able to see file cache data which it has no
> >     access rights to, if it was accessed by the memcg once.
> >
> > 3 & 4 seem too dangerous.
> 
> Yes - these are the risks that I'm hoping we can document, so the 
> cgroups admin can avoid opting-in if not everything running in the 
> cgroup is trusted.
> 

I guess admins/users can't handle that.

> >
> > Isn't it better to have this as per-task rather than per-memcg?
> > And just allow reuse of pages that the task itself has freed?
> >
> 
> I'm worrying that the additional complexity of maintaining a per-task 
> page list would be a problem. It might slow down workloads that 
> alloc/free a lot because of the added code. It'll probably touch the 
> kswapd as well (for reclaiming pages from the per-task free lists under 
> low mem conditions).
> 
> Did you have some implementation ideas which would not have the problems 
> above?
> 

If you just want to reduce the latency of GFP_ZERO, you may be able to
clear pages with a (rate limited) kernel daemon to minimize latency.

But what I'm not sure about is the effect on the cpu cache. Today, a user process
can expect the page to be in the cpu cache when it is faulted in; the page fault
does all the prefetching by clearing pages. This helps performance much
in general. So I think it's only in limited situations that not clearing the page
at fault time is good for total application performance.
You can see a reduction in clear_page() cost by removing GFP_ZERO, but
what's your application's total performance? Is it good enough considering
the many risks?
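
Just to make the daemon idea concrete, a very rough sketch (kzerod,
dequeue_dirty_free_page() and mark_page_prezeroed() are imaginary names
here, not existing kernel interfaces):

static int kzerod(void *unused)
{
	while (!kthread_should_stop()) {
		struct page *page;

		/* drain a backlog of freed-but-dirty pages and pre-zero them */
		while ((page = dequeue_dirty_free_page()) != NULL) {
			clear_highpage(page);
			mark_page_prezeroed(page);
			cond_resched();
		}
		/* rate limit: back off when there is no backlog */
		schedule_timeout_interruptible(HZ / 10);
	}
	return 0;
}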


Thanks,
-Kame











^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] mm: Enable MAP_UNINITIALIZED for archs with mmu
  2012-01-24  3:07     ` KAMEZAWA Hiroyuki
@ 2012-01-25  1:45       ` Arun Sharma
  2012-02-22  0:34         ` Arun Sharma
  0 siblings, 1 reply; 14+ messages in thread
From: Arun Sharma @ 2012-01-25  1:45 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: linux-kernel, linux-mm, Balbir Singh, akpm

On 1/23/12 7:07 PM, KAMEZAWA Hiroyuki wrote:

> You can see a reduction in clear_page() cost by removing GFP_ZERO, but
> what's your application's total performance? Is it good enough considering
> the many risks?

I see 90k calls/sec to clear_page_c when running our application. I 
don't have data on the impact of GFP_ZERO alone, but an earlier 
experiment when we tuned malloc to not call madvise(MADV_DONTNEED) 
aggressively saved us 3% CPU. So I'm expecting this to be a 1-2% win.

But not calling madvise() increases our RSS and increases the risk of OOM.

Agree with your analysis that removing the cache misses at clear_page() 
is not always a win, since it moves the misses to the code where the app 
first touches the data.

  -Arun



^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] mm: Enable MAP_UNINITIALIZED for archs with mmu
  2012-01-25  1:45       ` Arun Sharma
@ 2012-02-22  0:34         ` Arun Sharma
  0 siblings, 0 replies; 14+ messages in thread
From: Arun Sharma @ 2012-02-22  0:34 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: linux-kernel, linux-mm, Balbir Singh, akpm

On 1/24/12 5:45 PM, Arun Sharma wrote:
> On 1/23/12 7:07 PM, KAMEZAWA Hiroyuki wrote:
>
>> You can see a reduction in clear_page() cost by removing GFP_ZERO, but
>> what's your application's total performance? Is it good enough
>> considering
>> the many risks?
>
> I see 90k calls/sec to clear_page_c when running our application. I
> don't have data on the impact of GFP_ZERO alone, but an earlier
> experiment when we tuned malloc to not call madvise(MADV_DONTNEED)
> aggressively saved us 3% CPU. So I'm expecting this to be a 1-2% win.

I saw some additional measurement data today.

We were running at a lower-than-default value for the rate at which our 
malloc implementation releases unused faulted-in memory to the kernel 
via madvise(). This was done just to reduce the impact of clear_page() 
on application performance. But it cost us at least several hundred megs 
(if not more) in additional RSS.

We compared the impact of increasing the madvise rate to the default[1]. 
This used to cause a 3% CPU regression earlier. But with the patch, the 
regression was completely gone and we recovered a bunch of memory in 
terms of reduced RSS.

Hope this additional data is useful. Happy to clean up the patch and 
implement the opt-in flags.

  -Arun

[1] The default rate is 32:1, i.e. no more than 1/32nd of the heap is 
unused and dirty (i.e. contributing to RSS).

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] mm: Enable MAP_UNINITIALIZED for archs with mmu
  2012-01-18 18:51 [PATCH] mm: Enable MAP_UNINITIALIZED for archs with mmu Arun Sharma
  2012-01-19  2:42 ` KAMEZAWA Hiroyuki
@ 2012-02-23  7:45 ` Balbir Singh
  2012-02-23 18:42   ` Arun Sharma
  1 sibling, 1 reply; 14+ messages in thread
From: Balbir Singh @ 2012-02-23  7:45 UTC (permalink / raw)
  To: Arun Sharma; +Cc: linux-kernel, linux-mm, KAMEZAWA Hiroyuki, akpm

On Thu, Jan 19, 2012 at 12:21 AM, Arun Sharma <asharma@fb.com> wrote:
>
> This enables malloc optimizations where we might
> madvise(..,MADV_DONTNEED) a page only to fault it
> back at a different virtual address.
>
> To ensure that we don't leak sensitive data to
> unprivileged processes, we enable this optimization
> only for pages that are reused within a memory
> cgroup.
>

So the assumption is that only apps that have access to each other's
VMAs will run in this cgroup?

> The idea is to make this opt-in both at the mmap()
> level and cgroup level so the default behavior is
> unchanged after the patch.
>

Sorry, I am not convinced we need to do this

1. I know that zeroing out memory is expensive, but building a
potential loophole is not a good idea.
2. How do we decide that tasks in a cgroup should be allowed to reuse
memory uninitialized, and how does the cgroup admin know what she is
getting into?

So I am going to NACK this.

Balbir

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] mm: Enable MAP_UNINITIALIZED for archs with mmu
  2012-02-23  7:45 ` Balbir Singh
@ 2012-02-23 18:42   ` Arun Sharma
  2012-02-24  2:47     ` KAMEZAWA Hiroyuki
  0 siblings, 1 reply; 14+ messages in thread
From: Arun Sharma @ 2012-02-23 18:42 UTC (permalink / raw)
  To: Balbir Singh; +Cc: linux-kernel, linux-mm, KAMEZAWA Hiroyuki, akpm

Hi Balbir,

Thanks for reviewing. Would you change your position if I limit the 
scope of the patch to a cgroup with a single address space?

The moment the cgroup sees more than one address space (either due to 
tasks getting created or being added), this optimization would be turned 
off.

More details below:

On 2/22/12 11:45 PM, Balbir Singh wrote:
>
> So the assumption is that only apps that have access to each other's
> VMAs will run in this cgroup?
>

In a distributed computing environment, a user submits a job to the 
cluster job scheduler. The job might involve multiple related 
executables and might involve multiple address spaces. But they're 
performing one logical task, have a single resource limit enforced by a 
cgroup.

They don't have access to each other's VMAs, but if "accidentally" one 
of them comes across an uninitialized page with data from another task, 
it's not a violation of the security model.

> Sorry, I am not convinced we need to do this
>
> 1. I know that zeroing out memory is expensive, but building a
> potential loophole is not a good idea.
> 2. How do we decide that tasks in a cgroup should be allowed to reuse
> memory uninitialized, and how does the cgroup admin know what she is
> getting into?

I was thinking of addressing this via documentation (as in: don't use 
this if you don't know what you're doing!). But limiting the scope to a 
single address space cgroup seems cleaner to me.

  -Arun

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] mm: Enable MAP_UNINITIALIZED for archs with mmu
  2012-02-23 18:42   ` Arun Sharma
@ 2012-02-24  2:47     ` KAMEZAWA Hiroyuki
  2012-02-24 14:51       ` Balbir Singh
  2012-02-24 19:26       ` Arun Sharma
  0 siblings, 2 replies; 14+ messages in thread
From: KAMEZAWA Hiroyuki @ 2012-02-24  2:47 UTC (permalink / raw)
  To: Arun Sharma; +Cc: Balbir Singh, linux-kernel, linux-mm, akpm

On Thu, 23 Feb 2012 10:42:16 -0800
Arun Sharma <asharma@fb.com> wrote:

> Hi Balbir,
> 
> Thanks for reviewing. Would you change your position if I limit the 
> scope of the patch to a cgroup with a single address space?
> 
> The moment the cgroup sees more than one address space (either due to 
> tasks getting created or being added), this optimization would be turned 
> off.
> 
> More details below:
> 
> On 2/22/12 11:45 PM, Balbir Singh wrote:
> >
> > So the assumption is that only apps that have access to each other's
> > VMAs will run in this cgroup?
> >
> 
> In a distributed computing environment, a user submits a job to the 
> cluster job scheduler. The job might involve multiple related 
> executables and might involve multiple address spaces. But they're 
> performing one logical task, have a single resource limit enforced by a 
> cgroup.
> 
> They don't have access to each other's VMAs, but if "accidentally" one 
> of them comes across an uninitialized page with data from another task, 
> it's not a violation of the security model.
> 
How do you handle shared resources, like the file cache?

Thanks,
-Kame


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] mm: Enable MAP_UNINITIALIZED for archs with mmu
  2012-02-24  2:47     ` KAMEZAWA Hiroyuki
@ 2012-02-24 14:51       ` Balbir Singh
  2012-02-24 19:11         ` Arun Sharma
  2012-02-24 19:26       ` Arun Sharma
  1 sibling, 1 reply; 14+ messages in thread
From: Balbir Singh @ 2012-02-24 14:51 UTC (permalink / raw)
  To: Arun Sharma; +Cc: linux-kernel, linux-mm, akpm, KAMEZAWA Hiroyuki

On Fri, Feb 24, 2012 at 8:17 AM, KAMEZAWA Hiroyuki
<kamezawa.hiroyu@jp.fujitsu.com> wrote:
>> They don't have access to each other's VMAs, but if "accidentally" one
>> of them comes across an uninitialized page with data from another task,
>> it's not a violation of the security model.

Can you expand more on the single address space model?

Balbir

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] mm: Enable MAP_UNINITIALIZED for archs with mmu
  2012-02-24 14:51       ` Balbir Singh
@ 2012-02-24 19:11         ` Arun Sharma
  2012-02-25  4:13           ` Balbir Singh
  0 siblings, 1 reply; 14+ messages in thread
From: Arun Sharma @ 2012-02-24 19:11 UTC (permalink / raw)
  To: Balbir Singh; +Cc: linux-kernel, linux-mm, akpm, KAMEZAWA Hiroyuki

On 2/24/12 6:51 AM, Balbir Singh wrote:
> On Fri, Feb 24, 2012 at 8:17 AM, KAMEZAWA Hiroyuki
> <kamezawa.hiroyu@jp.fujitsu.com>  wrote:
>>> They don't have access to each other's VMAs, but if "accidentally" one
>>> of them comes across an uninitialized page with data from another task,
>>> it's not a violation of the security model.
>
> Can you expand more on the single address space model?

I haven't thought this through yet. But I know that just adding

&& (cgroup_task_count() == 1)

to page_needs_clearing() is not going to do it. We'll have to design a 
new mechanism (cgroup_mm_count_all()?) and make sure that it doesn't 
race with fork() and inadvertently expose pages from the new address 
space to the existing one.
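
Roughly, I'm picturing something along these lines (only a sketch:
cgroup_mm_count_all() is an imaginary helper, and making it safe against
a concurrent fork() is exactly the part that still needs design):

static inline bool page_needs_clearing(struct page *page,
				       struct vm_area_struct *vma)
{
	if (!vma_requests_uninitialized(vma))
		return true;
	/*
	 * Imaginary helper: how many distinct mm_structs has this
	 * cgroup ever contained?  Must not race with fork().
	 */
	if (cgroup_mm_count_all(vma->vm_mm) != 1)
		return true;
	return !page_seen_by_cgroup(page, vma->vm_mm);
}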

A uid based approach such as the one implemented by Davide Libenzi

http://thread.gmane.org/gmane.linux.kernel/548928
http://thread.gmane.org/gmane.linux.kernel/548926

would probably apply the optimization to more use cases - but 
conceptually a bit more complex. If we go with this more relaxed 
approach, we'll have to design a race-free cgroup_uid_count() based 
mechanism.

  -Arun



^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] mm: Enable MAP_UNINITIALIZED for archs with mmu
  2012-02-24  2:47     ` KAMEZAWA Hiroyuki
  2012-02-24 14:51       ` Balbir Singh
@ 2012-02-24 19:26       ` Arun Sharma
  1 sibling, 0 replies; 14+ messages in thread
From: Arun Sharma @ 2012-02-24 19:26 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: Balbir Singh, linux-kernel, linux-mm, akpm

On 2/23/12 6:47 PM, KAMEZAWA Hiroyuki wrote:
>>
>> In a distributed computing environment, a user submits a job to the
>> cluster job scheduler. The job might involve multiple related
>> executables and might involve multiple address spaces. But they're
>> performing one logical task, have a single resource limit enforced by a
>> cgroup.
>>
>> They don't have access to each other's VMAs, but if "accidentally" one
>> of them comes across an uninitialized page with data from another task,
>> it's not a violation of the security model.
>>
> How do you handle shared resources, like the file cache?
>

From a security perspective or a resource limit perspective?

Security: all processes in the cgroup run with the same uid and have the 
same access to the filesystem. Multiple address spaces in a cgroup can 
be thought of as an implementation detail.

Resource limit: We don't have strict enforcement right now. There is a 
desire to include everything (file cache, slab memory) in the job's 
memory resource limit.

  -Arun

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] mm: Enable MAP_UNINITIALIZED for archs with mmu
  2012-02-24 19:11         ` Arun Sharma
@ 2012-02-25  4:13           ` Balbir Singh
  2012-02-27 18:32             ` Arun Sharma
  0 siblings, 1 reply; 14+ messages in thread
From: Balbir Singh @ 2012-02-25  4:13 UTC (permalink / raw)
  To: Arun Sharma; +Cc: linux-kernel, linux-mm, akpm, KAMEZAWA Hiroyuki

On Sat, Feb 25, 2012 at 12:41 AM, Arun Sharma <asharma@fb.com> wrote:
> On 2/24/12 6:51 AM, Balbir Singh wrote:
>>
>> On Fri, Feb 24, 2012 at 8:17 AM, KAMEZAWA Hiroyuki
>> <kamezawa.hiroyu@jp.fujitsu.com>  wrote:
>>>>
>>>> They don't have access to each other's VMAs, but if "accidentally" one
>>>> of them comes across an uninitialized page with data from another task,
>>>> it's not a violation of the security model.
>>
>>
>> Can you expand more on the single address space model?
>
>
> I haven't thought this through yet. But I know that just adding
>
> && (cgroup_task_count() == 1)
>
> to page_needs_clearing() is not going to do it. We'll have to design a new
> mechanism (cgroup_mm_count_all()?) and make sure that it doesn't race with
> fork() and inadvertently expose pages from the new address space to the
> existing one.
>
> A uid based approach such as the one implemented by Davide Libenzi
>
> http://thread.gmane.org/gmane.linux.kernel/548928
> http://thread.gmane.org/gmane.linux.kernel/548926
>
> would probably apply the optimization to more use cases - but conceptually a
> bit more complex. If we go with this more relaxed approach, we'll have to
> design a race-free cgroup_uid_count() based mechanism.

Are you suggesting all processes with the same UID should have access
to each other's memory contents?

Balbir

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] mm: Enable MAP_UNINITIALIZED for archs with mmu
  2012-02-25  4:13           ` Balbir Singh
@ 2012-02-27 18:32             ` Arun Sharma
  0 siblings, 0 replies; 14+ messages in thread
From: Arun Sharma @ 2012-02-27 18:32 UTC (permalink / raw)
  To: Balbir Singh; +Cc: linux-kernel, linux-mm, akpm, KAMEZAWA Hiroyuki

On 2/24/12 8:13 PM, Balbir Singh wrote:
>> A uid based approach such as the one implemented by Davide Libenzi
>>
>> http://thread.gmane.org/gmane.linux.kernel/548928
>> http://thread.gmane.org/gmane.linux.kernel/548926
>>
>> would probably apply the optimization to more use cases - but conceptually a
>> bit more complex. If we go with this more relaxed approach, we'll have to
>> design a race-free cgroup_uid_count() based mechanism.
>
> Are you suggesting all processes with the same UID should have access
> to each other's memory contents?

No - that's a stronger statement than the one I made in my last message. 
I'll however observe that something like this is already possible via 
PTRACE_PEEKDATA.

Like I said: a cgroup with a single mm_struct is conceptually cleanest 
and covers some of our heavy use cases. A cgroup with a single uid would 
cover more of our use cases. It'd be good to know if you and other 
maintainers are willing to accept the former, but not the latter.

I'll note that the malloc implementation which uses these interfaces can 
still decide to zero the memory depending on which variant of *alloc is 
called. But then, we'd have more fine grained control and more 
flexibility in terms of temporal usage hints.
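
For example (allocator-side sketch; arena_alloc() here stands in for
whatever routine the malloc implementation uses to hand out pages from
a MAP_UNINITIALIZED-backed arena):

#include <string.h>

void *my_malloc(size_t n)
{
	/* may hand back reused, non-zeroed pages */
	return arena_alloc(n);
}

void *my_calloc(size_t nmemb, size_t size)
{
	void *p = arena_alloc(nmemb * size);	/* overflow check omitted */

	if (p)
		memset(p, 0, nmemb * size);	/* zeroing guarantee moves into the allocator */
	return p;
}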

  -Arun

^ permalink raw reply	[flat|nested] 14+ messages in thread
