linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [patch 1/3] MAP_NOZERO - implement a new VM_NOZERO/MAP_NOZERO page retirement policy
@ 2007-06-27  2:44 Davide Libenzi
  2007-06-27  3:03 ` Rik van Riel
  0 siblings, 1 reply; 8+ messages in thread
From: Davide Libenzi @ 2007-06-27  2:44 UTC (permalink / raw)
  To: Linux Kernel Mailing List

This is the core implementation of the new VM_NOZERO page retirement
policy (and the associated MAP_NOZERO).
A new field  owner_uid  is added to the  mm_struct, and it is kept set to
the effective UID of the task that owns the  mm_struct.
A new field  owner_uid  is also added to the page struct.
When pages exit (are unmapped from) a  vma, they are marked with the effective
UID of the  mm_struct  that owns them.
When pages exit the allocator, their  owner_uid  is cleared, unless the
new flag __GFP_UIDKEEP is passed to it. So every page fetcher other than
the new alloc_zeroed_page_vma() clears the owner_uid, and thereby blocks all
subsequent uses of the page in its uncleared state.
The new alloc_zeroed_page_vma() calls __alloc_pages() with the __GFP_UIDKEEP
flag, and checks if the VM_NOZERO flag is set in the vma, and if the  owner_uid
field of the page matches the one of the  mm_struct  owning the vma.
If either of these tests fails, the page is cleared in the usual way; otherwise
it is passed back without being cleared.
Page-cache pages are (once unmapped) marked with the uid owning the  inode
of the mapping the pages are associated with.




Signed-off-by: Davide Libenzi <davidel@xmailserver.org>


- Davide



---
 include/asm-alpha/page.h     |    3 ++-
 include/asm-cris/page.h      |    3 ++-
 include/asm-generic/mman.h   |    1 +
 include/asm-h8300/page.h     |    3 ++-
 include/asm-i386/page.h      |    3 ++-
 include/asm-ia64/page.h      |    2 +-
 include/asm-m32r/page.h      |    3 ++-
 include/asm-m68knommu/page.h |    3 ++-
 include/asm-s390/page.h      |    3 ++-
 include/asm-x86_64/page.h    |    3 ++-
 include/linux/gfp.h          |    5 +++++
 include/linux/highmem.h      |    7 +------
 include/linux/mm.h           |   16 ++++++++++++++++
 include/linux/mm_types.h     |    1 +
 include/linux/mman.h         |    3 ++-
 include/linux/rmap.h         |    1 +
 include/linux/sched.h        |    3 +++
 kernel/fork.c                |    1 +
 kernel/sys.c                 |    3 +++
 mm/filemap.c                 |    2 ++
 mm/mmap.c                    |    3 ++-
 mm/page_alloc.c              |   33 +++++++++++++++++++++++++++++++++
 mm/rmap.c                    |   14 ++++++++++++++
 23 files changed, 102 insertions(+), 17 deletions(-)

Index: linux-2.6.mod/include/linux/sched.h
===================================================================
--- linux-2.6.mod.orig/include/linux/sched.h	2007-06-21 13:59:38.000000000 -0700
+++ linux-2.6.mod/include/linux/sched.h	2007-06-21 14:01:28.000000000 -0700
@@ -386,6 +386,9 @@
 	/* aio bits */
 	rwlock_t		ioctx_list_lock;
 	struct kioctx		*ioctx_list;
+
+	/* Effective UID of the owner of this mm_struct */
+	uid_t			owner_uid;
 };
 
 struct sighand_struct {
Index: linux-2.6.mod/mm/rmap.c
===================================================================
--- linux-2.6.mod.orig/mm/rmap.c	2007-06-21 14:27:19.000000000 -0700
+++ linux-2.6.mod/mm/rmap.c	2007-06-25 17:42:59.000000000 -0700
@@ -627,6 +627,16 @@
 }
 #endif
 
+void page_set_owner(struct page *page, uid_t owner_uid)
+{
+	if (unlikely(PageCompound(page))) {
+		unsigned int nrpages = 1U << compound_order(page);
+		for (; nrpages; nrpages--, page++)
+			page_set_owner_uid(page, owner_uid);
+	} else
+		page_set_owner_uid(page, owner_uid);
+}
+
 /**
  * page_remove_rmap - take down pte mapping from a page
  * @page: page to remove mapping from
@@ -649,6 +659,10 @@
 				print_symbol (KERN_EMERG "  vma->vm_file->f_op->mmap = %s\n", (unsigned long)vma->vm_file->f_op->mmap);
 			BUG();
 		}
+		/*
+		 * Record the last owner of the page.
+		 */
+		page_set_owner(page, vma->vm_mm->owner_uid);
 
 		/*
 		 * It would be tidy to reset the PageAnon mapping here,
Index: linux-2.6.mod/kernel/fork.c
===================================================================
--- linux-2.6.mod.orig/kernel/fork.c	2007-06-21 14:32:44.000000000 -0700
+++ linux-2.6.mod/kernel/fork.c	2007-06-24 21:23:52.000000000 -0700
@@ -342,6 +342,7 @@
 	mm->ioctx_list = NULL;
 	mm->free_area_cache = TASK_UNMAPPED_BASE;
 	mm->cached_hole_size = ~0UL;
+	mm->owner_uid = current->euid;
 
 	if (likely(!mm_alloc_pgd(mm))) {
 		mm->def_flags = 0;
Index: linux-2.6.mod/include/linux/highmem.h
===================================================================
--- linux-2.6.mod.orig/include/linux/highmem.h	2007-06-21 14:38:02.000000000 -0700
+++ linux-2.6.mod/include/linux/highmem.h	2007-06-22 12:10:36.000000000 -0700
@@ -76,12 +76,7 @@
 static inline struct page *
 alloc_zeroed_user_highpage(struct vm_area_struct *vma, unsigned long vaddr)
 {
-	struct page *page = alloc_page_vma(GFP_HIGHUSER, vma, vaddr);
-
-	if (page)
-		clear_user_highpage(page, vaddr);
-
-	return page;
+	return alloc_zeroed_page_vma(vma, GFP_HIGHUSER, vaddr);
 }
 #endif
 
Index: linux-2.6.mod/include/linux/mm.h
===================================================================
--- linux-2.6.mod.orig/include/linux/mm.h	2007-06-21 14:43:06.000000000 -0700
+++ linux-2.6.mod/include/linux/mm.h	2007-06-25 19:27:42.000000000 -0700
@@ -169,6 +169,7 @@
 #define VM_MAPPED_COPY	0x01000000	/* T if mapped copy of data (nommu mmap) */
 #define VM_INSERTPAGE	0x02000000	/* The vma has had "vm_insert_page()" done on it */
 #define VM_ALWAYSDUMP	0x04000000	/* Always include in core dumps */
+#define VM_NOZERO	0x08000000	/* Do not zero the page, if possible */
 
 #ifndef VM_STACK_DEFAULT_FLAGS		/* arch can override this */
 #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
@@ -647,6 +648,21 @@
 	return atomic_read(&(page)->_mapcount) >= 0;
 }
 
+static inline void reset_owner_uid(struct page *page)
+{
+	page->owner_uid = -1;
+}
+
+static inline uid_t page_owner_uid(struct page *page)
+{
+	return (uid_t) page->owner_uid;
+}
+
+static inline void page_set_owner_uid(struct page *page, uid_t uid)
+{
+	page->owner_uid = (int) uid;
+}
+
 /*
  * Error return values for the *_nopage functions
  */
Index: linux-2.6.mod/include/asm-alpha/page.h
===================================================================
--- linux-2.6.mod.orig/include/asm-alpha/page.h	2007-06-21 16:37:20.000000000 -0700
+++ linux-2.6.mod/include/asm-alpha/page.h	2007-06-21 16:40:19.000000000 -0700
@@ -17,7 +17,8 @@
 extern void clear_page(void *page);
 #define clear_user_page(page, vaddr, pg)	clear_page(page)
 
-#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vmaddr)
+#define alloc_zeroed_user_highpage(vma, vaddr) \
+	alloc_zeroed_page_vma(vma, GFP_HIGHUSER, vaddr)
 #define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
 
 extern void copy_page(void * _to, void * _from);
Index: linux-2.6.mod/include/asm-cris/page.h
===================================================================
--- linux-2.6.mod.orig/include/asm-cris/page.h	2007-06-21 16:37:20.000000000 -0700
+++ linux-2.6.mod/include/asm-cris/page.h	2007-06-21 16:40:08.000000000 -0700
@@ -20,7 +20,8 @@
 #define clear_user_page(page, vaddr, pg)    clear_page(page)
 #define copy_user_page(to, from, vaddr, pg) copy_page(to, from)
 
-#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define alloc_zeroed_user_highpage(vma, vaddr) \
+	alloc_zeroed_page_vma(vma, GFP_HIGHUSER, vaddr)
 #define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
 
 /*
Index: linux-2.6.mod/include/asm-h8300/page.h
===================================================================
--- linux-2.6.mod.orig/include/asm-h8300/page.h	2007-06-21 16:37:20.000000000 -0700
+++ linux-2.6.mod/include/asm-h8300/page.h	2007-06-21 16:39:57.000000000 -0700
@@ -22,7 +22,8 @@
 #define clear_user_page(page, vaddr, pg)	clear_page(page)
 #define copy_user_page(to, from, vaddr, pg)	copy_page(to, from)
 
-#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define alloc_zeroed_user_highpage(vma, vaddr) \
+	alloc_zeroed_page_vma(vma, GFP_HIGHUSER, vaddr)
 #define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
 
 /*
Index: linux-2.6.mod/include/asm-i386/page.h
===================================================================
--- linux-2.6.mod.orig/include/asm-i386/page.h	2007-06-21 16:37:20.000000000 -0700
+++ linux-2.6.mod/include/asm-i386/page.h	2007-06-21 16:39:47.000000000 -0700
@@ -34,7 +34,8 @@
 #define clear_user_page(page, vaddr, pg)	clear_page(page)
 #define copy_user_page(to, from, vaddr, pg)	copy_page(to, from)
 
-#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define alloc_zeroed_user_highpage(vma, vaddr) \
+	alloc_zeroed_page_vma(vma, GFP_HIGHUSER, vaddr)
 #define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
 
 /*
Index: linux-2.6.mod/include/asm-ia64/page.h
===================================================================
--- linux-2.6.mod.orig/include/asm-ia64/page.h	2007-06-21 16:37:20.000000000 -0700
+++ linux-2.6.mod/include/asm-ia64/page.h	2007-06-21 16:39:27.000000000 -0700
@@ -89,7 +89,7 @@
 
 #define alloc_zeroed_user_highpage(vma, vaddr) \
 ({						\
-	struct page *page = alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr); \
+	struct page *page = alloc_zeroed_page_vma(vma, GFP_HIGHUSER, vaddr); \
 	if (page)				\
  		flush_dcache_page(page);	\
 	page;					\
Index: linux-2.6.mod/include/asm-m32r/page.h
===================================================================
--- linux-2.6.mod.orig/include/asm-m32r/page.h	2007-06-21 16:37:20.000000000 -0700
+++ linux-2.6.mod/include/asm-m32r/page.h	2007-06-21 16:39:00.000000000 -0700
@@ -15,7 +15,8 @@
 #define clear_user_page(page, vaddr, pg)	clear_page(page)
 #define copy_user_page(to, from, vaddr, pg)	copy_page(to, from)
 
-#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define alloc_zeroed_user_highpage(vma, vaddr) \
+	alloc_zeroed_page_vma(vma, GFP_HIGHUSER, vaddr)
 #define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
 
 /*
Index: linux-2.6.mod/include/asm-m68knommu/page.h
===================================================================
--- linux-2.6.mod.orig/include/asm-m68knommu/page.h	2007-06-21 16:37:20.000000000 -0700
+++ linux-2.6.mod/include/asm-m68knommu/page.h	2007-06-21 16:38:49.000000000 -0700
@@ -22,7 +22,8 @@
 #define clear_user_page(page, vaddr, pg)	clear_page(page)
 #define copy_user_page(to, from, vaddr, pg)	copy_page(to, from)
 
-#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define alloc_zeroed_user_highpage(vma, vaddr) \
+	alloc_zeroed_page_vma(vma, GFP_HIGHUSER, vaddr)
 #define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
 
 /*
Index: linux-2.6.mod/include/asm-s390/page.h
===================================================================
--- linux-2.6.mod.orig/include/asm-s390/page.h	2007-06-21 16:37:20.000000000 -0700
+++ linux-2.6.mod/include/asm-s390/page.h	2007-06-21 16:38:35.000000000 -0700
@@ -64,7 +64,8 @@
 #define clear_user_page(page, vaddr, pg)	clear_page(page)
 #define copy_user_page(to, from, vaddr, pg)	copy_page(to, from)
 
-#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define alloc_zeroed_user_highpage(vma, vaddr) \
+	alloc_zeroed_page_vma(vma, GFP_HIGHUSER, vaddr)
 #define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
 
 /*
Index: linux-2.6.mod/include/asm-x86_64/page.h
===================================================================
--- linux-2.6.mod.orig/include/asm-x86_64/page.h	2007-06-21 16:37:20.000000000 -0700
+++ linux-2.6.mod/include/asm-x86_64/page.h	2007-06-21 16:38:13.000000000 -0700
@@ -48,7 +48,8 @@
 #define clear_user_page(page, vaddr, pg)	clear_page(page)
 #define copy_user_page(to, from, vaddr, pg)	copy_page(to, from)
 
-#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define alloc_zeroed_user_highpage(vma, vaddr) \
+	alloc_zeroed_page_vma(vma, GFP_HIGHUSER, vaddr)
 #define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
 /*
  * These are used to make use of C type-checking..
Index: linux-2.6.mod/include/asm-generic/mman.h
===================================================================
--- linux-2.6.mod.orig/include/asm-generic/mman.h	2007-06-21 16:43:33.000000000 -0700
+++ linux-2.6.mod/include/asm-generic/mman.h	2007-06-21 18:14:55.000000000 -0700
@@ -13,6 +13,7 @@
 #define PROT_NONE	0x0		/* page can not be accessed */
 #define PROT_GROWSDOWN	0x01000000	/* mprotect flag: extend change to start of growsdown vma */
 #define PROT_GROWSUP	0x02000000	/* mprotect flag: extend change to end of growsup vma */
+#define MAP_NOZERO	0x04000000	/* Do not zero the pages, if possible */
 
 #define MAP_SHARED	0x01		/* Share changes */
 #define MAP_PRIVATE	0x02		/* Changes are private */
Index: linux-2.6.mod/include/linux/mman.h
===================================================================
--- linux-2.6.mod.orig/include/linux/mman.h	2007-06-21 16:47:03.000000000 -0700
+++ linux-2.6.mod/include/linux/mman.h	2007-06-21 16:47:45.000000000 -0700
@@ -63,7 +63,8 @@
 	return _calc_vm_trans(flags, MAP_GROWSDOWN,  VM_GROWSDOWN ) |
 	       _calc_vm_trans(flags, MAP_DENYWRITE,  VM_DENYWRITE ) |
 	       _calc_vm_trans(flags, MAP_EXECUTABLE, VM_EXECUTABLE) |
-	       _calc_vm_trans(flags, MAP_LOCKED,     VM_LOCKED    );
+	       _calc_vm_trans(flags, MAP_LOCKED,     VM_LOCKED    ) |
+	       _calc_vm_trans(flags, MAP_NOZERO,     VM_NOZERO    );
 }
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MMAN_H */
Index: linux-2.6.mod/mm/mmap.c
===================================================================
--- linux-2.6.mod.orig/mm/mmap.c	2007-06-21 16:48:31.000000000 -0700
+++ linux-2.6.mod/mm/mmap.c	2007-06-25 19:14:49.000000000 -0700
@@ -915,7 +915,8 @@
 
 	if (!len)
 		return -EINVAL;
-
+	if (file && (flags & MAP_NOZERO))
+		return -EINVAL;
 	error = arch_mmap_check(addr, len, flags);
 	if (error)
 		return error;
Index: linux-2.6.mod/mm/page_alloc.c
===================================================================
--- linux-2.6.mod.orig/mm/page_alloc.c	2007-06-22 10:56:07.000000000 -0700
+++ linux-2.6.mod/mm/page_alloc.c	2007-06-25 17:40:23.000000000 -0700
@@ -1370,11 +1370,44 @@
 		show_mem();
 	}
 got_pg:
+	if (page && !(gfp_mask & __GFP_UIDKEEP)) {
+		unsigned int pgcount = 1U << order;
+		struct page *npage = page;
+
+		/*
+		 * It'd be possible to remove the loop below by resetting
+		 * page->owner_uid when the page is handed back to the buddy
+		 * allocator. Here we would simply reset page->owner_uid only.
+		 * This reduces the efficiency of page reuse though, since pages
+		 * used by a user may be reset too early.
+		 */
+		for (; pgcount; pgcount--, npage++)
+			reset_owner_uid(npage);
+	}
 	return page;
 }
 
 EXPORT_SYMBOL(__alloc_pages);
 
+static inline int page_need_clear(struct vm_area_struct *vma, struct page *page)
+{
+	return (vma->vm_flags & VM_NOZERO) == 0 ||
+		page_owner_uid(page) != vma->vm_mm->owner_uid;
+}
+
+struct page *alloc_zeroed_page_vma(struct vm_area_struct *vma, gfp_t gfp_mask,
+				   unsigned long vaddr)
+{
+	struct page *page = alloc_page_vma(gfp_mask | __GFP_UIDKEEP, vma, vaddr);
+
+	if (page) {
+		if (page_need_clear(vma, page))
+			clear_user_highpage(page, vaddr);
+		reset_owner_uid(page);
+	}
+	return page;
+}
+
 /*
  * Common helper functions.
  */
Index: linux-2.6.mod/include/linux/gfp.h
===================================================================
--- linux-2.6.mod.orig/include/linux/gfp.h	2007-06-21 16:32:34.000000000 -0700
+++ linux-2.6.mod/include/linux/gfp.h	2007-06-22 12:15:14.000000000 -0700
@@ -45,6 +45,7 @@
 #define __GFP_NOMEMALLOC ((__force gfp_t)0x10000u) /* Don't use emergency reserves */
 #define __GFP_HARDWALL   ((__force gfp_t)0x20000u) /* Enforce hardwall cpuset memory allocs */
 #define __GFP_THISNODE	((__force gfp_t)0x40000u)/* No fallback, no policies */
+#define __GFP_UIDKEEP	((__force gfp_t)0x80000u)	/* Do not clear owner UID */
 
 #define __GFP_BITS_SHIFT 20	/* Room for 20 __GFP_FOO bits */
 #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
@@ -124,6 +125,10 @@
 extern struct page *
 FASTCALL(__alloc_pages(gfp_t, unsigned int, struct zonelist *));
 
+extern struct page *alloc_zeroed_page_vma(struct vm_area_struct *vma,
+					  gfp_t gfp_mask,
+					  unsigned long vaddr);
+
 static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
 						unsigned int order)
 {
Index: linux-2.6.mod/mm/filemap.c
===================================================================
--- linux-2.6.mod.orig/mm/filemap.c	2007-06-24 21:03:07.000000000 -0700
+++ linux-2.6.mod/mm/filemap.c	2007-06-24 22:12:40.000000000 -0700
@@ -20,6 +20,7 @@
 #include <linux/mm.h>
 #include <linux/swap.h>
 #include <linux/mman.h>
+#include <linux/rmap.h>
 #include <linux/pagemap.h>
 #include <linux/file.h>
 #include <linux/uio.h>
@@ -118,6 +119,7 @@
 
 	radix_tree_delete(&mapping->page_tree, page->index);
 	page->mapping = NULL;
+	page_set_owner(page, mapping->host->i_uid);
 	mapping->nrpages--;
 	__dec_zone_page_state(page, NR_FILE_PAGES);
 }
Index: linux-2.6.mod/include/linux/rmap.h
===================================================================
--- linux-2.6.mod.orig/include/linux/rmap.h	2007-06-24 21:28:50.000000000 -0700
+++ linux-2.6.mod/include/linux/rmap.h	2007-06-24 21:29:13.000000000 -0700
@@ -72,6 +72,7 @@
 void page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
 void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
 void page_add_file_rmap(struct page *);
+void page_set_owner(struct page *page, uid_t owner_uid);
 void page_remove_rmap(struct page *, struct vm_area_struct *);
 
 #ifdef CONFIG_DEBUG_VM
Index: linux-2.6.mod/include/linux/mm_types.h
===================================================================
--- linux-2.6.mod.orig/include/linux/mm_types.h	2007-06-21 14:02:06.000000000 -0700
+++ linux-2.6.mod/include/linux/mm_types.h	2007-06-25 19:11:22.000000000 -0700
@@ -64,6 +64,7 @@
 	struct list_head lru;		/* Pageout list, eg. active_list
 					 * protected by zone->lru_lock !
 					 */
+	int owner_uid;			/* Last owner of the page */
 	/*
 	 * On machines where all RAM is mapped into kernel address space,
 	 * we can simply calculate the virtual address. On machines with
Index: linux-2.6.mod/kernel/sys.c
===================================================================
--- linux-2.6.mod.orig/kernel/sys.c	2007-06-26 17:40:19.000000000 -0700
+++ linux-2.6.mod/kernel/sys.c	2007-06-26 17:46:08.000000000 -0700
@@ -1149,6 +1149,7 @@
 
 	if (new_euid != old_euid) {
 		current->mm->dumpable = suid_dumpable;
+		current->mm->owner_uid = new_euid;
 		smp_wmb();
 	}
 	current->fsuid = current->euid = new_euid;
@@ -1199,6 +1200,7 @@
 
 	if (old_euid != uid) {
 		current->mm->dumpable = suid_dumpable;
+		current->mm->owner_uid = uid;
 		smp_wmb();
 	}
 	current->fsuid = current->euid = uid;
@@ -1244,6 +1246,7 @@
 	if (euid != (uid_t) -1) {
 		if (euid != current->euid) {
 			current->mm->dumpable = suid_dumpable;
+			current->mm->owner_uid = euid;
 			smp_wmb();
 		}
 		current->euid = euid;


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [patch 1/3] MAP_NOZERO - implement a new VM_NOZERO/MAP_NOZERO page retirement policy
  2007-06-27  2:44 [patch 1/3] MAP_NOZERO - implement a new VM_NOZERO/MAP_NOZERO page retirement policy Davide Libenzi
@ 2007-06-27  3:03 ` Rik van Riel
  2007-06-27  3:28   ` Davide Libenzi
  0 siblings, 1 reply; 8+ messages in thread
From: Rik van Riel @ 2007-06-27  3:03 UTC (permalink / raw)
  To: Davide Libenzi; +Cc: Linux Kernel Mailing List

Davide Libenzi wrote:
> This is the core implementation of the new VM_NOZERO page retirement
> policy (and the associated MAP_NOZERO).
> A new field  owner_uid  is added the the  mm_struct, and it is kept set to
> the effective UID of the task that own the  mm_struct.
> A new field  owner_uid  is also added to the page struct.

You will also need to take the task's SELinux security
context into account.

SUID programs should not be able to use this feature,
either.

> When pages exit (unmapped from) a  vma, they are marked with the effective
> UID of the  mm_struct  that owns it.


> --- linux-2.6.mod.orig/include/linux/mm_types.h	2007-06-21 14:02:06.000000000 -0700
> +++ linux-2.6.mod/include/linux/mm_types.h	2007-06-25 19:11:22.000000000 -0700
> @@ -64,6 +64,7 @@
>  	struct list_head lru;		/* Pageout list, eg. active_list
>  					 * protected by zone->lru_lock !
>  					 */
> +	int owner_uid;			/* Last owner of the page */
>  	/*
>  	 * On machines where all RAM is mapped into kernel address space,
>  	 * we can simply calculate the virtual address. On machines with

Since this is only set when the page is freed, could
the owner_uid and security context be put inside a
union with some fields that are not otherwise used
for free pages?

-- 
Politics is the struggle between those who want to make their country
the best in the world, and those who believe it already is.  Each group
calls the other unpatriotic.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [patch 1/3] MAP_NOZERO - implement a new VM_NOZERO/MAP_NOZERO page retirement policy
  2007-06-27  3:03 ` Rik van Riel
@ 2007-06-27  3:28   ` Davide Libenzi
  2007-06-27  4:18     ` Rik van Riel
  0 siblings, 1 reply; 8+ messages in thread
From: Davide Libenzi @ 2007-06-27  3:28 UTC (permalink / raw)
  To: Rik van Riel; +Cc: Linux Kernel Mailing List

On Tue, 26 Jun 2007, Rik van Riel wrote:

> SUID programs should not be able to use this feature,
> either.

Why? A SUID program runs under the UID of the owner, and there should be no 
problem in it seeing the owner's data.
But the patch post was more a quest for possible scenarios where the use 
of MAP_NOZERO can result in lower security WRT the same program (under the 
same security restrictions) not using such a feature.
If you have something specific in mind, please go ahead and shoot.



> > When pages exit (unmapped from) a  vma, they are marked with the effective
> > UID of the  mm_struct  that owns it.
> 
> 
> > --- linux-2.6.mod.orig/include/linux/mm_types.h	2007-06-21
> > 14:02:06.000000000 -0700
> > +++ linux-2.6.mod/include/linux/mm_types.h	2007-06-25 19:11:22.000000000
> > -0700
> > @@ -64,6 +64,7 @@
> >  	struct list_head lru;		/* Pageout list, eg. active_list
> >  					 * protected by zone->lru_lock !
> >  					 */
> > +	int owner_uid;			/* Last owner of the page */
> >  	/*
> >  	 * On machines where all RAM is mapped into kernel address space,
> >  	 * we can simply calculate the virtual address. On machines with
> 
> Since this is only set when the page is freed, could
> the owner_uid and security context be put inside a
> union with some fields that are not otherwise used
> for free pages?

I tried to look, and the attempt to reuse _mapcount failed miserably :)
The last time we have the owner info (vma->mm) available, is before 
processing of the other fields ends. OTOH I'm no VM guru either, so I may 
be wrong. It can share ->virtual (when enabled).




- Davide



^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [patch 1/3] MAP_NOZERO - implement a new VM_NOZERO/MAP_NOZERO page retirement policy
  2007-06-27  3:28   ` Davide Libenzi
@ 2007-06-27  4:18     ` Rik van Riel
  2007-06-27  4:32       ` Davide Libenzi
  0 siblings, 1 reply; 8+ messages in thread
From: Rik van Riel @ 2007-06-27  4:18 UTC (permalink / raw)
  To: Davide Libenzi; +Cc: Linux Kernel Mailing List

Davide Libenzi wrote:
> On Tue, 26 Jun 2007, Rik van Riel wrote:
> 
>> SUID programs should not be able to use this feature,
>> either.
> 
> Why? A SUID programs runs under the UID of the owner, and should be no 
> problems in it seeing the owners data.

Because an SUID program can change its UID back.

At least, one that was SUID root.  OTOH, any
program running as root can change UID, so we
should probably not allow root to get nonzeroed
pages.

> But the patch post was more a quest for possible scenarios where the use 
> of MAP_NOZERO can result in lower security WRT the same program (under the 
> same security restrictions) not using such feature.
> If you have something specific in mind, please go ahead and shoot.

Besides the non-enforcing of SELinux security
labels (and maybe namespaces?), I cannot think
of anything.

>>> When pages exit (unmapped from) a  vma, they are marked with the effective
>>> UID of the  mm_struct  that owns it.
>>
>>> --- linux-2.6.mod.orig/include/linux/mm_types.h	2007-06-21
>>> 14:02:06.000000000 -0700
>>> +++ linux-2.6.mod/include/linux/mm_types.h	2007-06-25 19:11:22.000000000
>>> -0700
>>> @@ -64,6 +64,7 @@
>>>  	struct list_head lru;		/* Pageout list, eg. active_list
>>>  					 * protected by zone->lru_lock !
>>>  					 */
>>> +	int owner_uid;			/* Last owner of the page */
>>>  	/*
>>>  	 * On machines where all RAM is mapped into kernel address space,
>>>  	 * we can simply calculate the virtual address. On machines with
>> Since this is only set when the page is freed, could
>> the owner_uid and security context be put inside a
>> union with some fields that are not otherwise used
>> for free pages?
> 
> I tried to look, and the attempt to reuse _mapcount failed miserably :)
> The last time we have the owner info (vma->mm) available, is before 
> processing of the other fields ends. OTOH I'm not VM guru either, so I may 
> be wrong. It can share ->virtual (when enabled).

I think the process that actually calls the page freeing
functions is always the process that owned the page, so
going for current->mm should work.

Getting the UID wrong for file pages caught in a truncate
is fine, since the process obviously already had access
to the data in that page.

-- 
Politics is the struggle between those who want to make their country
the best in the world, and those who believe it already is.  Each group
calls the other unpatriotic.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [patch 1/3] MAP_NOZERO - implement a new VM_NOZERO/MAP_NOZERO page retirement policy
  2007-06-27  4:18     ` Rik van Riel
@ 2007-06-27  4:32       ` Davide Libenzi
  2007-06-28  6:19         ` Andy Isaacson
  0 siblings, 1 reply; 8+ messages in thread
From: Davide Libenzi @ 2007-06-27  4:32 UTC (permalink / raw)
  To: Rik van Riel; +Cc: Linux Kernel Mailing List

On Wed, 27 Jun 2007, Rik van Riel wrote:

> Davide Libenzi wrote:
> > On Tue, 26 Jun 2007, Rik van Riel wrote:
> > 
> > > SUID programs should not be able to use this feature,
> > > either.
> > 
> > Why? A SUID programs runs under the UID of the owner, and should be no
> > problems in it seeing the owners data.
> 
> Because an SUID program can change its UID back.
> 
> At least, one that was SUID root.  OTOH, any
> program running as root can change UID, so we
> should probably not allow root to get nonzeroed
> pages.

Well, root can in general access the whole system in any case. At the 
moment, root cannot access other UIDs' pages, only its own. And this 
differs from standard security policies where root can access everything.
Pages used internally by the kernel cannot be reused by anyone.



> > I tried to look, and the attempt to reuse _mapcount failed miserably :)
> > The last time we have the owner info (vma->mm) available, is before
> > processing of the other fields ends. OTOH I'm not VM guru either, so I may
> > be wrong. It can share ->virtual (when enabled).
> 
> I think the process that actually calls the page freeing
> functions is always the process that owned the page, so
> going for current->mm should work.

I'll try to see if that works out...



- Davide



^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [patch 1/3] MAP_NOZERO - implement a new VM_NOZERO/MAP_NOZERO page retirement policy
  2007-06-27  4:32       ` Davide Libenzi
@ 2007-06-28  6:19         ` Andy Isaacson
  2007-06-28 14:27           ` Rik van Riel
  2007-06-28 18:24           ` Davide Libenzi
  0 siblings, 2 replies; 8+ messages in thread
From: Andy Isaacson @ 2007-06-28  6:19 UTC (permalink / raw)
  To: Davide Libenzi; +Cc: Rik van Riel, Linux Kernel Mailing List

On Tue, Jun 26, 2007 at 09:32:44PM -0700, Davide Libenzi wrote:
> > Because an SUID program can change its UID back.
> > 
> > At least, one that was SUID root.  OTOH, any
> > program running as root can change UID, so we
> > should probably not allow root to get nonzeroed
> > pages.
> 
> Well, root can in general access the whole system in any case. At the 
> moment, root cannot access othe UIDs pages. Only their own. And this 
> differs from standard security policies where root can access everything.
> Pages used internally by the kernel, cannot be reused by anyone.

But MAP_NOZERO adds a new possible information leak from root out to the
non-root user.  If root does

    setuid(newuid);
    exec(...);
    exit(1);

and there are MAP_NOZERO pages which contain sensitive information,
a process running as newuid would be able to race the exec with
PTRACE_ATTACH and extract the sensitive information.  Without MAP_NOZERO
the information leak is limited to information which was in the setuid
program's address space (and presumably, setuid programs are written to
be careful about such things).

That said, I think I like the idea of MAP_NOZERO.  Could it be
generalized to some kind of "free pool" rather than keyed off of uid?

-andy

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [patch 1/3] MAP_NOZERO - implement a new VM_NOZERO/MAP_NOZERO page retirement policy
  2007-06-28  6:19         ` Andy Isaacson
@ 2007-06-28 14:27           ` Rik van Riel
  2007-06-28 18:24           ` Davide Libenzi
  1 sibling, 0 replies; 8+ messages in thread
From: Rik van Riel @ 2007-06-28 14:27 UTC (permalink / raw)
  To: Andy Isaacson; +Cc: Davide Libenzi, Linux Kernel Mailing List

Andy Isaacson wrote:

> That said, I think I like the idea of MAP_NOZERO.  Could it be
> generalized to some kind of "free pool" rather than keyed off of uid?

Good idea.  At exec() time the new mm can inherit the "free pool"
pointer that the parent process points to, when nothing changes.

Certain events can cause a process to need another "free pool",
for example changing the UID, changing security context or
changing CPU/NUMA node binding.

-- 
Politics is the struggle between those who want to make their country
the best in the world, and those who believe it already is.  Each group
calls the other unpatriotic.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [patch 1/3] MAP_NOZERO - implement a new VM_NOZERO/MAP_NOZERO page retirement policy
  2007-06-28  6:19         ` Andy Isaacson
  2007-06-28 14:27           ` Rik van Riel
@ 2007-06-28 18:24           ` Davide Libenzi
  1 sibling, 0 replies; 8+ messages in thread
From: Davide Libenzi @ 2007-06-28 18:24 UTC (permalink / raw)
  To: Andy Isaacson; +Cc: Rik van Riel, Linux Kernel Mailing List

On Wed, 27 Jun 2007, Andy Isaacson wrote:

> On Tue, Jun 26, 2007 at 09:32:44PM -0700, Davide Libenzi wrote:
> > > Because an SUID program can change its UID back.
> > > 
> > > At least, one that was SUID root.  OTOH, any
> > > program running as root can change UID, so we
> > > should probably not allow root to get nonzeroed
> > > pages.
> > 
> > Well, root can in general access the whole system in any case. At the 
> > moment, root cannot access othe UIDs pages. Only their own. And this 
> > differs from standard security policies where root can access everything.
> > Pages used internally by the kernel, cannot be reused by anyone.
> 
> But MAP_NOZERO adds a new possible information leak from root out to the
> non-root user.  If root does
> 
>     setuid(newuid);
>     exec(...);
>     exit(1);
> 
> and there are MAP_NOZERO pages which contain sensitive information,
> a process running as newuid would be able to race the exec with
> PTRACE_ATTACH and extract the sensitive information.  Without MAP_NOZERO
> the information leak is limited to information which was in the setuid
> program's address space (and presumably, setuid programs are written to
> be careful about such things).

That probably deserves a patch of its own (see below), apart from MAP_NOZERO. 
Basically, a new "exec uid" is added, and this exec-uid is set only after 
the binary has completed the detach from the old context. Ptrace checks that 
uid as well, as part of the may_attach() function.



> That said, I think I like the idea of MAP_NOZERO.  Could it be
> generalized to some kind of "free pool" rather than keyed off of uid?

Problem is, you end up with yet another pool to be looked up, flushed 
under memory pressure, etc.



- Davide



---
 fs/exec.c             |    2 ++
 include/linux/sched.h |    2 +-
 kernel/ptrace.c       |    1 +
 3 files changed, 4 insertions(+), 1 deletion(-)

Index: linux-2.6.mod/fs/exec.c
===================================================================
--- linux-2.6.mod.orig/fs/exec.c	2007-06-28 11:15:53.000000000 -0700
+++ linux-2.6.mod/fs/exec.c	2007-06-28 11:18:47.000000000 -0700
@@ -905,6 +905,8 @@
 	flush_signal_handlers(current, 0);
 	flush_old_files(current->files);
 
+	current->xuid = current->uid;
+
 	return 0;
 
 mmap_failed:
Index: linux-2.6.mod/include/linux/sched.h
===================================================================
--- linux-2.6.mod.orig/include/linux/sched.h	2007-06-28 11:16:15.000000000 -0700
+++ linux-2.6.mod/include/linux/sched.h	2007-06-28 11:16:49.000000000 -0700
@@ -917,7 +917,7 @@
 	struct list_head cpu_timers[3];
 
 /* process credentials */
-	uid_t uid,euid,suid,fsuid;
+	uid_t uid,euid,suid,fsuid,xuid;
 	gid_t gid,egid,sgid,fsgid;
 	struct group_info *group_info;
 	kernel_cap_t   cap_effective, cap_inheritable, cap_permitted;
Index: linux-2.6.mod/kernel/ptrace.c
===================================================================
--- linux-2.6.mod.orig/kernel/ptrace.c	2007-06-28 11:09:27.000000000 -0700
+++ linux-2.6.mod/kernel/ptrace.c	2007-06-28 11:18:35.000000000 -0700
@@ -135,6 +135,7 @@
 		return 0;
 	if (((current->uid != task->euid) ||
 	     (current->uid != task->suid) ||
+	     (current->xuid != task->xuid) ||
 	     (current->uid != task->uid) ||
 	     (current->gid != task->egid) ||
 	     (current->gid != task->sgid) ||

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2007-06-28 18:24 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-06-27  2:44 [patch 1/3] MAP_NOZERO - implement a new VM_NOZERO/MAP_NOZERO page retirement policy Davide Libenzi
2007-06-27  3:03 ` Rik van Riel
2007-06-27  3:28   ` Davide Libenzi
2007-06-27  4:18     ` Rik van Riel
2007-06-27  4:32       ` Davide Libenzi
2007-06-28  6:19         ` Andy Isaacson
2007-06-28 14:27           ` Rik van Riel
2007-06-28 18:24           ` Davide Libenzi

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).