linux-kernel.vger.kernel.org archive mirror
* [PATCH 1/5] NOMMU: MM cleanups
@ 2004-12-09 15:08 dhowells
  2004-12-09 15:08 ` [PATCH 4/5] NOMMU: Make POSIX shmem work on ramfs-backed files dhowells
                   ` (3 more replies)
  0 siblings, 4 replies; 11+ messages in thread
From: dhowells @ 2004-12-09 15:08 UTC (permalink / raw)
  To: akpm, davidm, gerg, wli; +Cc: linux-kernel, uclinux-dev

Let me try these again, this time with the To: line correct...

The attached patch does some cleaning up of the MM code preparatory to
overhauling the high-order page handling:

 (1) Trailing spaces have been cleaned up on lines in page_alloc.c and
     bootmem.c.

 (2) bootmem.c now has a separate path to release pages to the main allocator
     that bypasses many of the checks performed on struct pages.

 (3) __pagevec_free() has moved to swap.c with all the other pagevec
     functions.

 (4) put_page() has moved to page_alloc.c with all the other related
     functions. This could be relegated to a separate file, but since there
     are many other conditionals in page_alloc.c, what's the point?
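
As a quick orientation for (2): in free_all_bootmem_core(), known-good boot
pages are no longer pushed through the ordinary free path with a manual
refcount fix-up; condensed from the bootmem.c and page_alloc.c hunks below:

	/* before */
	set_page_refs(page, order);
	__free_pages(page, order);

	/* after: the helper sets the refcounts itself and hands high-order
	 * blocks straight to free_pages_bulk(), skipping the per-page
	 * free_pages_check() pass done by __free_pages_ok() */
	__free_pages_bootmem(page, order);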

Signed-Off-By: dhowells@redhat.com
---
diffstat mmcleanup-2610rc2mm3.diff
 bootmem.c    |   35 ++++++++-------
 internal.h   |    3 -
 page_alloc.c |  136 +++++++++++++++++++++++++++++++++++++----------------------
 swap.c       |   29 +++---------
 4 files changed, 116 insertions(+), 87 deletions(-)

diff -uNrp /warthog/kernels/linux-2.6.10-rc2-mm3/mm/bootmem.c linux-2.6.10-rc2-mm3-mmcleanup/mm/bootmem.c
--- /warthog/kernels/linux-2.6.10-rc2-mm3/mm/bootmem.c	2004-11-22 10:54:17.000000000 +0000
+++ linux-2.6.10-rc2-mm3-mmcleanup/mm/bootmem.c	2004-11-23 15:32:12.964968405 +0000
@@ -89,7 +89,7 @@ static void __init reserve_bootmem_core(
 	 * fully reserved.
 	 */
 	unsigned long sidx = (addr - bdata->node_boot_start)/PAGE_SIZE;
-	unsigned long eidx = (addr + size - bdata->node_boot_start + 
+	unsigned long eidx = (addr + size - bdata->node_boot_start +
 							PAGE_SIZE-1)/PAGE_SIZE;
 	unsigned long end = (addr + size + PAGE_SIZE-1)/PAGE_SIZE;
 
@@ -174,7 +174,7 @@ __alloc_bootmem_core(struct bootmem_data
 	 * We try to allocate bootmem pages above 'goal'
 	 * first, then we try to allocate lower pages.
 	 */
-	if (goal && (goal >= bdata->node_boot_start) && 
+	if (goal && (goal >= bdata->node_boot_start) &&
 	    ((goal >> PAGE_SHIFT) < bdata->node_low_pfn)) {
 		preferred = goal - bdata->node_boot_start;
 
@@ -264,7 +264,7 @@ static unsigned long __init free_all_boo
 	bootmem_data_t *bdata = pgdat->bdata;
 	unsigned long i, count, total = 0;
 	unsigned long idx;
-	unsigned long *map; 
+	unsigned long *map;
 	int gofast = 0;
 
 	BUG_ON(!bdata->node_bootmem_map);
@@ -274,55 +274,59 @@ static unsigned long __init free_all_boo
 	page = virt_to_page(phys_to_virt(bdata->node_boot_start));
 	idx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT);
 	map = bdata->node_bootmem_map;
+
 	/* Check physaddr is O(LOG2(BITS_PER_LONG)) page aligned */
 	if (bdata->node_boot_start == 0 ||
 	    ffs(bdata->node_boot_start) - PAGE_SHIFT > ffs(BITS_PER_LONG))
 		gofast = 1;
+
 	for (i = 0; i < idx; ) {
 		unsigned long v = ~map[i / BITS_PER_LONG];
+
 		if (gofast && v == ~0UL) {
 			int j, order;
 
 			count += BITS_PER_LONG;
 			__ClearPageReserved(page);
 			order = ffs(BITS_PER_LONG) - 1;
-			set_page_refs(page, order);
 			for (j = 1; j < BITS_PER_LONG; j++) {
 				if (j + 16 < BITS_PER_LONG)
 					prefetchw(page + j + 16);
 				__ClearPageReserved(page + j);
 			}
-			__free_pages(page, order);
+			__free_pages_bootmem(page, order);
 			i += BITS_PER_LONG;
 			page += BITS_PER_LONG;
+
 		} else if (v) {
 			unsigned long m;
 			for (m = 1; m && i < idx; m<<=1, page++, i++) {
 				if (v & m) {
 					count++;
 					__ClearPageReserved(page);
-					set_page_refs(page, 0);
-					__free_page(page);
+					__free_pages_bootmem(page, 0);
 				}
 			}
+
 		} else {
-			i+=BITS_PER_LONG;
+			i += BITS_PER_LONG;
 			page += BITS_PER_LONG;
 		}
 	}
 	total += count;
 
 	/*
-	 * Now free the allocator bitmap itself, it's not
-	 * needed anymore:
+	 * Now free the allocator bitmap itself, it's not needed anymore:
 	 */
 	page = virt_to_page(bdata->node_bootmem_map);
-	count = 0;
-	for (i = 0; i < ((bdata->node_low_pfn-(bdata->node_boot_start >> PAGE_SHIFT))/8 + PAGE_SIZE-1)/PAGE_SIZE; i++,page++) {
-		count++;
+
+	count = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT);
+	count = ((count / 8) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+	for (i = count; i > 0; i--) {
 		__ClearPageReserved(page);
-		set_page_count(page, 1);
-		__free_page(page);
+		__free_pages_bootmem(page, 0);
+		page++;
 	}
 	total += count;
 	bdata->node_bootmem_map = NULL;
@@ -402,4 +406,3 @@ void * __init __alloc_bootmem_node (pg_d
 
 	return __alloc_bootmem(size, align, goal);
 }
-
diff -uNrp /warthog/kernels/linux-2.6.10-rc2-mm3/mm/internal.h linux-2.6.10-rc2-mm3-mmcleanup/mm/internal.h
--- /warthog/kernels/linux-2.6.10-rc2-mm3/mm/internal.h	2004-11-22 10:54:18.000000000 +0000
+++ linux-2.6.10-rc2-mm3-mmcleanup/mm/internal.h	2004-11-23 15:31:55.601409553 +0000
@@ -10,4 +10,5 @@
  */
 
 /* page_alloc.c */
-extern void set_page_refs(struct page *page, int order);
+extern void fastcall free_hot_cold_page(struct page *page, int cold);
+extern fastcall void __init __free_pages_bootmem(struct page *page, unsigned int order);
diff -uNrp /warthog/kernels/linux-2.6.10-rc2-mm3/mm/page_alloc.c linux-2.6.10-rc2-mm3-mmcleanup/mm/page_alloc.c
--- /warthog/kernels/linux-2.6.10-rc2-mm3/mm/page_alloc.c	2004-11-22 10:54:18.000000000 +0000
+++ linux-2.6.10-rc2-mm3-mmcleanup/mm/page_alloc.c	2004-11-23 16:13:04.184628888 +0000
@@ -103,6 +103,23 @@ static void bad_page(const char *functio
 	tainted |= TAINT_BAD_PAGE;
 }
 
+void set_page_refs(struct page *page, int order)
+{
+#ifdef CONFIG_MMU
+	set_page_count(page, 1);
+#else
+	int i;
+
+	/*
+	 * We need to reference all the pages for this order, otherwise if
+	 * anyone accesses one of the pages with (get/put) it will be freed.
+	 * - eg: access_process_vm()
+	 */
+	for (i = 0; i < (1 << order); i++)
+		set_page_count(page + i, 1);
+#endif /* CONFIG_MMU */
+}
+
 #ifndef CONFIG_HUGETLB_PAGE
 #define prep_compound_page(page, order) do { } while (0)
 #define destroy_compound_page(page, order) do { } while (0)
@@ -167,11 +184,13 @@ static void destroy_compound_page(struct
  * zone->lock is already acquired when we use these.
  * So, we don't need atomic page->flags operations here.
  */
-static inline unsigned long page_order(struct page *page) {
+static inline unsigned long page_order(struct page *page)
+{
 	return page->private;
 }
 
-static inline void set_page_order(struct page *page, int order) {
+static inline void set_page_order(struct page *page, int order)
+{
 	page->private = order;
 	__SetPagePrivate(page);
 }
@@ -217,10 +236,10 @@ static inline int page_is_buddy(struct p
  * free pages of length of (1 << order) and marked with PG_Private.Page's
  * order is recorded in page->private field.
  * So when we are allocating or freeing one, we can derive the state of the
- * other.  That is, if we allocate a small block, and both were   
- * free, the remainder of the region must be split into blocks.   
+ * other.  That is, if we allocate a small block, and both were
+ * free, the remainder of the region must be split into blocks.
  * If a block is freed, and its buddy is also free, then this
- * triggers coalescing into a block of larger size.            
+ * triggers coalescing into a block of larger size.
  *
  * -- wli
  */
@@ -286,7 +305,7 @@ static inline void free_pages_check(cons
 }
 
 /*
- * Frees a list of pages. 
+ * Frees a list of pages.
  * Assumes all pages on list are in same zone, and of same order.
  * count is the number of pages to free, or 0 for all on the list.
  *
@@ -337,10 +356,33 @@ void __free_pages_ok(struct page *page, 
 	for (i = 0 ; i < (1 << order) ; ++i)
 		free_pages_check(__FUNCTION__, page + i);
 	list_add(&page->lru, &list);
-	kernel_map_pages(page, 1<<order, 0);
+	kernel_map_pages(page, 1 << order, 0);
 	free_pages_bulk(page_zone(page), 1, &list, order);
 }
 
+/*
+ * permit the bootmem allocator to evade page validation on high-order frees
+ */
+fastcall void __init __free_pages_bootmem(struct page *page, unsigned int order)
+{
+	set_page_refs(page, order);
+	set_page_count(page, 0);
+
+	if (order == 0) {
+		free_hot_cold_page(page, 0);
+	} else {
+		LIST_HEAD(list);
+
+		arch_free_page(page, order);
+
+		mod_page_state(pgfree, 1 << order);
+
+		list_add(&page->lru, &list);
+		kernel_map_pages(page, 1 << order, 0);
+		free_pages_bulk(page_zone(page), 1, &list, order);
+	}
+}
+
 
 /*
  * The order of subdivision here is critical for the IO subsystem.
@@ -374,23 +416,6 @@ expand(struct zone *zone, struct page *p
 	return page;
 }
 
-void set_page_refs(struct page *page, int order)
-{
-#ifdef CONFIG_MMU
-	set_page_count(page, 1);
-#else
-	int i;
-
-	/*
-	 * We need to reference all the pages for this order, otherwise if
-	 * anyone accesses one of the pages with (get/put) it will be freed.
-	 * - eg: access_process_vm()
-	 */
-	for (i = 0; i < (1 << order); i++)
-		set_page_count(page + i, 1);
-#endif /* CONFIG_MMU */
-}
-
 /*
  * This page is about to be returned from the page allocator
  */
@@ -415,7 +440,7 @@ static void prep_new_page(struct page *p
 	set_page_refs(page, order);
 }
 
-/* 
+/*
  * Do the hard work of removing an element from the buddy allocator.
  * Call me with the zone->lock already held.
  */
@@ -441,19 +466,19 @@ static struct page *__rmqueue(struct zon
 	return NULL;
 }
 
-/* 
+/*
  * Obtain a specified number of elements from the buddy allocator, all under
  * a single hold of the lock, for efficiency.  Add them to the supplied list.
  * Returns the number of new pages which were placed at *list.
  */
-static int rmqueue_bulk(struct zone *zone, unsigned int order, 
+static int rmqueue_bulk(struct zone *zone, unsigned int order,
 			unsigned long count, struct list_head *list)
 {
 	unsigned long flags;
 	int i;
 	int allocated = 0;
 	struct page *page;
-	
+
 	spin_lock_irqsave(&zone->lock, flags);
 	for (i = 0; i < count; ++i) {
 		page = __rmqueue(zone, order);
@@ -517,9 +542,9 @@ void drain_local_pages(void)
 {
 	unsigned long flags;
 
-	local_irq_save(flags);	
+	local_irq_save(flags);
 	__drain_pages(smp_processor_id());
-	local_irq_restore(flags);	
+	local_irq_restore(flags);
 }
 #endif /* CONFIG_PM */
 
@@ -552,8 +577,7 @@ static void zone_statistics(struct zonel
 /*
  * Free a 0-order page
  */
-static void FASTCALL(free_hot_cold_page(struct page *page, int cold));
-static void fastcall free_hot_cold_page(struct page *page, int cold)
+void fastcall free_hot_cold_page(struct page *page, int cold)
 {
 	struct zone *zone = page_zone(page);
 	struct per_cpu_pages *pcp;
@@ -580,7 +604,7 @@ void fastcall free_hot_page(struct page 
 {
 	free_hot_cold_page(page, 0);
 }
-	
+
 void fastcall free_cold_page(struct page *page)
 {
 	free_hot_cold_page(page, 1);
@@ -957,14 +981,6 @@ fastcall unsigned long get_zeroed_page(u
 
 EXPORT_SYMBOL(get_zeroed_page);
 
-void __pagevec_free(struct pagevec *pvec)
-{
-	int i = pagevec_count(pvec);
-
-	while (--i >= 0)
-		free_hot_cold_page(pvec->pages[i], pvec->cold);
-}
-
 fastcall void __free_pages(struct page *page, unsigned int order)
 {
 	if (!PageReserved(page) && put_page_testzero(page)) {
@@ -987,6 +1003,26 @@ fastcall void free_pages(unsigned long a
 
 EXPORT_SYMBOL(free_pages);
 
+#ifdef CONFIG_HUGETLB_PAGE
+
+void put_page(struct page *page)
+{
+	if (unlikely(PageCompound(page))) {
+		page = (struct page *)page->private;
+		if (put_page_testzero(page)) {
+			void (*dtor)(struct page *page);
+
+			dtor = (void (*)(struct page *))page[1].mapping;
+			(*dtor)(page);
+		}
+		return;
+	}
+	if (!PageReserved(page) && put_page_testzero(page))
+		__page_cache_release(page);
+}
+EXPORT_SYMBOL(put_page);
+#endif
+
 /*
  * Total amount of free (allocatable) RAM:
  */
@@ -1498,7 +1534,7 @@ static void __init build_zonelists(pg_da
  			j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
  		for (node = 0; node < local_node; node++)
  			j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
- 
+
 		zonelist->zones[j] = NULL;
 	}
 }
@@ -1636,7 +1672,7 @@ static void __init free_area_init_core(s
 	pgdat->nr_zones = 0;
 	init_waitqueue_head(&pgdat->kswapd_wait);
 	pgdat->kswapd_max_order = 0;
-	
+
 	for (j = 0; j < MAX_NR_ZONES; j++) {
 		struct zone *zone = pgdat->node_zones + j;
 		unsigned long size, realsize;
@@ -1798,7 +1834,7 @@ static void frag_stop(struct seq_file *m
 {
 }
 
-/* 
+/*
  * This walks the free areas for each zone.
  */
 static int frag_show(struct seq_file *m, void *arg)
@@ -2038,8 +2074,8 @@ static void setup_per_zone_protection(vo
 }
 
 /*
- * setup_per_zone_pages_min - called when min_free_kbytes changes.  Ensures 
- *	that the pages_{min,low,high} values for each zone are set correctly 
+ * setup_per_zone_pages_min - called when min_free_kbytes changes.  Ensures
+ *	that the pages_{min,low,high} values for each zone are set correctly
  *	with respect to min_free_kbytes.
  */
 static void setup_per_zone_pages_min(void)
@@ -2073,10 +2109,10 @@ static void setup_per_zone_pages_min(voi
 				min_pages = 128;
 			zone->pages_min = min_pages;
 		} else {
-			/* if it's a lowmem zone, reserve a number of pages 
+			/* if it's a lowmem zone, reserve a number of pages
 			 * proportionate to the zone's size.
 			 */
-			zone->pages_min = (pages_min * zone->present_pages) / 
+			zone->pages_min = (pages_min * zone->present_pages) /
 			                   lowmem_pages;
 		}
 
@@ -2132,11 +2168,11 @@ static int __init init_per_zone_pages_mi
 module_init(init_per_zone_pages_min)
 
 /*
- * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 
+ * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
  *	that we can call two helper functions whenever min_free_kbytes
  *	changes.
  */
-int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 
+int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
 		struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
 {
 	proc_dointvec(table, write, file, buffer, length, ppos);
diff -uNrp /warthog/kernels/linux-2.6.10-rc2-mm3/mm/swap.c linux-2.6.10-rc2-mm3-mmcleanup/mm/swap.c
--- /warthog/kernels/linux-2.6.10-rc2-mm3/mm/swap.c	2004-11-22 10:54:18.000000000 +0000
+++ linux-2.6.10-rc2-mm3-mmcleanup/mm/swap.c	2004-11-23 15:31:55.602409470 +0000
@@ -30,30 +30,11 @@
 #include <linux/cpu.h>
 #include <linux/notifier.h>
 #include <linux/init.h>
+#include "internal.h"
 
 /* How many pages do we try to swap or page in/out together? */
 int page_cluster;
 
-#ifdef CONFIG_HUGETLB_PAGE
-
-void put_page(struct page *page)
-{
-	if (unlikely(PageCompound(page))) {
-		page = (struct page *)page->private;
-		if (put_page_testzero(page)) {
-			void (*dtor)(struct page *page);
-
-			dtor = (void (*)(struct page *))page[1].mapping;
-			(*dtor)(page);
-		}
-		return;
-	}
-	if (!PageReserved(page) && put_page_testzero(page))
-		__page_cache_release(page);
-}
-EXPORT_SYMBOL(put_page);
-#endif
-
 /*
  * Writeback is about to end against a page which has been marked for immediate
  * reclaim.  If it still appears to be reclaimable, move it to the tail of the
@@ -242,6 +223,14 @@ void release_pages(struct page **pages, 
 	pagevec_free(&pages_to_free);
 }
 
+void __pagevec_free(struct pagevec *pvec)
+{
+	int i = pagevec_count(pvec);
+
+	while (--i >= 0)
+		free_hot_cold_page(pvec->pages[i], pvec->cold);
+}
+
 /*
  * The pages which we're about to release may be in the deferred lru-addition
  * queues.  That would prevent them from really being freed right now.  That's

* [PATCH 5/5] NOMMU: Further nommu shared memory support
  2004-12-09 15:08 [PATCH 1/5] NOMMU: MM cleanups dhowells
  2004-12-09 15:08 ` [PATCH 4/5] NOMMU: Make POSIX shmem work on ramfs-backed files dhowells
@ 2004-12-09 15:08 ` dhowells
  2004-12-09 15:08 ` [PATCH 3/5] NOMMU: mmap fixes and extensions dhowells
  2004-12-09 15:08 ` [PATCH 2/5] NOMMU: High-order page management overhaul dhowells
  3 siblings, 0 replies; 11+ messages in thread
From: dhowells @ 2004-12-09 15:08 UTC (permalink / raw)
  To: akpm, davidm, gerg, wli; +Cc: linux-kernel, uclinux-dev

The attached patch furthers shared memory support under !MMU conditions:

 (1) tiny-shmem.c farms get_unmapped_area() and mmap() requests off to ramfs in
     addition to the inode creation requests, thus supporting SYSV SHM.

 (2) tiny-shmem.c no longer sizes the inode it allocates by writing directly
     into i_size; it now calls do_truncate() so that the size change is
     properly notified.

 (3) SYSV IPC is then available on nommu.

     (a) SYSV SHM requires shmem_mmap() to be provided by the backing
         filesystem (tmpfs in shmem.c or tiny-shmem.c).

     (b) SYSV SHM requires shmem_get_unmapped_area() to be provided by the
         backing fs under !MMU conditions.
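
For illustration only (not part of the patch): with the "depends on MMU"
restriction dropped from SYSVIPC, a uClinux process can use the ordinary SYSV
shared memory calls, the segment ending up backed by ramfs pages via
tiny-shmem.c:

	#include <sys/ipc.h>
	#include <sys/shm.h>
	#include <string.h>

	int shm_demo(void)
	{
		int id;
		char *p;

		id = shmget(IPC_PRIVATE, 65536, IPC_CREAT | 0600);
		if (id < 0)
			return -1;

		p = shmat(id, NULL, 0);
		if (p == (char *) -1)
			return -1;

		strcpy(p, "hello");	/* lands in the ramfs-backed pages */
		shmdt(p);
		shmctl(id, IPC_RMID, NULL);
		return 0;
	}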

Signed-Off-By: dhowells@redhat.com
---
diffstat nommu-shmem-2610rc2mm3-3.diff
 include/linux/mm.h |   11 +++++++++++
 init/Kconfig       |    1 -
 ipc/shm.c          |   16 +++++++++++-----
 mm/shmem.c         |    2 +-
 mm/tiny-shmem.c    |   26 +++++++++++++++++++++++++-
 5 files changed, 48 insertions(+), 8 deletions(-)

diff -uNrp linux-2.6.10-rc2-mm3-mmcleanup/init/Kconfig linux-2.6.10-rc2-mm3-shmem/init/Kconfig
--- linux-2.6.10-rc2-mm3-mmcleanup/init/Kconfig	2004-11-22 10:54:17.000000000 +0000
+++ linux-2.6.10-rc2-mm3-shmem/init/Kconfig	2004-12-01 17:07:36.000000000 +0000
@@ -81,7 +81,6 @@ config SWAP
 
 config SYSVIPC
 	bool "System V IPC"
-	depends on MMU
 	---help---
 	  Inter Process Communication is a suite of library functions and
 	  system calls which let processes (running programs) synchronize and
diff -uNrp linux-2.6.10-rc2-mm3-mmcleanup/include/linux/mm.h linux-2.6.10-rc2-mm3-shmem/include/linux/mm.h
--- linux-2.6.10-rc2-mm3-mmcleanup/include/linux/mm.h	2004-11-22 10:54:16.000000000 +0000
+++ linux-2.6.10-rc2-mm3-shmem/include/linux/mm.h	2004-12-08 16:52:24.000000000 +0000
@@ -566,9 +603,20 @@ int shmem_lock(struct file *file, int lo
 #define shmem_get_policy(a, b)	(NULL)
 #endif
 struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags);
+extern int shmem_mmap(struct file *file, struct vm_area_struct *vma);
 
 int shmem_zero_setup(struct vm_area_struct *);
 
+#ifdef CONFIG_MMU
+#define shmem_get_unmapped_area(f,a,l,p,fl) 0
+#else
+extern unsigned long shmem_get_unmapped_area(struct file *file,
+					     unsigned long addr,
+					     unsigned long len,
+					     unsigned long pgoff,
+					     unsigned long flags);
+#endif
+
 static inline int can_do_mlock(void)
 {
 	if (capable(CAP_IPC_LOCK))
diff -uNrp linux-2.6.10-rc2-mm3-mmcleanup/ipc/shm.c linux-2.6.10-rc2-mm3-shmem/ipc/shm.c
--- linux-2.6.10-rc2-mm3-mmcleanup/ipc/shm.c	2004-11-22 10:54:17.000000000 +0000
+++ linux-2.6.10-rc2-mm3-shmem/ipc/shm.c	2004-12-02 15:20:36.000000000 +0000
@@ -157,14 +157,20 @@ static void shm_close (struct vm_area_st
 
 static int shm_mmap(struct file * file, struct vm_area_struct * vma)
 {
-	file_accessed(file);
-	vma->vm_ops = &shm_vm_ops;
-	shm_inc(file->f_dentry->d_inode->i_ino);
-	return 0;
+	int ret;
+
+	ret = shmem_mmap(file, vma);
+	if (ret == 0) {
+		vma->vm_ops = &shm_vm_ops;
+		shm_inc(file->f_dentry->d_inode->i_ino);
+	}
+
+	return ret;
 }
 
 static struct file_operations shm_file_operations = {
-	.mmap	= shm_mmap
+	.mmap	= shm_mmap,
+	.get_unmapped_area = shmem_get_unmapped_area,
 };
 
 static struct vm_operations_struct shm_vm_ops = {
diff -uNrp linux-2.6.10-rc2-mm3-mmcleanup/mm/shmem.c linux-2.6.10-rc2-mm3-shmem/mm/shmem.c
--- linux-2.6.10-rc2-mm3-mmcleanup/mm/shmem.c	2004-11-22 10:54:18.000000000 +0000
+++ linux-2.6.10-rc2-mm3-shmem/mm/shmem.c	2004-12-02 15:16:23.000000000 +0000
@@ -1246,7 +1246,7 @@ out_nomem:
 	return retval;
 }
 
-static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
+int shmem_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	file_accessed(file);
 	vma->vm_ops = &shmem_vm_ops;
diff -uNrp linux-2.6.10-rc2-mm3-mmcleanup/mm/tiny-shmem.c linux-2.6.10-rc2-mm3-shmem/mm/tiny-shmem.c
--- linux-2.6.10-rc2-mm3-mmcleanup/mm/tiny-shmem.c	2004-11-15 13:34:39.000000000 +0000
+++ linux-2.6.10-rc2-mm3-shmem/mm/tiny-shmem.c	2004-12-02 16:20:55.000000000 +0000
@@ -78,8 +78,13 @@ struct file *shmem_file_setup(char *name
 		goto close_file;
 
 	d_instantiate(dentry, inode);
-	inode->i_size = size;
 	inode->i_nlink = 0;	/* It is unlinked */
+
+	/* notify everyone as to the change of file size */
+	error = do_truncate(dentry, size);
+	if (error < 0)
+		goto close_file;
+
 	file->f_vfsmnt = mntget(shm_mnt);
 	file->f_dentry = dentry;
 	file->f_mapping = inode->i_mapping;
@@ -120,3 +125,22 @@ int shmem_unuse(swp_entry_t entry, struc
 {
 	return 0;
 }
+
+int shmem_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	file_accessed(file);
+#ifndef CONFIG_MMU
+	return ramfs_nommu_mmap(file, vma);
+#else
+	return 0;
+#endif
+}
+
+unsigned long shmem_get_unmapped_area(struct file *file,
+				      unsigned long addr,
+				      unsigned long len,
+				      unsigned long pgoff,
+				      unsigned long flags)
+{
+	return ramfs_nommu_get_unmapped_area(file, addr, len, pgoff, flags);
+}

* [PATCH 3/5] NOMMU: mmap fixes and extensions
  2004-12-09 15:08 [PATCH 1/5] NOMMU: MM cleanups dhowells
  2004-12-09 15:08 ` [PATCH 4/5] NOMMU: Make POSIX shmem work on ramfs-backed files dhowells
  2004-12-09 15:08 ` [PATCH 5/5] NOMMU: Further nommu shared memory support dhowells
@ 2004-12-09 15:08 ` dhowells
  2004-12-09 15:08 ` [PATCH 2/5] NOMMU: High-order page management overhaul dhowells
  3 siblings, 0 replies; 11+ messages in thread
From: dhowells @ 2004-12-09 15:08 UTC (permalink / raw)
  To: akpm, davidm, gerg, wli; +Cc: linux-kernel, uclinux-dev

The attached patch applies some fixes and extensions to the nommu mmap
implementation:

 (1) /proc/maps distinguishes shareable private mappings and real shared
     mappings by marking the former with 's' and the latter with 'S'.

 (2) Remove some #ifdefs from linux/mm.h now that proper VMAs are used.

 (3) Compile in prio_trees again now that proper VMAs are used.

 (4) Keep track of VMAs in the relevant mapping's prio_tree.

 (5) Only set VM_SHARED on MAP_SHARED mappings. Its presence indicates that the
     backing memory is supplied by the underlying file or chardev.

     VM_MAYSHARE indicates that a VMA may be shared if it's a private VMA
     (memory allocated by do_mmap_pgoff() calling kmalloc()).

 (6) Permit MAP_SHARED + PROT_WRITE on memory-backed files[*] and chardevs if
     the backing fs/chardev is willing to indicate a contiguous area of memory
     when its get_unmapped_area() is called.

     [*] file->f_mapping->backing_dev_info->memory_backed == 1

 (7) Uniquify overlapping VMAs (eg: MAP_SHARED on chardevs) in
     nommu_vma_tree. Identical entries break the assumptions on which rbtrees
     work. Since we don't need to share VMAs in this case, we uniquify such
     VMAs by using the pointer to the VMA. They're only kept in the tree for
     /proc/maps visibility.

With this patch it should be possible to map contiguous flash files directly
out of ROM simply by providing get_unmapped_area() for a read-only/shared
mapping.
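
For example (purely hypothetical, not part of this patch), a flash filesystem
that stores each file as one contiguous span of ROM could support this with
something like the following, where rom_file_base() and rom_file_pages() stand
in for however the real filesystem locates a file's data:

	static unsigned long flashfs_get_unmapped_area(struct file *file,
						       unsigned long addr,
						       unsigned long len,
						       unsigned long pgoff,
						       unsigned long flags)
	{
		unsigned long npages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;

		/* refuse anything that runs off the end of the file's ROM span */
		if (pgoff + npages > rom_file_pages(file))
			return (unsigned long) -EINVAL;

		/* the address returned here becomes the mapping itself */
		return rom_file_base(file) + (pgoff << PAGE_SHIFT);
	}

The read-only/shared mapping is then used directly out of ROM, with no copy
into kmalloc'd memory.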

Signed-Off-By: dhowells@redhat.com
---
diffstat nommu-mmap-2610rc2mm3-3.diff
 fs/proc/nommu.c    |    2 
 include/linux/mm.h |    4 -
 mm/Makefile        |    4 -
 mm/nommu.c         |  173 ++++++++++++++++++++++++++++++++++++++---------------
 4 files changed, 129 insertions(+), 54 deletions(-)

diff -uNrp linux-2.6.10-rc2-mm3-mmcleanup/fs/proc/nommu.c linux-2.6.10-rc2-mm3-shmem/fs/proc/nommu.c
--- linux-2.6.10-rc2-mm3-mmcleanup/fs/proc/nommu.c	2004-11-22 10:54:11.000000000 +0000
+++ linux-2.6.10-rc2-mm3-shmem/fs/proc/nommu.c	2004-12-03 11:53:00.000000000 +0000
@@ -62,7 +62,7 @@ static int nommu_vma_list_show(struct se
 		   flags & VM_READ ? 'r' : '-',
 		   flags & VM_WRITE ? 'w' : '-',
 		   flags & VM_EXEC ? 'x' : '-',
-		   flags & VM_MAYSHARE ? 's' : 'p',
+		   flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p',
 		   vma->vm_pgoff << PAGE_SHIFT,
 		   MAJOR(dev), MINOR(dev), ino, &len);
 
diff -uNrp linux-2.6.10-rc2-mm3-mmcleanup/include/linux/mm.h linux-2.6.10-rc2-mm3-shmem/include/linux/mm.h
--- linux-2.6.10-rc2-mm3-mmcleanup/include/linux/mm.h	2004-11-22 10:54:16.000000000 +0000
+++ linux-2.6.10-rc2-mm3-shmem/include/linux/mm.h	2004-12-08 16:52:24.000000000 +0000
@@ -724,14 +772,12 @@ struct vm_area_struct *vma_prio_tree_nex
 	for (prio_tree_iter_init(iter, root, begin, end), vma = NULL;	\
 		(vma = vma_prio_tree_next(vma, iter)); )
 
-#ifdef CONFIG_MMU
 static inline void vma_nonlinear_insert(struct vm_area_struct *vma,
 					struct list_head *list)
 {
 	vma->shared.vm_set.parent = NULL;
 	list_add_tail(&vma->shared.vm_set.list, list);
 }
-#endif
 
 /* mmap.c */
 extern void vma_adjust(struct vm_area_struct *vma, unsigned long start,
@@ -846,7 +892,6 @@ static inline void __vm_stat_account(str
 }
 #endif /* CONFIG_PROC_FS */
 
-#ifdef CONFIG_MMU
 static inline void vm_stat_account(struct vm_area_struct *vma)
 {
 	__vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file,
@@ -858,7 +903,6 @@ static inline void vm_stat_unaccount(str
 	__vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file,
 							-vma_pages(vma));
 }
-#endif
 
 /* update per process rss and vm hiwater data */
 extern void update_mem_hiwater(void);
diff -uNrp linux-2.6.10-rc2-mm3-mmcleanup/mm/Makefile linux-2.6.10-rc2-mm3-shmem/mm/Makefile
--- linux-2.6.10-rc2-mm3-mmcleanup/mm/Makefile	2004-11-22 10:54:18.000000000 +0000
+++ linux-2.6.10-rc2-mm3-shmem/mm/Makefile	2004-11-26 16:15:04.000000000 +0000
@@ -5,12 +5,12 @@
 mmu-y			:= nommu.o
 mmu-$(CONFIG_MMU)	:= fremap.o highmem.o madvise.o memory.o mincore.o \
 			   mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
-			   vmalloc.o prio_tree.o
+			   vmalloc.o
 
 obj-y			:= bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
 			   page_alloc.o page-writeback.o pdflush.o \
 			   readahead.o slab.o swap.o truncate.o vmscan.o \
-			   $(mmu-y)
+			   prio_tree.o $(mmu-y)
 
 obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o thrash.o
 obj-$(CONFIG_HUGETLBFS)	+= hugetlb.o
diff -uNrp linux-2.6.10-rc2-mm3-mmcleanup/mm/nommu.c linux-2.6.10-rc2-mm3-shmem/mm/nommu.c
--- linux-2.6.10-rc2-mm3-mmcleanup/mm/nommu.c	2004-11-22 10:54:18.000000000 +0000
+++ linux-2.6.10-rc2-mm3-shmem/mm/nommu.c	2004-12-07 18:44:17.000000000 +0000
@@ -48,10 +48,6 @@ DECLARE_RWSEM(nommu_vma_sem);
 struct vm_operations_struct generic_file_vm_ops = {
 };
 
-void __init prio_tree_init(void)
-{
-}
-
 /*
  * Handle all mappings that got truncated by a "truncate()"
  * system call.
@@ -315,25 +311,69 @@ static inline struct vm_area_struct *fin
 static void add_nommu_vma(struct vm_area_struct *vma)
 {
 	struct vm_area_struct *pvma;
+	struct address_space *mapping;
 	struct rb_node **p = &nommu_vma_tree.rb_node;
 	struct rb_node *parent = NULL;
 
+	/* add the VMA to the mapping */
+	if (vma->vm_file) {
+		mapping = vma->vm_file->f_mapping;
+
+		flush_dcache_mmap_lock(mapping);
+		vma_prio_tree_insert(vma, &mapping->i_mmap);
+		flush_dcache_mmap_unlock(mapping);
+	}
+
+	/* add the VMA to the master list */
 	while (*p) {
 		parent = *p;
 		pvma = rb_entry(parent, struct vm_area_struct, vm_rb);
 
-		if (vma->vm_start < pvma->vm_start)
+		if (vma->vm_start < pvma->vm_start) {
 			p = &(*p)->rb_left;
-		else if (vma->vm_start > pvma->vm_start)
+		}
+		else if (vma->vm_start > pvma->vm_start) {
 			p = &(*p)->rb_right;
-		else
-			BUG(); /* shouldn't happen by this point */
+		}
+		else {
+			/* mappings are at the same address - this can only
+			 * happen for shared-mem chardevs and shared file
+			 * mappings backed by ramfs/tmpfs */
+			BUG_ON(!(pvma->vm_flags & VM_SHARED));
+
+			if (vma < pvma)
+				p = &(*p)->rb_left;
+			else if (vma > pvma)
+				p = &(*p)->rb_right;
+			else
+				BUG();
+		}
 	}
 
 	rb_link_node(&vma->vm_rb, parent, p);
 	rb_insert_color(&vma->vm_rb, &nommu_vma_tree);
 }
 
+static void delete_nommu_vma(struct vm_area_struct *vma)
+{
+	struct address_space *mapping;
+
+	/* remove the VMA from the mapping */
+	if (vma->vm_file) {
+		mapping = vma->vm_file->f_mapping;
+
+		flush_dcache_mmap_lock(mapping);
+		vma_prio_tree_remove(vma, &mapping->i_mmap);
+		flush_dcache_mmap_unlock(mapping);
+	}
+
+	/* remove from the master list */
+	rb_erase(&vma->vm_rb, &nommu_vma_tree);
+}
+
+/*
+ * handle mapping creation for uClinux
+ */
 unsigned long do_mmap_pgoff(struct file *file,
 			    unsigned long addr,
 			    unsigned long len,
@@ -343,19 +383,27 @@ unsigned long do_mmap_pgoff(struct file 
 {
 	struct vm_list_struct *vml = NULL;
 	struct vm_area_struct *vma = NULL;
+	struct address_space *mapping = NULL;
 	struct rb_node *rb;
 	unsigned int vm_flags;
 	void *result;
-	int ret, chrdev;
+	int ret, chrdev, memback;
 
 	/*
 	 * Get the !CONFIG_MMU specific checks done first
 	 */
+	memback = 0;
 	chrdev = 0;
-	if (file)
+	if (file) {
 		chrdev = S_ISCHR(file->f_dentry->d_inode->i_mode);
+		mapping = file->f_mapping;
+		if (!mapping)
+			mapping = file->f_dentry->d_inode->i_mapping;
+		if (mapping && mapping->backing_dev_info)
+			memback = mapping->backing_dev_info->memory_backed;
+	}
 
-	if ((flags & MAP_SHARED) && (prot & PROT_WRITE) && file && !chrdev) {
+	if ((flags & MAP_SHARED) && (prot & PROT_WRITE) && file && !chrdev && !memback) {
 		printk("MAP_SHARED not completely supported (cannot detect page dirtying)\n");
 		return -EINVAL;
 	}
@@ -387,49 +435,53 @@ unsigned long do_mmap_pgoff(struct file 
 		goto error_getting_vml;
 	memset(vml, 0, sizeof(*vml));
 
-	/* Do simple checking here so the lower-level routines won't have
+	/* do simple checking here so the lower-level routines won't have
 	 * to. we assume access permissions have been handled by the open
 	 * of the memory object, so we don't do any here.
 	 */
 	vm_flags = calc_vm_flags(prot,flags) /* | mm->def_flags */
 		| VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
 
-	if (!chrdev) {
+	if (!chrdev && !memback) {
 		/* share any file segment that's mapped read-only */
 		if (((flags & MAP_PRIVATE) && !(prot & PROT_WRITE) && file) ||
 		    ((flags & MAP_SHARED) && !(prot & PROT_WRITE) && file))
-			vm_flags |= VM_SHARED | VM_MAYSHARE;
+			vm_flags |= VM_MAYSHARE;
 
 		/* refuse to let anyone share files with this process if it's being traced -
 		 * otherwise breakpoints set in it may interfere with another untraced process
 		 */
-		if (!chrdev && current->ptrace & PT_PTRACED)
+		if (current->ptrace & PT_PTRACED)
 			vm_flags &= ~(VM_SHARED | VM_MAYSHARE);
 	}
 	else {
-		/* permit sharing of character devices at any time */
+		/* permit sharing of character devices and ramfs files at any time */
 		vm_flags |= VM_MAYSHARE;
 		if (flags & MAP_SHARED)
 			vm_flags |= VM_SHARED;
 	}
 
-	/* if we want to share, we need to search for VMAs created by another mmap() call that
-	 * overlap with our proposed mapping
-	 * - we can only share with an exact match on regular files
-	 * - shared mappings on character devices are permitted to overlap inexactly as far as we
-	 *   are concerned, but in that case, sharing is handled in the driver rather than here
-	 */
 	down_write(&nommu_vma_sem);
-	if (!chrdev && vm_flags & VM_SHARED) {
+
+	/* if we want to share, we need to search for VMAs created by another
+	 * mmap() call that overlap with our proposed mapping
+	 * - we can only share with an exact match on most regular files
+	 * - shared mappings on character devices and memory backed files are
+	 *   permitted to overlap inexactly as far as we are concerned for in
+	 *   these cases, sharing is handled in the driver or filesystem rather
+	 *   than here
+	 */
+	if (vm_flags & VM_MAYSHARE) {
 		unsigned long pglen = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
 		unsigned long vmpglen;
 
 		for (rb = rb_first(&nommu_vma_tree); rb; rb = rb_next(rb)) {
 			vma = rb_entry(rb, struct vm_area_struct, vm_rb);
 
-			if (!(vma->vm_flags & VM_SHARED))
+			if (!(vma->vm_flags & VM_MAYSHARE))
 				continue;
 
+			/* search for overlapping mappings on the same file */
 			if (vma->vm_file->f_dentry->d_inode != file->f_dentry->d_inode)
 				continue;
 
@@ -440,8 +492,9 @@ unsigned long do_mmap_pgoff(struct file 
 			if (pgoff >= vma->vm_pgoff + vmpglen)
 				continue;
 
+			/* handle inexact matches between mappings */
 			if (vmpglen != pglen || vma->vm_pgoff != pgoff) {
-				if (flags & MAP_SHARED)
+				if (!chrdev && !memback)
 					goto sharing_violation;
 				continue;
 			}
@@ -455,6 +508,8 @@ unsigned long do_mmap_pgoff(struct file 
 		}
 	}
 
+	vma = NULL;
+
 	/* obtain the address to map to. we verify (or select) it and ensure
 	 * that it represents a valid section of the address space
 	 * - this is the hook for quasi-memory character devices
@@ -496,7 +551,6 @@ unsigned long do_mmap_pgoff(struct file 
 
 #ifdef MAGIC_ROM_PTR
 		/* First, try simpler routine designed to give us a ROM pointer. */
-
 		if (file->f_op->romptr && !(prot & PROT_WRITE)) {
 			ret = file->f_op->romptr(file, vma);
 #ifdef DEBUG
@@ -510,9 +564,9 @@ unsigned long do_mmap_pgoff(struct file 
 				goto error;
 		} else
 #endif /* MAGIC_ROM_PTR */
-		/* Then try full mmap routine, which might return a RAM pointer,
-		   or do something truly complicated. */
-
+		/* Then try full mmap routine, which might return a RAM
+		 * pointer, or do something truly complicated
+		 */
 		if (file->f_op->mmap) {
 			ret = file->f_op->mmap(file, vma);
 
@@ -530,8 +584,9 @@ unsigned long do_mmap_pgoff(struct file 
 			goto error;
 		}
 
-		/* An ENOSYS error indicates that mmap isn't possible (as opposed to
-		   tried but failed) so we'll fall through to the copy. */
+		/* An ENOSYS error indicates that mmap isn't possible (as
+		 * opposed to tried but failed) so we'll fall through to the
+		 * copy. */
 	}
 
 	/* allocate some memory to hold the mapping */
@@ -576,8 +631,10 @@ unsigned long do_mmap_pgoff(struct file 
 		flush_icache_range((unsigned long) result, (unsigned long) result + len);
 
  done:
-	realalloc += kobjsize(result);
-	askedalloc += len;
+	if (!(vma->vm_flags & VM_SHARED)) {
+		realalloc += kobjsize(result);
+		askedalloc += len;
+	}
 
 	realalloc += kobjsize(vma);
 	askedalloc += sizeof(*vma);
@@ -639,21 +696,24 @@ static void put_vma(struct vm_area_struc
 		down_write(&nommu_vma_sem);
 
 		if (atomic_dec_and_test(&vma->vm_usage)) {
-			rb_erase(&vma->vm_rb, &nommu_vma_tree);
+			delete_nommu_vma(vma);
 
 			if (vma->vm_ops && vma->vm_ops->close)
 				vma->vm_ops->close(vma);
 
-			if (!(vma->vm_flags & VM_IO) && vma->vm_start) {
+			/* IO memory and memory shared directly out of the pagecache from
+			 * ramfs/tmpfs mustn't be released here */
+			if (!(vma->vm_flags & (VM_IO | VM_SHARED)) && vma->vm_start) {
 				realalloc -= kobjsize((void *) vma->vm_start);
 				askedalloc -= vma->vm_end - vma->vm_start;
-				if (vma->vm_file)
-					fput(vma->vm_file);
 				kfree((void *) vma->vm_start);
 			}
 
 			realalloc -= kobjsize(vma);
 			askedalloc -= sizeof(*vma);
+
+			if (vma->vm_file)
+				fput(vma->vm_file);
 			kfree(vma);
 		}
 
@@ -664,6 +724,7 @@ static void put_vma(struct vm_area_struc
 int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len)
 {
 	struct vm_list_struct *vml, **parent;
+	unsigned long end = addr + len;
 
 #ifdef MAGIC_ROM_PTR
 	/* For efficiency's sake, if the pointer is obviously in ROM,
@@ -677,15 +738,16 @@ int do_munmap(struct mm_struct *mm, unsi
 #endif
 
 	for (parent = &mm->context.vmlist; *parent; parent = &(*parent)->next)
-		if ((*parent)->vma->vm_start == addr)
-			break;
-	vml = *parent;
+		if ((*parent)->vma->vm_start == addr &&
+		    (*parent)->vma->vm_end == end)
+			goto found;
 
-	if (!vml) {
-		printk("munmap of non-mmaped memory by process %d (%s): %p\n",
-		       current->pid, current->comm, (void *) addr);
-		return -EINVAL;
-	}
+	printk("munmap of non-mmaped memory by process %d (%s): %p\n",
+	       current->pid, current->comm, (void *) addr);
+	return -EINVAL;
+
+ found:
+	vml = *parent;
 
 	put_vma(vml->vma);
 
@@ -793,12 +855,23 @@ unsigned long do_mremap(unsigned long ad
 	return vml->vma->vm_start;
 }
 
-struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr)
+/*
+ * Look up the first VMA which satisfies  addr < vm_end,  NULL if none
+ */
+struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
 {
+	struct vm_list_struct *vml;
+
+	for (vml = mm->context.vmlist; vml; vml = vml->next)
+		if (addr >= vml->vma->vm_start && addr < vml->vma->vm_end)
+			return vml->vma;
+
 	return NULL;
 }
 
-struct page * follow_page(struct mm_struct *mm, unsigned long addr, int write)
+EXPORT_SYMBOL(find_vma);
+
+struct page *follow_page(struct mm_struct *mm, unsigned long addr, int write)
 {
 	return NULL;
 }
@@ -845,3 +918,9 @@ void unmap_mapping_range(struct address_
 			 int even_cows)
 {
 }
+
+struct page * filemap_nopage(struct vm_area_struct * area, unsigned long address, int *type)
+{
+	BUG();
+	return NULL;
+}

* [PATCH 2/5] NOMMU: High-order page management overhaul
  2004-12-09 15:08 [PATCH 1/5] NOMMU: MM cleanups dhowells
                   ` (2 preceding siblings ...)
  2004-12-09 15:08 ` [PATCH 3/5] NOMMU: mmap fixes and extensions dhowells
@ 2004-12-09 15:08 ` dhowells
  3 siblings, 0 replies; 11+ messages in thread
From: dhowells @ 2004-12-09 15:08 UTC (permalink / raw)
  To: akpm, davidm, gerg, wli; +Cc: linux-kernel, uclinux-dev

The attached patch overhauls high-order page handling.

 (1) A new bit flag PG_compound_slave has been added. This is used to mark the
     second+ subpages of a compound page, letting get_page() and put_page()
     quickly determine whether any special handling is needed.

     This could be changed to do horrible things with the page count or to
     abuse the page->lru member instead of eating another page flag.

 (2) Compound page metadata is now always set on compound pages when allocating
     and checked when freeing. This metadata is mostly as it was before:

	- PG_compound is set on all subpages
	- PG_compound_slave is set on all but the first subpage <--- [1]
	- page[1].index holds the compound page order
	- page[1...N-1].private points to page[0]. <--- [2]
	- page[1].mapping may hold a destructor function for put_page()

     This is now done in prep_new_page().

     [1] New metadata addition
     [2] Page private is no longer modified on page[0]

 (3) page_head() is now provided to find the first page of any page set
     (even single page sets).

 (4) A new config option ENHANCED_COMPOUND_PAGES is now available. This is
     only set on !MMU or HUGETLB_PAGE. It causes page_head() to dereference
     page->private if PG_compound_slave is set.

 (5) __GFP_COMP is required to request a compound page. This is asserted by the
     slab allocator when it allocates a page. The flag is ignored for any
     single-page allocation.

 (6) compound_page_order() is now available. This indicates the order of a
     compound page; high-order blocks that are not marked compound are
     reported as order 0.

     Since it is now trivial to work out the order of any page, free_pages()
     and co could all lose their order arguments.

 (7) bad_page() now prints more information, including details of additional
     pages when a compound page is involved.

 (8) prep_compound_page() and destroy_compound_page() have been absorbed.

 (9) A lot more unlikely() clauses have been inserted in the free page
     checking functions.

(10) The !MMU bits have all gone from page_alloc.c.

(11) There's now a page destructor prototype and a function to set the
     destructor on compound pages.

(12) Two functions are now provided in page_alloc.c to dissociate high-order or
     compound pages into pages of a smaller order.

Note: I've changed my patch such that high-order pages aren't always marked
compound now. This has reverted to being contingent on the __GFP_COMP flag
being passed to __alloc_pages(). The slab allocator now always supplies this
flag.
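
To make the new interfaces concrete, here is a usage sketch (illustrative
only, not part of the patch) of the metadata that prep_new_page() lays out for
a __GFP_COMP allocation; it assumes CONFIG_ENHANCED_COMPOUND_PAGES:

	static void compound_page_demo(void)
	{
		struct page *page = alloc_pages(GFP_KERNEL | __GFP_COMP, 3);

		if (!page)
			return;

		/* every subpage is PG_compound; all but the head are also
		 * PG_compound_slave and point back at the head via ->private */
		BUG_ON(!PageCompound(page) || !PageCompound(page + 7));
		BUG_ON(PageCompoundSlave(page));
		BUG_ON(page_head(page + 7) != page);

		/* the order is stored in page[1].index */
		BUG_ON(compound_page_order(page + 5) != 3);

		/* references taken on any subpage are counted on the head */
		get_page(page + 2);
		BUG_ON(page_count(page) != 2);
		put_page(page + 2);

		__free_pages(page, 3);
	}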

Signed-Off-By: dhowells@redhat.com
---
diffstat compound-2610rc2mm3-3.diff
 include/linux/mm.h         |   69 ++++++--
 include/linux/page-flags.h |    6 
 init/Kconfig               |   13 +
 mm/hugetlb.c               |    4 
 mm/page_alloc.c            |  388 ++++++++++++++++++++++++++++-----------------
 mm/slab.c                  |    2 
 6 files changed, 323 insertions(+), 159 deletions(-)

diff -uNrp linux-2.6.10-rc2-mm3-mmcleanup/include/linux/mm.h linux-2.6.10-rc2-mm3-shmem/include/linux/mm.h
--- linux-2.6.10-rc2-mm3-mmcleanup/include/linux/mm.h	2004-11-22 10:54:16.000000000 +0000
+++ linux-2.6.10-rc2-mm3-shmem/include/linux/mm.h	2004-12-08 16:52:24.000000000 +0000
@@ -227,6 +227,12 @@ typedef unsigned long page_flags_t;
  * it to keep track of whatever it is we are using the page for at the
  * moment. Note that we have no way to track which tasks are using
  * a page.
+ *
+ * Any high-order page allocation has all the pages marked PG_compound. The
+ * first page of such a block holds the block's usage count and control
+ * data. The second page holds the order in its index member and a destructor
+ * function pointer in its mapping member. In enhanced compound page mode, the
+ * second+ pages have their private pointers pointing at the first page.
  */
 struct page {
 	page_flags_t flags;		/* Atomic flags, some possibly
@@ -314,45 +320,76 @@ struct page {
  */
 #define get_page_testone(p)	atomic_inc_and_test(&(p)->_count)
 
-#define set_page_count(p,v) 	atomic_set(&(p)->_count, v - 1)
+#define set_page_count(p,v) 	atomic_set(&(p)->_count, (v) - 1)
 #define __put_page(p)		atomic_dec(&(p)->_count)
 
 extern void FASTCALL(__page_cache_release(struct page *));
 
-#ifdef CONFIG_HUGETLB_PAGE
-
-static inline int page_count(struct page *p)
+static inline struct page *page_head(struct page *page)
 {
-	if (PageCompound(p))
-		p = (struct page *)p->private;
-	return atomic_read(&(p)->_count) + 1;
+#ifdef CONFIG_ENHANCED_COMPOUND_PAGES
+	if (unlikely(PageCompoundSlave(page)))
+		page = (struct page *) page->private;
+#endif
+	return page;
 }
 
-static inline void get_page(struct page *page)
+static inline unsigned compound_page_order(struct page *page)
 {
-	if (unlikely(PageCompound(page)))
-		page = (struct page *)page->private;
-	atomic_inc(&page->_count);
+	unsigned order = 0;
+
+	if (unlikely(PageCompound(page))) {
+		page = page_head(page);
+		order = page[1].index;
+	}
+	return order;
 }
 
-void put_page(struct page *page);
+extern void split_compound_page(struct page *page, unsigned new_order);
+extern void split_highorder_page(struct page *page, unsigned new_order,
+				 unsigned old_order);
 
-#else		/* CONFIG_HUGETLB_PAGE */
+typedef void (*page_dtor_t)(struct page *);
 
-#define page_count(p)		(atomic_read(&(p)->_count) + 1)
+static inline page_dtor_t page_dtor(struct page *page)
+{
+	page_dtor_t dtor = NULL;
+
+	if (unlikely(PageCompound(page))) {
+		page = page_head(page);
+		dtor = (page_dtor_t) page[1].mapping;
+	}
+	return dtor;
+}
+
+static inline void set_page_dtor(struct page *page, page_dtor_t dtor)
+{
+	BUG_ON(!PageCompound(page));
+	BUG_ON(PageCompoundSlave(page));
+	page[1].mapping = (void *) dtor;
+}
+
+static inline int page_count(struct page *page)
+{
+	page = page_head(page);
+	return atomic_read(&page->_count) + 1;
+}
 
 static inline void get_page(struct page *page)
 {
+	page = page_head(page);
 	atomic_inc(&page->_count);
 }
 
+#ifdef CONFIG_ENHANCED_COMPOUND_PAGES
+extern fastcall void put_page(struct page *page);
+#else
 static inline void put_page(struct page *page)
 {
 	if (!PageReserved(page) && put_page_testzero(page))
 		__page_cache_release(page);
 }
-
-#endif		/* CONFIG_HUGETLB_PAGE */
+#endif /* CONFIG_ENHANCED_COMPOUND_PAGES */
 
 /*
  * Multiple processes may "see" the same page. E.g. for untouched
diff -uNrp linux-2.6.10-rc2-mm3-mmcleanup/include/linux/page-flags.h linux-2.6.10-rc2-mm3-shmem/include/linux/page-flags.h
--- linux-2.6.10-rc2-mm3-mmcleanup/include/linux/page-flags.h	2004-11-22 10:54:16.000000000 +0000
+++ linux-2.6.10-rc2-mm3-shmem/include/linux/page-flags.h	2004-11-22 11:45:09.000000000 +0000
@@ -78,6 +78,7 @@
 #define PG_sharedpolicy         19      /* Page was allocated for a file
 					   mapping using a shared_policy */
 
+#define PG_compound_slave	20	/* second+ page of a compound page */
 
 /*
  * Global page accounting.  One instance per CPU.  Only unsigned longs are
@@ -294,6 +295,11 @@ extern unsigned long __read_page_state(u
 #define PageCompound(page)	test_bit(PG_compound, &(page)->flags)
 #define SetPageCompound(page)	set_bit(PG_compound, &(page)->flags)
 #define ClearPageCompound(page)	clear_bit(PG_compound, &(page)->flags)
+#define __ClearPageCompound(page)	__clear_bit(PG_compound, &(page)->flags)
+
+#define PageCompoundSlave(page)		test_bit(PG_compound_slave, &(page)->flags)
+#define SetPageCompoundSlave(page)	set_bit(PG_compound_slave, &(page)->flags)
+#define ClearPageCompoundSlave(page)	clear_bit(PG_compound_slave, &(page)->flags)
 
 #define PageSharedPolicy(page)      test_bit(PG_sharedpolicy, &(page)->flags)
 #define SetPageSharedPolicy(page)   set_bit(PG_sharedpolicy, &(page)->flags)
diff -uNrp linux-2.6.10-rc2-mm3-mmcleanup/init/Kconfig linux-2.6.10-rc2-mm3-shmem/init/Kconfig
--- linux-2.6.10-rc2-mm3-mmcleanup/init/Kconfig	2004-11-22 10:54:17.000000000 +0000
+++ linux-2.6.10-rc2-mm3-shmem/init/Kconfig	2004-12-01 17:07:36.000000000 +0000
@@ -380,6 +380,19 @@ config TINY_SHMEM
 	default !SHMEM
 	bool
 
+config ENHANCED_COMPOUND_PAGES
+	bool
+	default HUGETLB_PAGE || !MMU
+	help
+
+	  Enhance management of high-order pages by pointing the 2nd+ pages at
+	  the first. get_page() and put_page() then use the usage count on the
+	  first page to manage all the pages in the block.
+
+	  This is used when it might be necessary to access the intermediate
+	  pages of a block, such as ptrace() might under nommu of hugetlb
+	  conditions.
+
 menu "Loadable module support"
 
 config MODULES
diff -uNrp linux-2.6.10-rc2-mm3-mmcleanup/mm/hugetlb.c linux-2.6.10-rc2-mm3-shmem/mm/hugetlb.c
--- linux-2.6.10-rc2-mm3-mmcleanup/mm/hugetlb.c	2004-11-22 10:54:18.000000000 +0000
+++ linux-2.6.10-rc2-mm3-shmem/mm/hugetlb.c	2004-12-01 15:37:59.000000000 +0000
@@ -67,7 +67,7 @@ void free_huge_page(struct page *page)
 	BUG_ON(page_count(page));
 
 	INIT_LIST_HEAD(&page->lru);
-	page[1].mapping = NULL;
+	set_page_dtor(page, NULL);
 
 	spin_lock(&hugetlb_lock);
 	enqueue_huge_page(page);
@@ -87,7 +87,7 @@ struct page *alloc_huge_page(void)
 	}
 	spin_unlock(&hugetlb_lock);
 	set_page_count(page, 1);
-	page[1].mapping = (void *)free_huge_page;
+	set_page_dtor(page, free_huge_page);
 	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
 		clear_highpage(&page[i]);
 	return page;
diff -uNrp linux-2.6.10-rc2-mm3-mmcleanup/mm/page_alloc.c linux-2.6.10-rc2-mm3-shmem/mm/page_alloc.c
--- linux-2.6.10-rc2-mm3-mmcleanup/mm/page_alloc.c	2004-11-23 16:13:04.000000000 +0000
+++ linux-2.6.10-rc2-mm3-shmem/mm/page_alloc.c	2004-12-02 14:02:37.000000000 +0000
@@ -80,15 +80,61 @@ static int bad_range(struct zone *zone, 
 	return 0;
 }
 
-static void bad_page(const char *function, struct page *page)
+static inline void __bad_page(struct page *page)
 {
-	printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n",
-		function, current->comm, page);
-	printk(KERN_EMERG "flags:0x%0*lx mapping:%p mapcount:%d count:%d\n",
-		(int)(2*sizeof(page_flags_t)), (unsigned long)page->flags,
-		page->mapping, page_mapcount(page), page_count(page));
+	const char *fmt;
+
+	if (sizeof(void *) == 4)
+		fmt = KERN_EMERG "%08lx %p %08x %p %8x %8x %8lx %8lx\n";
+	else
+		fmt = KERN_EMERG "%016lx %p %08x %p %8x %8x %16lx %16lx\n";
+
+	printk(fmt,
+	       page_to_pfn(page),
+	       page,
+	       (unsigned) page->flags,
+	       page->mapping, page_mapcount(page), page_count(page),
+	       page->index, page->private);
+}
+
+static void bad_page(const char *function, struct page *page,
+		     struct page *page0, int order)
+{
+	printk(KERN_EMERG "\n");
+	printk(KERN_EMERG
+	       "Bad page state at %s (in process '%s', order %d)\n",
+	       function, current->comm, order);
+
+	if (sizeof(void *) == 4) {
+		printk(KERN_EMERG
+		       "PFN      PAGE*    FLAGS    MAPPING  MAPCOUNT COUNT    INDEX    PRIVATE\n");
+		printk(KERN_EMERG
+		       "======== ======== ======== ======== ======== ======== ======== ========\n");
+	}
+	else {
+		printk(KERN_EMERG
+		       "PFN              PAGE*            FLAGS    MAPPING          MAPCOUNT COUNT    INDEX            PRIVATE\n");
+		printk(KERN_EMERG
+		       "================ ================ ======== ================ ======== ======== ================ ================\n");
+	}
+
+	/* print extra details on a compound page */
+	if (PageCompound(page0)) {
+		__bad_page(page0);
+		__bad_page(page0 + 1);
+
+		if (page > page0 + 1) {
+			if (page > page0 + 2)
+				printk(KERN_EMERG "...\n");
+			__bad_page(page);
+		}
+	} else {
+		__bad_page(page);
+	}
+
 	printk(KERN_EMERG "Backtrace:\n");
 	dump_stack();
+
 	printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n");
 	page->flags &= ~(1 << PG_private	|
 			1 << PG_locked	|
@@ -103,82 +149,6 @@ static void bad_page(const char *functio
 	tainted |= TAINT_BAD_PAGE;
 }
 
-void set_page_refs(struct page *page, int order)
-{
-#ifdef CONFIG_MMU
-	set_page_count(page, 1);
-#else
-	int i;
-
-	/*
-	 * We need to reference all the pages for this order, otherwise if
-	 * anyone accesses one of the pages with (get/put) it will be freed.
-	 * - eg: access_process_vm()
-	 */
-	for (i = 0; i < (1 << order); i++)
-		set_page_count(page + i, 1);
-#endif /* CONFIG_MMU */
-}
-
-#ifndef CONFIG_HUGETLB_PAGE
-#define prep_compound_page(page, order) do { } while (0)
-#define destroy_compound_page(page, order) do { } while (0)
-#else
-/*
- * Higher-order pages are called "compound pages".  They are structured thusly:
- *
- * The first PAGE_SIZE page is called the "head page".
- *
- * The remaining PAGE_SIZE pages are called "tail pages".
- *
- * All pages have PG_compound set.  All pages have their ->private pointing at
- * the head page (even the head page has this).
- *
- * The first tail page's ->mapping, if non-zero, holds the address of the
- * compound page's put_page() function.
- *
- * The order of the allocation is stored in the first tail page's ->index
- * This is only for debug at present.  This usage means that zero-order pages
- * may not be compound.
- */
-static void prep_compound_page(struct page *page, unsigned long order)
-{
-	int i;
-	int nr_pages = 1 << order;
-
-	page[1].mapping = NULL;
-	page[1].index = order;
-	for (i = 0; i < nr_pages; i++) {
-		struct page *p = page + i;
-
-		SetPageCompound(p);
-		p->private = (unsigned long)page;
-	}
-}
-
-static void destroy_compound_page(struct page *page, unsigned long order)
-{
-	int i;
-	int nr_pages = 1 << order;
-
-	if (!PageCompound(page))
-		return;
-
-	if (page[1].index != order)
-		bad_page(__FUNCTION__, page);
-
-	for (i = 0; i < nr_pages; i++) {
-		struct page *p = page + i;
-
-		if (!PageCompound(p))
-			bad_page(__FUNCTION__, page);
-		if (p->private != (unsigned long)page)
-			bad_page(__FUNCTION__, page);
-		ClearPageCompound(p);
-	}
-}
-#endif		/* CONFIG_HUGETLB_PAGE */
-
 /*
  * function for dealing with page's order in buddy system.
  * zone->lock is already acquired when we use these.
@@ -201,6 +171,11 @@ static inline void rmv_page_order(struct
 	page->private = 0;
 }
 
+static inline void set_page_refs(struct page *page, int order)
+{
+	set_page_count(page, 1);
+}
+
 /*
  * This function checks whether a page is free && is the buddy
  * we can do coalesce a page and its buddy if
@@ -221,6 +196,93 @@ static inline int page_is_buddy(struct p
 }
 
 /*
+ * validate a page that's being handed back for recycling
+ */
+static
+void free_pages_check_compound(const char *function, struct page *page, int order)
+{
+	struct page *xpage;
+	int i;
+
+	xpage = page;
+
+	if (unlikely(order == 0 ||
+		     PageCompoundSlave(page)
+		     ))
+		goto badpage;
+
+	xpage++;
+	if (unlikely(xpage->index != order))
+		goto badpage;
+
+	for (i = (1 << order) - 1; i > 0; i--) {
+		if (unlikely(!PageCompound(xpage) ||
+			     !PageCompoundSlave(xpage) ||
+			     (xpage->flags & (
+				     1 << PG_lru	|
+				     1 << PG_private	|
+				     1 << PG_locked	|
+				     1 << PG_active	|
+				     1 << PG_reclaim	|
+				     1 << PG_slab	|
+				     1 << PG_swapcache	|
+				     1 << PG_writeback
+				     )) ||
+			     page_count(xpage) != 0 ||
+			     page_mapped(xpage) ||
+			     xpage->mapping != NULL ||
+			     xpage->private != (unsigned long) page
+			     ))
+			goto badpage;
+
+		if (PageDirty(xpage))
+			ClearPageDirty(xpage);
+		xpage++;
+	}
+
+	return;
+
+ badpage:
+	bad_page(function, xpage, page, order);
+	return;
+}
+
+static inline
+void free_pages_check(const char *function, struct page *page, int order)
+{
+	if (unlikely(
+		page_mapped(page) ||
+		page->mapping != NULL ||
+		page_count(page) != 0 ||
+		(page->flags & (
+			1 << PG_lru	|
+			1 << PG_private |
+			1 << PG_locked	|
+			1 << PG_active	|
+			1 << PG_reclaim	|
+			1 << PG_slab	|
+			1 << PG_swapcache |
+			1 << PG_writeback ))
+		))
+		goto badpage;
+
+	/* check that compound pages are correctly assembled */
+	if (unlikely(PageCompound(page)))
+		free_pages_check_compound(function, page, order);
+	else if (unlikely(order > 0))
+		goto badpage;
+
+	if (PageDirty(page))
+		ClearPageDirty(page);
+
+	return;
+
+ badpage:
+	bad_page(function, page, page, order);
+	return;
+}
+
+/*
  * Freeing function for a buddy system allocator.
  *
  * The concept of a buddy system is to maintain direct-mapped table
@@ -251,8 +313,14 @@ static inline void __free_pages_bulk (st
 	struct page *coalesced;
 	int order_size = 1 << order;
 
-	if (unlikely(order))
-		destroy_compound_page(page, order);
+	if (unlikely(PageCompound(page))) {
+		struct page *xpage = page;
+		int i;
+
+		for (i = (1 << order); i > 0; i--)
+			(xpage++)->flags &=
+				~(1 << PG_compound | 1 << PG_compound_slave);
+	}
 
 	page_idx = page - base;
 
@@ -285,25 +353,6 @@ static inline void __free_pages_bulk (st
 	zone->free_area[order].nr_free++;
 }
 
-static inline void free_pages_check(const char *function, struct page *page)
-{
-	if (	page_mapped(page) ||
-		page->mapping != NULL ||
-		page_count(page) != 0 ||
-		(page->flags & (
-			1 << PG_lru	|
-			1 << PG_private |
-			1 << PG_locked	|
-			1 << PG_active	|
-			1 << PG_reclaim	|
-			1 << PG_slab	|
-			1 << PG_swapcache |
-			1 << PG_writeback )))
-		bad_page(function, page);
-	if (PageDirty(page))
-		ClearPageDirty(page);
-}
-
 /*
  * Frees a list of pages.
  * Assumes all pages on list are in same zone, and of same order.
@@ -341,20 +390,12 @@ free_pages_bulk(struct zone *zone, int c
 void __free_pages_ok(struct page *page, unsigned int order)
 {
 	LIST_HEAD(list);
-	int i;
 
 	arch_free_page(page, order);
 
 	mod_page_state(pgfree, 1 << order);
 
-#ifndef CONFIG_MMU
-	if (order > 0)
-		for (i = 1 ; i < (1 << order) ; ++i)
-			__put_page(page + i);
-#endif
-
-	for (i = 0 ; i < (1 << order) ; ++i)
-		free_pages_check(__FUNCTION__, page + i);
+	free_pages_check(__FUNCTION__, page, order);
 	list_add(&page->lru, &list);
 	kernel_map_pages(page, 1 << order, 0);
 	free_pages_bulk(page_zone(page), 1, &list, order);
@@ -419,25 +460,57 @@ expand(struct zone *zone, struct page *p
 /*
  * This page is about to be returned from the page allocator
  */
-static void prep_new_page(struct page *page, int order)
+static void prep_new_page(struct page *page, unsigned int gfp_mask, int order,
+			  int check)
 {
-	if (page->mapping || page_mapped(page) ||
-	    (page->flags & (
-			1 << PG_private	|
-			1 << PG_locked	|
-			1 << PG_lru	|
-			1 << PG_active	|
-			1 << PG_dirty	|
-			1 << PG_reclaim	|
-			1 << PG_swapcache |
-			1 << PG_writeback )))
-		bad_page(__FUNCTION__, page);
+	page_flags_t pgflags = page->flags;
+
+	/* check the struct page hasn't become corrupted */
+	if (check) {
+		if (page->mapping || page_mapped(page) ||
+		    (pgflags & (
+			    1 << PG_private	|
+			    1 << PG_locked	|
+			    1 << PG_lru	|
+			    1 << PG_active	|
+			    1 << PG_dirty	|
+			    1 << PG_reclaim	|
+			    1 << PG_swapcache |
+			    1 << PG_writeback |
+			    1 << PG_compound |
+			    1 << PG_compound_slave)))
+			bad_page(__FUNCTION__, page, page, order);
+	}
+
+	pgflags &= ~(1 << PG_uptodate | 1 << PG_error |
+		     1 << PG_referenced | 1 << PG_arch_1 |
+		     1 << PG_checked | 1 << PG_mappedtodisk);
 
-	page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
-			1 << PG_referenced | 1 << PG_arch_1 |
-			1 << PG_checked | 1 << PG_mappedtodisk);
 	page->private = 0;
+
+	/* set the refcount on the page */
 	set_page_refs(page, order);
+
+	/* if requested, mark a high-order allocation as being a compound page
+	 * and store high-order page metadata on the second page */
+	if (order > 0 && gfp_mask & __GFP_COMP) {
+		struct page *xpage;
+		int i;
+
+		pgflags |= 1 << PG_compound;
+
+		page[1].index = order;
+		page[1].mapping = NULL; /* no destructor yet */
+
+		xpage = page + 1;
+		for (i = (1 << order) - 1; i > 0; i--) {
+			xpage->flags |= 1 << PG_compound | 1 << PG_compound_slave;
+			xpage->private = (unsigned long) page;
+			xpage++;
+		}
+	}
+
+	page->flags = pgflags;
 }
 
 /*
@@ -589,7 +662,7 @@ void fastcall free_hot_cold_page(struct 
 	inc_page_state(pgfree);
 	if (PageAnon(page))
 		page->mapping = NULL;
-	free_pages_check(__FUNCTION__, page);
+	free_pages_check(__FUNCTION__, page, 0);
 	pcp = &zone->pageset[get_cpu()].pcp[cold];
 	local_irq_save(flags);
 	if (pcp->count >= pcp->high)
@@ -708,11 +781,11 @@ perthread_pages_alloc(void)
  */
 
 static struct page *
-buffered_rmqueue(struct zone *zone, int order, int gfp_flags)
+buffered_rmqueue(struct zone *zone, int order, unsigned int gfp_mask)
 {
 	unsigned long flags;
 	struct page *page = NULL;
-	int cold = !!(gfp_flags & __GFP_COLD);
+	int cold = !!(gfp_mask & __GFP_COLD);
 
 	if (order == 0) {
 		struct per_cpu_pages *pcp;
@@ -740,9 +813,7 @@ buffered_rmqueue(struct zone *zone, int 
 	if (page != NULL) {
 		BUG_ON(bad_range(zone, page));
 		mod_page_state_zone(zone, pgalloc, 1 << order);
-		prep_new_page(page, order);
-		if (order && (gfp_flags & __GFP_COMP))
-			prep_compound_page(page, order);
+		prep_new_page(page, gfp_mask, order, 1);
 	}
 	return page;
 }
@@ -1003,23 +1074,24 @@ fastcall void free_pages(unsigned long a
 
 EXPORT_SYMBOL(free_pages);
 
-#ifdef CONFIG_HUGETLB_PAGE
-
-void put_page(struct page *page)
+#ifdef CONFIG_ENHANCED_COMPOUND_PAGES
+fastcall void put_page(struct page *page)
 {
 	if (unlikely(PageCompound(page))) {
-		page = (struct page *)page->private;
+		page = (struct page *) page->private;
 		if (put_page_testzero(page)) {
-			void (*dtor)(struct page *page);
+			page_dtor_t dtor;
 
-			dtor = (void (*)(struct page *))page[1].mapping;
+			dtor = (page_dtor_t) page[1].mapping;
 			(*dtor)(page);
 		}
 		return;
 	}
-	if (!PageReserved(page) && put_page_testzero(page))
+
+	if (likely(!PageReserved(page)) && put_page_testzero(page))
 		__page_cache_release(page);
 }
+
 EXPORT_SYMBOL(put_page);
 #endif
 
@@ -2258,3 +2330,39 @@ void *__init alloc_large_system_hash(con
 
 	return table;
 }
+
+/*
+ * split a compound page into an array of smaller chunks of a given order
+ */
+void split_compound_page(struct page *page, unsigned new_order)
+{
+	unsigned old_order, loop, stop, step;
+
+	old_order = compound_page_order(page);
+	if (old_order != new_order) {
+		BUG_ON(old_order < new_order);
+
+		stop = 1 << old_order;
+		step = 1 << new_order;
+		for (loop = 0; loop < stop; loop += step)
+			prep_new_page(page + loop, __GFP_COMP, new_order, 0);
+	}
+}
+
+/*
+ * split a high-order page into an array of smaller chunks of a given order
+ */
+void split_highorder_page(struct page *page, unsigned new_order,
+			  unsigned old_order)
+{
+	unsigned loop, stop, step;
+
+	if (old_order != new_order) {
+		BUG_ON(old_order < new_order);
+
+		stop = 1 << old_order;
+		step = 1 << new_order;
+		for (loop = 0; loop < stop; loop += step)
+			prep_new_page(page + loop, 0, new_order, 0);
+	}
+}
diff -uNrp linux-2.6.10-rc2-mm3-mmcleanup/mm/slab.c linux-2.6.10-rc2-mm3-shmem/mm/slab.c
--- linux-2.6.10-rc2-mm3-mmcleanup/mm/slab.c	2004-11-22 10:54:18.000000000 +0000
+++ linux-2.6.10-rc2-mm3-shmem/mm/slab.c	2004-12-01 15:49:28.000000000 +0000
@@ -873,7 +873,7 @@ static void *kmem_getpages(kmem_cache_t 
 	void *addr;
 	int i;
 
-	flags |= cachep->gfpflags;
+	flags |= cachep->gfpflags | __GFP_COMP;
 	if (likely(nodeid == -1)) {
 		addr = (void*)__get_free_pages(flags, cachep->gfporder);
 		if (!addr)


* [PATCH 4/5] NOMMU: Make POSIX shmem work on ramfs-backed files
  2004-12-09 15:08 [PATCH 1/5] NOMMU: MM cleanups dhowells
@ 2004-12-09 15:08 ` dhowells
  2004-12-10  3:08   ` Ingo Oeser
  2004-12-10 14:28   ` David Howells
  2004-12-09 15:08 ` [PATCH 5/5] NOMMU: Further nommu shared memory support dhowells
                   ` (2 subsequent siblings)
  3 siblings, 2 replies; 11+ messages in thread
From: dhowells @ 2004-12-09 15:08 UTC (permalink / raw)
  To: akpm, davidm, gerg, wli; +Cc: linux-kernel, uclinux-dev

The attached patch makes ramfs able to support POSIX shared memory under !MMU
conditions. It does this by:

 (1) Intercept file expansion from zero-size by truncation. If this happens,
     ramfs will attempt to allocate a high-order page sufficient to hold the
     entire file. If successful, it'll split the high-order page into an array
     of single pages and attach those that it needs to the inode. Any excess
     will be returned to the allocator.

     If unsuccessful, the operation will return ENOMEM or EFBIG in addition
     to the normal error returns for truncation.

     The assumption is that, should this succeed, an mmap() of the size given
     will follow.

 (2) Prevent file contraction from leaving VM_SHARED VMAs dangling in midair.
     If a contraction would do this, error ETXTBSY is returned instead. It's
     not quite the right error, but it'll probably do.

 (3) get_unmapped_area() is now provided on ramfs files if !MMU. If a shared
     mapping is requested, this checks that the pages required are all present
     in the file and that they're all contiguous. The address of the
     appropriate start page in the file is returned if they are; ENOMEM is
     returned if not.

The page attachment mentioned in (1) could be done by get_unmapped_area()
instead if mapping->nrpages is zero; however, this would mean that the open(),
ftruncate(), write(), mmap() sequence wouldn't work. There's no ideal solution
since there's effectively no MMU.

What (1) allows is for data passed to write() calls made after the truncation
to appear in the memory returned by mmap() whilst it is mapped.

This is sufficient to allow POSIX shared memory on ramfs.
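
For illustration only (not part of the patch), the userspace sequence this is
intended to support might look like the sketch below. The mount point, file
name and sizes are hypothetical, and error handling is minimal:

	/* expand a zero-size ramfs-backed file by ftruncate(), write to it,
	 * then map it shared; assumes a ramfs mount at /mnt/ramfs */
	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/mman.h>
	#include <unistd.h>

	int main(void)
	{
		void *p;
		int fd;

		fd = open("/mnt/ramfs/example", O_RDWR | O_CREAT, 0600);
		if (fd < 0) { perror("open"); return 1; }

		/* expansion from zero size: ramfs attaches the pages here */
		if (ftruncate(fd, 65536) < 0) { perror("ftruncate"); return 1; }

		/* data written after the truncation... */
		if (write(fd, "hello", 5) != 5) { perror("write"); return 1; }

		/* ...should be visible through the shared mapping */
		p = mmap(NULL, 65536, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
		if (p == MAP_FAILED) { perror("mmap"); return 1; }

		printf("%.5s\n", (char *) p);
		munmap(p, 65536);
		close(fd);
		return 0;
	}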

Signed-Off-By: dhowells@redhat.com
---
diffstat nommu-ramfs-2610rc2mm3-3.diff
 fs/ramfs/Makefile     |    8 +
 fs/ramfs/file-mmu.c   |   57 +++++++++
 fs/ramfs/file-nommu.c |  295 ++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ramfs/inode.c      |   22 ---
 fs/ramfs/internal.h   |   15 ++
 include/linux/ramfs.h |   10 +
 6 files changed, 385 insertions(+), 22 deletions(-)

diff -uNrp linux-2.6.10-rc2-mm3-mmcleanup/fs/ramfs/file-mmu.c linux-2.6.10-rc2-mm3-shmem/fs/ramfs/file-mmu.c
--- linux-2.6.10-rc2-mm3-mmcleanup/fs/ramfs/file-mmu.c	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.10-rc2-mm3-shmem/fs/ramfs/file-mmu.c	2004-11-26 13:58:54.000000000 +0000
@@ -0,0 +1,57 @@
+/* file-mmu.c: ramfs MMU-based file operations
+ *
+ * Resizable simple ram filesystem for Linux.
+ *
+ * Copyright (C) 2000 Linus Torvalds.
+ *               2000 Transmeta Corp.
+ *
+ * Usage limits added by David Gibson, Linuxcare Australia.
+ * This file is released under the GPL.
+ */
+
+/*
+ * NOTE! This filesystem is probably most useful
+ * not as a real filesystem, but as an example of
+ * how virtual filesystems can be written.
+ *
+ * It doesn't get much simpler than this. Consider
+ * that this file implements the full semantics of
+ * a POSIX-compliant read-write filesystem.
+ *
+ * Note in particular how the filesystem does not
+ * need to implement any data structures of its own
+ * to keep track of the virtual data: using the VFS
+ * caches is sufficient.
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/backing-dev.h>
+#include <linux/ramfs.h>
+
+#include <asm/uaccess.h>
+#include "internal.h"
+
+struct address_space_operations ramfs_aops = {
+	.readpage	= simple_readpage,
+	.prepare_write	= simple_prepare_write,
+	.commit_write	= simple_commit_write
+};
+
+struct file_operations ramfs_file_operations = {
+	.read		= generic_file_read,
+	.write		= generic_file_write,
+	.mmap		= generic_file_mmap,
+	.fsync		= simple_sync_file,
+	.sendfile	= generic_file_sendfile,
+	.llseek		= generic_file_llseek,
+};
+
+struct inode_operations ramfs_file_inode_operations = {
+	.getattr	= simple_getattr,
+};
diff -uNrp linux-2.6.10-rc2-mm3-mmcleanup/fs/ramfs/file-nommu.c linux-2.6.10-rc2-mm3-shmem/fs/ramfs/file-nommu.c
--- linux-2.6.10-rc2-mm3-mmcleanup/fs/ramfs/file-nommu.c	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.10-rc2-mm3-shmem/fs/ramfs/file-nommu.c	2004-12-07 20:30:01.000000000 +0000
@@ -0,0 +1,295 @@
+/* file-nommu.c: no-MMU version of ramfs
+ *
+ * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/backing-dev.h>
+#include <linux/ramfs.h>
+#include <linux/quotaops.h>
+#include <linux/pagevec.h>
+#include <linux/mman.h>
+
+#include <asm/uaccess.h>
+#include "internal.h"
+
+static int ramfs_nommu_setattr(struct dentry *, struct iattr *);
+
+struct address_space_operations ramfs_aops = {
+	.readpage		= simple_readpage,
+	.prepare_write		= simple_prepare_write,
+	.commit_write		= simple_commit_write
+};
+
+struct file_operations ramfs_file_operations = {
+	.mmap			= ramfs_nommu_mmap,
+	.get_unmapped_area	= ramfs_nommu_get_unmapped_area,
+	.read			= generic_file_read,
+	.write			= generic_file_write,
+	.fsync			= simple_sync_file,
+	.sendfile		= generic_file_sendfile,
+	.llseek			= generic_file_llseek,
+};
+
+struct inode_operations ramfs_file_inode_operations = {
+	.setattr		= ramfs_nommu_setattr,
+	.getattr		= simple_getattr,
+};
+
+/*****************************************************************************/
+/*
+ * add a contiguous set of pages into a ramfs inode when it's truncated from
+ * size 0 on the assumption that it's going to be used for an mmap of shared
+ * memory
+ */
+static int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
+{
+	struct pagevec lru_pvec;
+	unsigned long npages, xpages, loop, limit;
+	struct page *pages;
+	unsigned order;
+	void *data;
+	int ret;
+
+	/* make various checks */
+	order = get_order(newsize);
+	if (unlikely(order >= MAX_ORDER))
+		goto too_big;
+
+	limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+	if (limit != RLIM_INFINITY && newsize > limit)
+		goto fsize_exceeded;
+
+	if (newsize > inode->i_sb->s_maxbytes)
+		goto too_big;
+
+	i_size_write(inode, newsize);
+
+	/* allocate enough contiguous pages to be able to satisfy the
+	 * request */
+	pages = alloc_pages(mapping_gfp_mask(inode->i_mapping), order);
+	if (!pages)
+		return -ENOMEM;
+
+	/* split the high-order page into an array of single pages */
+	split_highorder_page(pages, 0, order);
+
+	xpages = 1UL << order;
+	npages = (newsize + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+	/* trim off any pages we don't actually require */
+	for (loop = npages; loop < xpages; loop++)
+		__free_page(pages + loop);
+
+	/* clear the memory we allocated */
+	newsize = PAGE_SIZE * npages;
+	data = page_address(pages);
+	memset(data, 0, newsize);
+
+	/* attach all the pages to the inode's address space */
+	pagevec_init(&lru_pvec, 0);
+	for (loop = 0; loop < npages; loop++) {
+		struct page *page = pages + loop;
+
+		page->index = loop;
+
+		ret = add_to_page_cache(page, inode->i_mapping, loop, GFP_KERNEL);
+		if (ret < 0)
+			goto add_error;
+
+		if (!pagevec_add(&lru_pvec, page))
+			__pagevec_lru_add(&lru_pvec);
+
+		unlock_page(page);
+	}
+
+	pagevec_lru_add(&lru_pvec);
+	return 0;
+
+ fsize_exceeded:
+	send_sig(SIGXFSZ, current, 0);
+ too_big:
+	return -EFBIG;
+
+ add_error:
+	page_cache_release(pages + loop);
+	for (loop++; loop < npages; loop++)
+		__free_page(pages + loop);
+	return ret;
+}
+
+/*****************************************************************************/
+/*
+ * check that file shrinkage doesn't leave any VMAs dangling in midair
+ */
+static int ramfs_nommu_check_mappings(struct inode *inode,
+				      size_t newsize, size_t size)
+{
+	struct vm_area_struct *vma;
+	struct prio_tree_iter iter;
+
+	/* search for VMAs that fall within the dead zone */
+	vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap,
+			      newsize >> PAGE_SHIFT,
+			      (size + PAGE_SIZE - 1) >> PAGE_SHIFT
+			      ) {
+		/* found one - only interested if it's shared out of the page
+		 * cache */
+		if (vma->vm_flags & VM_SHARED)
+			return -ETXTBSY; /* not quite true, but near enough */
+	}
+
+	return 0;
+}
+
+/*****************************************************************************/
+/*
+ * handle a request to resize the file
+ */
+static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size)
+{
+	int ret;
+
+	/* assume a truncate from zero size is going to be for the purposes of
+	 * shared mmap */
+	if (size == 0) {
+		if (unlikely(newsize >> 32))
+			return -EFBIG;
+
+		return ramfs_nommu_expand_for_mapping(inode, newsize);
+	}
+
+	/* check that a decrease in size doesn't cut off any shared mappings */
+	if (newsize < size) {
+		ret = ramfs_nommu_check_mappings(inode, newsize, size);
+		if (ret < 0)
+			return ret;
+	}
+
+	ret = vmtruncate(inode, newsize);
+
+	return ret;
+}
+
+/*****************************************************************************/
+/*
+ * handle a change of attributes
+ * - we're specifically interested in a change of size
+ */
+static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia)
+{
+	struct inode *inode = dentry->d_inode;
+	unsigned int old_ia_valid = ia->ia_valid;
+	int ret = 0;
+
+	/* by providing our own setattr() method, we skip this quotaism */
+	if ((old_ia_valid & ATTR_UID && ia->ia_uid != inode->i_uid) ||
+	    (old_ia_valid & ATTR_GID && ia->ia_gid != inode->i_gid))
+		ret = DQUOT_TRANSFER(inode, ia) ? -EDQUOT : 0;
+
+	/* pick out size-changing events */
+	if (ia->ia_valid & ATTR_SIZE) {
+		loff_t size = i_size_read(inode);
+		if (ia->ia_size != size) {
+			ret = ramfs_nommu_resize(inode, ia->ia_size, size);
+			if (ret < 0 || ia->ia_valid == ATTR_SIZE)
+				goto out;
+		} else {
+			/* we skipped the truncate but must still update
+			 * timestamps
+			 */
+			ia->ia_valid |= ATTR_MTIME|ATTR_CTIME;
+		}
+	}
+
+	ret = inode_setattr(inode, ia);
+ out:
+	ia->ia_valid = old_ia_valid;
+	return ret;
+}
+
+/*****************************************************************************/
+/*
+ * try to determine where a shared mapping can be made
+ * - we require that:
+ *   - the pages to be mapped must exist
+ *   - the pages be physically contiguous in sequence
+ */
+unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
+					    unsigned long addr, unsigned long len,
+					    unsigned long pgoff, unsigned long flags)
+{
+	unsigned long maxpages, lpages, nr, loop, ret;
+	struct inode *inode = file->f_dentry->d_inode;
+	struct page **pages = NULL, **ptr, *page;
+	loff_t isize;
+
+	if (!(flags & MAP_SHARED))
+		return addr;
+
+	/* the mapping mustn't extend beyond the EOF */
+	lpages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	isize = i_size_read(inode);
+
+	ret = -EINVAL;
+	maxpages = (isize + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	if (pgoff >= maxpages)
+		goto out;
+
+	if (maxpages - pgoff < lpages)
+		goto out;
+
+	/* gang-find the pages */
+	ret = -ENOMEM;
+	pages = kmalloc(lpages * sizeof(struct page *), GFP_KERNEL);
+	if (!pages)
+		goto out;
+
+	memset(pages, 0, lpages * sizeof(struct page *));
+
+	nr = find_get_pages(inode->i_mapping, pgoff, lpages, pages);
+	if (nr != lpages)
+		goto out; /* leave if some pages were missing */
+
+	/* check the pages for physical adjacency */
+	ptr = pages;
+	page = *ptr++;
+	page++;
+	for (loop = lpages; loop > 1; loop--)
+		if (*ptr++ != page++)
+			goto out;
+
+	/* okay - all conditions fulfilled */
+	ret = (unsigned long) page_address(pages[0]);
+
+ out:
+	if (pages) {
+		ptr = pages;
+		for (loop = lpages; loop > 0; loop--)
+			put_page(*ptr++);
+		kfree(pages);
+	}
+
+	return ret;
+}
+
+/*****************************************************************************/
+/*
+ * set up a mapping
+ */
+int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	return 0;
+}
diff -uNrp linux-2.6.10-rc2-mm3-mmcleanup/fs/ramfs/inode.c linux-2.6.10-rc2-mm3-shmem/fs/ramfs/inode.c
--- linux-2.6.10-rc2-mm3-mmcleanup/fs/ramfs/inode.c	2004-10-19 10:42:09.000000000 +0100
+++ linux-2.6.10-rc2-mm3-shmem/fs/ramfs/inode.c	2004-11-26 13:58:54.000000000 +0000
@@ -34,13 +34,12 @@
 #include <linux/ramfs.h>
 
 #include <asm/uaccess.h>
+#include "internal.h"
 
 /* some random number */
 #define RAMFS_MAGIC	0x858458f6
 
 static struct super_operations ramfs_ops;
-static struct address_space_operations ramfs_aops;
-static struct inode_operations ramfs_file_inode_operations;
 static struct inode_operations ramfs_dir_inode_operations;
 
 static struct backing_dev_info ramfs_backing_dev_info = {
@@ -140,25 +139,6 @@ static int ramfs_symlink(struct inode * 
 	return error;
 }
 
-static struct address_space_operations ramfs_aops = {
-	.readpage	= simple_readpage,
-	.prepare_write	= simple_prepare_write,
-	.commit_write	= simple_commit_write
-};
-
-struct file_operations ramfs_file_operations = {
-	.read		= generic_file_read,
-	.write		= generic_file_write,
-	.mmap		= generic_file_mmap,
-	.fsync		= simple_sync_file,
-	.sendfile	= generic_file_sendfile,
-	.llseek		= generic_file_llseek,
-};
-
-static struct inode_operations ramfs_file_inode_operations = {
-	.getattr	= simple_getattr,
-};
-
 static struct inode_operations ramfs_dir_inode_operations = {
 	.create		= ramfs_create,
 	.lookup		= simple_lookup,
diff -uNrp linux-2.6.10-rc2-mm3-mmcleanup/fs/ramfs/internal.h linux-2.6.10-rc2-mm3-shmem/fs/ramfs/internal.h
--- linux-2.6.10-rc2-mm3-mmcleanup/fs/ramfs/internal.h	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.10-rc2-mm3-shmem/fs/ramfs/internal.h	2004-11-26 13:58:54.000000000 +0000
@@ -0,0 +1,15 @@
+/* internal.h: ramfs internal definitions
+ *
+ * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+
+extern struct address_space_operations ramfs_aops;
+extern struct file_operations ramfs_file_operations;
+extern struct inode_operations ramfs_file_inode_operations;
diff -uNrp linux-2.6.10-rc2-mm3-mmcleanup/fs/ramfs/Makefile linux-2.6.10-rc2-mm3-shmem/fs/ramfs/Makefile
--- linux-2.6.10-rc2-mm3-mmcleanup/fs/ramfs/Makefile	2004-06-18 13:41:28.000000000 +0100
+++ linux-2.6.10-rc2-mm3-shmem/fs/ramfs/Makefile	2004-11-26 15:36:07.000000000 +0000
@@ -4,4 +4,10 @@
 
 obj-$(CONFIG_RAMFS) += ramfs.o
 
-ramfs-objs := inode.o
+ifeq ($(CONFIG_MMU),y)
+ramfs-objs := file-mmu.o
+else
+ramfs-objs := file-nommu.o
+endif
+
+ramfs-objs += inode.o
diff -uNrp linux-2.6.10-rc2-mm3-mmcleanup/include/linux/ramfs.h linux-2.6.10-rc2-mm3-shmem/include/linux/ramfs.h
--- linux-2.6.10-rc2-mm3-mmcleanup/include/linux/ramfs.h	2004-10-19 10:42:17.000000000 +0100
+++ linux-2.6.10-rc2-mm3-shmem/include/linux/ramfs.h	2004-12-02 16:21:26.000000000 +0000
@@ -5,6 +5,16 @@ struct inode *ramfs_get_inode(struct sup
 struct super_block *ramfs_get_sb(struct file_system_type *fs_type,
 	 int flags, const char *dev_name, void *data);
 
+#ifndef CONFIG_MMU
+extern unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
+						   unsigned long addr,
+						   unsigned long len,
+						   unsigned long pgoff,
+						   unsigned long flags);
+
+extern int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma);
+#endif
+
 extern struct file_operations ramfs_file_operations;
 extern struct vm_operations_struct generic_file_vm_ops;
 


* Re: [PATCH 4/5] NOMMU: Make POSIX shmem work on ramfs-backed files
  2004-12-09 15:08 ` [PATCH 4/5] NOMMU: Make POSIX shmem work on ramfs-backed files dhowells
@ 2004-12-10  3:08   ` Ingo Oeser
  2004-12-10 14:28   ` David Howells
  1 sibling, 0 replies; 11+ messages in thread
From: Ingo Oeser @ 2004-12-10  3:08 UTC (permalink / raw)
  To: dhowells; +Cc: linux-kernel, uclinux-dev

You wrote:
> diff -uNrp linux-2.6.10-rc2-mm3-mmcleanup/fs/ramfs/Makefile
> linux-2.6.10-rc2-mm3-shmem/fs/ramfs/Makefile ---
> linux-2.6.10-rc2-mm3-mmcleanup/fs/ramfs/Makefile 2004-06-18
> 13:41:28.000000000 +0100 +++
> linux-2.6.10-rc2-mm3-shmem/fs/ramfs/Makefile 2004-11-26 15:36:07.000000000
> +0000 @@ -4,4 +4,10 @@
>
>  obj-$(CONFIG_RAMFS) += ramfs.o
>
> -ramfs-objs := inode.o
> +ifeq ($(CONFIG_MMU),y)
> +ramfs-objs := file-mmu.o
> +else
> +ramfs-objs := file-nommu.o
> +endif
> +
> +ramfs-objs += inode.o

What about this pattern instead:

file-mmu-y := file-mmu.o
file-mmu-n := file-nommu.o
file-mmu- := file-nommu.o
ramfs-objs += file-mmu-$(CONFIG_MMU)


It requires a little more work to write, but it removes the ifeq,
which should be avoided in makefiles wherever possible.
-- 
Ingo Oeser
axxeo GmbH
Tiestestr. 16, 30171 Hannover
Tel. +49-511-4753706
Fax. +49-511-4753716

mailto:support@axxeo.de



* Re: [PATCH 4/5] NOMMU: Make POSIX shmem work on ramfs-backed files
  2004-12-09 15:08 ` [PATCH 4/5] NOMMU: Make POSIX shmem work on ramfs-backed files dhowells
  2004-12-10  3:08   ` Ingo Oeser
@ 2004-12-10 14:28   ` David Howells
  2004-12-10 17:23     ` Ingo Oeser
  1 sibling, 1 reply; 11+ messages in thread
From: David Howells @ 2004-12-10 14:28 UTC (permalink / raw)
  To: linux-kernel; +Cc: uclinux-dev

Ingo Oeser <ioe@axxeo.de> wrote:

> What about this pattern instead:
> 
> file-mmu-y := file-mmu.o
> file-mmu-n := file-nommu.o
> file-mmu- := file-nommu.o
> ramfs-objs += file-mmu-$(CONFIG_MMU)
>
> Requires more work while writing it, but removes the ifeq, 
> which should be avoided in makefiles as hell

Your suggestion adds duplicate information. This solution is worse than the
thing you're trying to fix.

Do we really need both the file-mmu-n and file-mmu- variants?

Actually, this would probably do instead:

	file-mmu-y := file-nommu.o
	file-mmu-$(CONFIG_MMU) := file-mmu.o
	ramfs-objs := inode.o file-mmu-y

Will this work? Or should it be $(file-mmu-y) on the last line?

David


* Re: [PATCH 4/5] NOMMU: Make POSIX shmem work on ramfs-backed files
  2004-12-10 14:28   ` David Howells
@ 2004-12-10 17:23     ` Ingo Oeser
  0 siblings, 0 replies; 11+ messages in thread
From: Ingo Oeser @ 2004-12-10 17:23 UTC (permalink / raw)
  To: David Howells; +Cc: linux-kernel

You wrote:
> Ingo Oeser <ioe@axxeo.de> wrote:
> Actually, this would probably do instead:
>
>  file-mmu-y := file-nommu.o
>  file-mmu-$(CONFIG_MMU) := file-mmu.o
>  ramfs-objs := inode.o file-mmu-y
>
> Will this work? Or should it be $(file-mmu-y) on the last line?

Yes, so actually this would cut it:

file-mmu-y := file-nommu.o
file-mmu-$(CONFIG_MMU) := file-mmu.o
ramfs-objs := inode.o $(file-mmu-y)

But you got the idea, so I'm happy already ;-)


Ingo Oeser



* Re: [PATCH 2/5] NOMMU: High-order page management overhaul
  2004-12-10 15:45     ` David Howells
  2004-12-10 21:01       ` Andrew Morton
@ 2004-12-13 16:32       ` David Howells
  1 sibling, 0 replies; 11+ messages in thread
From: David Howells @ 2004-12-13 16:32 UTC (permalink / raw)
  To: Andrew Morton; +Cc: davidm, gerg%snapgear.com, wli, linux-kernel, uclinux-dev


Andrew Morton <akpm@osdl.org> wrote:

> I think I was the original "use compound pages" culprit.

You were, but several other people have chimed in since.

> But when I realised that nommu needs access to fields in the sub-pages which
> are currently used for compound page metadata I withdrew into the "if what's
> there now works, stick with it" camp.

The nommu stuff only needs access to a flag or two (PG_compound or
PG_compound_slave) and the refcount. I don't believe that any of the stuff
that pins secondary pages for userspace's benefit cares about anything else.

And, apart from that, as far as kernel side code is concerned, high-order
pages should be dealt with as high-order pages, or they should be properly
split and used as arrays of pages.

> >  (2) Splitting high-order pages has to be done differently on MMU vs
> >      NOMMU.
> 
> Oh.  Why?

There are three cases of splitting that I can think of:

 (1) Split down to zero-order pages. I think this can be handled the same in
     both cases, since _every_ secondary page needs reinitialisation.

     Note that I'm ignoring the case of a secondary page already being
     pinned. That is one case where the old way is superior _ASSUMING_ the
     counts on the secondary pages are incremented, not just set to 1.

     However, if a high-order page is being split after being exposed to
     userspace, the driver writer probably deserves everything they get:-)

 (2) Split down to smaller high-order pages. If a driver doing this just
     reinitialises the first page of every chunk, it'll probably be okay,
     _provided_ it doesn't touch the secondary pages. If it does do that - say
     by initialising the size to zero - the whole thing is likely to explode.

 (3) Splitting compound pages. Obviously, if a driver requests a compound
     page, it should be able to handle dissociation into lower-order compound
     pages or zero-order pages. I'd argue that the core kernel should provide
     a function to do this.

So, case (2) is potentially problematical.
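
As an illustration of case (3) - a sketch only, not part of the patch, and
assuming <linux/mm.h> plus the split_compound_page() helper added above - a
driver wanting an array of independent pages might do something like this
(the function name is made up):

	static void example_split_to_single_pages(void)
	{
		struct page *block;
		int i;

		/* ask for an order-4 compound page */
		block = alloc_pages(GFP_KERNEL | __GFP_COMP, 4);
		if (!block)
			return;

		/* break it into sixteen independently refcounted pages */
		split_compound_page(block, 0);

		/* each page can now be pinned and freed on its own */
		for (i = 0; i < 16; i++)
			__free_page(block + i);
	}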

> The current code (which pins each subpage individually) seems robust
> enough.

Maybe.

> I assume that nommu will thenceforth simply treat the region as an
> array of zero-order pages.

That depends what you mean by "nommu". It's actually the common bits that
thenceforth treat high-order pages as individual pages, be they compound pages
from hugetlbfs, single pages from the page cache or high-order pages from the
slab allocator or alloc_pages().

> >  (5) Abstraction of some compound page related functions, including a way to
> >      make it more efficient to access the first page (PG_compound_slave).
> 
> If there is any way at all in which we can avoid consuming another page
> flag then we should do so.  There are various concepts (many zones,
> advanced page aging algorithms) which would be unfeasible if there are not
> several more bits available in ->flags.   And they continue to dribble away.

There is. We can move the current occupant of the compound-second struct
page's mapping into page[1].lru and stick a unique magic value in there.

	[mm/page_alloc.c]
	const char compound_page_slave_magic[4];

	[include/linux/mm.h]
	extern const char compound_page_slave_magic[];
	#define COMPOUND_PAGE_SLAVE_MAGIC \
		((struct address_space *) &compound_page_slave_magic[3])

	#define PageCompoundSlave(page) \
		((page)->mapping == COMPOUND_PAGE_SLAVE_MAGIC)

	#define SetPageCompoundSlave(page) \
	do { \
		BUG_ON((page)->mapping); \
		(page)->mapping = COMPOUND_PAGE_SLAVE_MAGIC; \
	} while(0)

	#define ClearPageCompoundSlave(page) \
	do { \
		BUG_ON(!PageCompoundSlave(page)); \
		(page)->mapping = NULL; \
	} while(0)

This would have a useful property of causing a misalignment exception
(assuming it's not the i386 arch) if someone tries to access the mapping.
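
A hypothetical helper (not in the patch; the name is made up) could then find
the head page without a PG_compound_slave bit, relying on the head-page
pointer the patch already stores in each secondary page's ->private field:

	static inline struct page *compound_page_head(struct page *page)
	{
		/* secondary pages carry a pointer to the head in ->private */
		if (PageCompoundSlave(page))
			page = (struct page *) page->private;
		return page;
	}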

Andrew Morton <akpm@osdl.org> wrote:

> But there's nothing actually *essential* here, is there?  No bugs are
> fixed?

Well, I feel it's more robust. I can't say that it _definitely_ fixes any
bugs, but I can see how they could happen.

> > I think the drivers need a good auditing too. A lot of them allocate
> > high-order pages for various uses, some for use as single units, and some
> > for use as arrays of pages.
> 
> I think an ARM driver is freeing zero-order pages within a higher-order
> page.  But as long as the driver didn't set __GFP_COMP then the higher
> order page is not compound, and that splitting treatment is appropriate.

I'd changed my patch to honour __GFP_COMP. However, such a driver should
probably be changed to call a splitting function in mm/page_alloc.c. This sort
of thing is definitely the territory of the master mm routines.

It might be worth adding a new allocator routine that takes arguments along
the lines of calloc() - so that you ask for 2^N pages of 2^M size. This would
allow the allocator to initialise everything correctly up front.
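
A hypothetical sketch of such an interface (neither the name nor the routine
exists; it simply combines alloc_pages() with split_highorder_page() from the
patch above):

	/* allocate 2^count_order chunks, each of 2^chunk_order pages; the
	 * combined order must still be below MAX_ORDER */
	static struct page *alloc_page_array(unsigned int gfp_mask,
					     unsigned int count_order,
					     unsigned int chunk_order)
	{
		struct page *block;

		block = alloc_pages(gfp_mask, count_order + chunk_order);
		if (block)
			split_highorder_page(block, chunk_order,
					     count_order + chunk_order);
		return block;
	}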

David


* Re: [PATCH 2/5] NOMMU: High-order page management overhaul
  2004-12-10 15:45     ` David Howells
@ 2004-12-10 21:01       ` Andrew Morton
  2004-12-13 16:32       ` David Howells
  1 sibling, 0 replies; 11+ messages in thread
From: Andrew Morton @ 2004-12-10 21:01 UTC (permalink / raw)
  To: David Howells; +Cc: davidm, gerg%snapgear.com, wli, linux-kernel, uclinux-dev

David Howells <dhowells@redhat.com> wrote:
>
> Andrew Morton <akpm@osdl.org> wrote:
> 
> > > The attached patch overhauls high-order page handling.
> >
> > This patch (which is actually twelve patches)
> 
> How did you work that one out? Just because there're twelve points in my list
> doesn't mean the patch can be split twelve ways. If you really want it
> dissociated into sub-patches, I'm sure I can do that, but not all the
> intermediate stages would be compilable and testable.

Of course, splitting the work into one-concept-per-patch would be a big help.

> > seems to be taking out old code and replacing it with new code for no
> > apparent reason.
> 
>  (1) I've been moaned at by a lot of people for:
> 
>      (a) #ifdefs in page_alloc.c... This gets rid of some of them, even if I
>        	 didn't add them.
> 
>      (b) The way page_alloc.c was handling page refcounting differently under
>      	 nommu conditions. All I did was to fix it, but it seems it's my
>      	 fault:-/ This fixes it to use compound pages "as [I] should've done
>      	 in the first place".

I think I was the original "use compound pages" culprit.  But when I
realised that nommu needs access to fields in the sub-pages which are
currently used for compound page metadata I withdrew into the "if what's
there now works, stick with it" camp.

>  (2) Splitting high-order pages has to be done differently on MMU vs
>      NOMMU.

Oh.  Why?

> Part of this makes it simpler by providing convenience functions
>      for the job.
> 
>  (3) More robust nommu high-order page handling. I'm wary of the current way
> the individual secondary pages of a high-order page are handled in nommu
>      conditions. I can see ways it can go wrong all too easily (the existence
>      of the whole thing is contingent on the count on the first page, but
>      pinning the secondary pages doesn't affect that).

The current code (which pins each subpage individually) seems robust
enough.  I assume that nommu will thenceforth simply treat the region as an
array of zero-order pages.

>  (4) Making it easier to debug problems with compound pages (bad_page
>      changes).
> 
>  (5) Abstraction of some compound page related functions, including a way to
>      make it more efficient to access the first page (PG_compound_slave).

If there is any way at all in which we can avoid consuming another page
flag then we should do so.  There are various concepts (many zones,
advanced page aging algorithms) which would be unfeasible if there are not
several more bits available in ->flags.   And they continue to dribble away.

> > I mean, what is the *objective* of doing all of this stuff?  What problems
> > does it cause if the patch is simply dropped???
> 
> Objectives? Well:
> 
>  (1) More robust high-order page handling in nommu conditions.
> 
>  (2) Use compound pages to achieve (1) as per the numerous suggestions.
> 
>  (3) Remove #ifdefs as per the numerous suggestions.

But there's nothing actually *essential* here, is there?  No bugs are
fixed?

> I think the drivers need a good auditing too. A lot of them allocate
> high-order pages for various uses, some for use as single units, and some for
> use as arrays of pages.

I think an ARM driver is freeing zero-order pages within a higher-order
page.  But as long as the driver didn't set __GFP_COMP then the higher
order page is not compound, and that splitting treatment is appropriate.



* Re: [PATCH 2/5] NOMMU: High-order page management overhaul
       [not found]   ` <200412082012.iB8KCTBK010123@warthog.cambridge.redhat.com>
@ 2004-12-10 15:45     ` David Howells
  2004-12-10 21:01       ` Andrew Morton
  2004-12-13 16:32       ` David Howells
  0 siblings, 2 replies; 11+ messages in thread
From: David Howells @ 2004-12-10 15:45 UTC (permalink / raw)
  To: Andrew Morton; +Cc: davidm, gerg%snapgear.com, wli, linux-kernel, uclinux-dev

Andrew Morton <akpm@osdl.org> wrote:

> > The attached patch overhauls high-order page handling.
>
> This patch (which is actually twelve patches)

How did you work that one out? Just because there're twelve points in my list
doesn't mean the patch can be split twelve ways. If you really want it
dissociated into sub-patches, I'm sure I can do that, but not all the
intermediate stages would be compilable and testable.

> seems to be taking out old code and replacing it with new code for no
> apparent reason.

 (1) I've been moaned at by a lot of people for:

     (a) #ifdefs in page_alloc.c... This gets rid of some of them, even if I
       	 didn't add them.

     (b) The way page_alloc.c was handling page refcounting differently under
     	 nommu conditions. All I did was to fix it, but it seems it's my
     	 fault:-/ This fixes it to use compound pages "as [I] should've done
     	 in the first place".

 (2) Splitting high-order pages has to be done differently on MMU vs
     NOMMU. Part of this makes it simpler by providing convenience functions
     for the job.

 (3) More robust nommu high-order page handling. I'm wary of the current way
     the individual secondary pages of a high-order page are handled in nommu
     conditions. I can see ways it can go wrong all too easily (the existence
     of the whole thing is contingent on the count on the first page, but
     pinning the secondary pages doesn't affect that).

 (4) Making it easier to debug problems with compound pages (bad_page
     changes).

 (5) Abstraction of some compound page related functions, including a way to
     make it more efficient to access the first page (PG_compound_slave).

> I mean, what is the *objective* of doing all of this stuff?  What problems
> does it cause if the patch is simply dropped???

Objectives? Well:

 (1) More robust high-order page handling in nommu conditions.

 (2) Use compound pages to achieve (1) as per the numerous suggestions.

 (3) Remove #ifdefs as per the numerous suggestions.

I think the drivers need a good auditing too. A lot of them allocate
high-order pages for various uses, some for use as single units, and some for
use as arrays of pages.

David


Thread overview: 11+ messages
2004-12-09 15:08 [PATCH 1/5] NOMMU: MM cleanups dhowells
2004-12-09 15:08 ` [PATCH 4/5] NOMMU: Make POSIX shmem work on ramfs-backed files dhowells
2004-12-10  3:08   ` Ingo Oeser
2004-12-10 14:28   ` David Howells
2004-12-10 17:23     ` Ingo Oeser
2004-12-09 15:08 ` [PATCH 5/5] NOMMU: Further nommu shared memory support dhowells
2004-12-09 15:08 ` [PATCH 3/5] NOMMU: mmap fixes and extensions dhowells
2004-12-09 15:08 ` [PATCH 2/5] NOMMU: High-order page management overhaul dhowells
     [not found] <20041209141718.6acec9ee.akpm@osdl.org>
     [not found] ` <7ad0b24c-4955-11d9-8e19-0002b3163499@redhat.com>
     [not found]   ` <200412082012.iB8KCTBK010123@warthog.cambridge.redhat.com>
2004-12-10 15:45     ` David Howells
2004-12-10 21:01       ` Andrew Morton
2004-12-13 16:32       ` David Howells
