linux-kernel.vger.kernel.org archive mirror
* [PATCH] 2.6.4-rc2-mm1: vm-split-active-lists
@ 2004-03-11  0:04 Nick Piggin
  2004-03-11 17:25 ` Marc-Christian Petersen
  0 siblings, 1 reply; 27+ messages in thread
From: Nick Piggin @ 2004-03-11  0:04 UTC (permalink / raw)
  To: linux-kernel, linux-mm; +Cc: Mike Fedyk, plate

[-- Attachment #1: Type: text/plain, Size: 45 bytes --]

Here are my updated patches rolled into one.


[-- Attachment #2: vm-split-active.patch --]
[-- Type: text/x-patch, Size: 28929 bytes --]

 linux-2.6-npiggin/arch/i386/mm/hugetlbpage.c    |    4 
 linux-2.6-npiggin/arch/ia64/mm/hugetlbpage.c    |    4 
 linux-2.6-npiggin/arch/ppc64/mm/hugetlbpage.c   |    4 
 linux-2.6-npiggin/arch/sparc64/mm/hugetlbpage.c |    4 
 linux-2.6-npiggin/include/linux/mm_inline.h     |   33 +++-
 linux-2.6-npiggin/include/linux/mmzone.h        |   28 ---
 linux-2.6-npiggin/include/linux/page-flags.h    |   50 +++---
 linux-2.6-npiggin/include/linux/swap.h          |    2 
 linux-2.6-npiggin/kernel/sysctl.c               |    9 -
 linux-2.6-npiggin/mm/page_alloc.c               |   26 +--
 linux-2.6-npiggin/mm/swap.c                     |   35 +++-
 linux-2.6-npiggin/mm/vmscan.c                   |  193 ++++++++++--------------
 12 files changed, 197 insertions(+), 195 deletions(-)

diff -puN arch/i386/mm/hugetlbpage.c~rollup arch/i386/mm/hugetlbpage.c
--- linux-2.6/arch/i386/mm/hugetlbpage.c~rollup	2004-03-11 10:59:25.000000000 +1100
+++ linux-2.6-npiggin/arch/i386/mm/hugetlbpage.c	2004-03-11 10:59:26.000000000 +1100
@@ -411,8 +411,8 @@ static void update_and_free_page(struct 
 	htlbzone_pages--;
 	for (j = 0; j < (HPAGE_SIZE / PAGE_SIZE); j++) {
 		map->flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
-				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
-				1 << PG_private | 1<< PG_writeback);
+				1 << PG_dirty | 1 << PG_active_mapped | 1 << PG_active_unmapped |
+				1 << PG_reserved | 1 << PG_private | 1<< PG_writeback);
 		set_page_count(map, 0);
 		map++;
 	}
diff -puN arch/ia64/mm/hugetlbpage.c~rollup arch/ia64/mm/hugetlbpage.c
--- linux-2.6/arch/ia64/mm/hugetlbpage.c~rollup	2004-03-11 10:59:25.000000000 +1100
+++ linux-2.6-npiggin/arch/ia64/mm/hugetlbpage.c	2004-03-11 10:59:26.000000000 +1100
@@ -431,8 +431,8 @@ void update_and_free_page(struct page *p
 	htlbzone_pages--;
 	for (j = 0; j < (HPAGE_SIZE / PAGE_SIZE); j++) {
 		map->flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
-				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
-				1 << PG_private | 1<< PG_writeback);
+				1 << PG_dirty | 1 << PG_active_mapped | 1 << PG_active_unmapped |
+				1 << PG_reserved | 1 << PG_private | 1<< PG_writeback);
 		set_page_count(map, 0);
 		map++;
 	}
diff -puN arch/ppc64/mm/hugetlbpage.c~rollup arch/ppc64/mm/hugetlbpage.c
--- linux-2.6/arch/ppc64/mm/hugetlbpage.c~rollup	2004-03-11 10:59:25.000000000 +1100
+++ linux-2.6-npiggin/arch/ppc64/mm/hugetlbpage.c	2004-03-11 10:59:26.000000000 +1100
@@ -800,8 +800,8 @@ static void split_and_free_hugepage(stru
 	htlbpage_total--;
 	for (j = 0; j < (HPAGE_SIZE / PAGE_SIZE); j++) {
 		map->flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
-				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
-				1 << PG_private | 1<< PG_writeback);
+				1 << PG_dirty | 1 << PG_active_mapped | 1 << PG_active_unmapped |
+				1 << PG_reserved | 1 << PG_private | 1<< PG_writeback);
 		set_page_count(map, 0);
 		map++;
 	}
diff -puN arch/sparc64/mm/hugetlbpage.c~rollup arch/sparc64/mm/hugetlbpage.c
--- linux-2.6/arch/sparc64/mm/hugetlbpage.c~rollup	2004-03-11 10:59:25.000000000 +1100
+++ linux-2.6-npiggin/arch/sparc64/mm/hugetlbpage.c	2004-03-11 10:59:26.000000000 +1100
@@ -365,8 +365,8 @@ static void update_and_free_page(struct 
 	htlbzone_pages--;
 	for (j = 0; j < (HPAGE_SIZE / PAGE_SIZE); j++) {
 		map->flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
-				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
-				1 << PG_private | 1<< PG_writeback);
+				1 << PG_dirty | 1 << PG_active_mapped | 1 << PG_active_unmapped |
+				1 << PG_reserved | 1 << PG_private | 1<< PG_writeback);
 		set_page_count(map, 0);
 		map++;
 	}
diff -puN include/linux/mm_inline.h~rollup include/linux/mm_inline.h
--- linux-2.6/include/linux/mm_inline.h~rollup	2004-03-11 10:59:25.000000000 +1100
+++ linux-2.6-npiggin/include/linux/mm_inline.h	2004-03-11 10:59:26.000000000 +1100
@@ -1,9 +1,16 @@
 
 static inline void
-add_page_to_active_list(struct zone *zone, struct page *page)
+add_page_to_active_mapped_list(struct zone *zone, struct page *page)
 {
-	list_add(&page->lru, &zone->active_list);
-	zone->nr_active++;
+	list_add(&page->lru, &zone->active_mapped_list);
+	zone->nr_active_mapped++;
+}
+
+static inline void
+add_page_to_active_unmapped_list(struct zone *zone, struct page *page)
+{
+	list_add(&page->lru, &zone->active_unmapped_list);
+	zone->nr_active_unmapped++;
 }
 
 static inline void
@@ -14,10 +21,17 @@ add_page_to_inactive_list(struct zone *z
 }
 
 static inline void
-del_page_from_active_list(struct zone *zone, struct page *page)
+del_page_from_active_mapped_list(struct zone *zone, struct page *page)
+{
+	list_del(&page->lru);
+	zone->nr_active_mapped--;
+}
+
+static inline void
+del_page_from_active_unmapped_list(struct zone *zone, struct page *page)
 {
 	list_del(&page->lru);
-	zone->nr_active--;
+	zone->nr_active_unmapped--;
 }
 
 static inline void
@@ -31,9 +45,12 @@ static inline void
 del_page_from_lru(struct zone *zone, struct page *page)
 {
 	list_del(&page->lru);
-	if (PageActive(page)) {
-		ClearPageActive(page);
-		zone->nr_active--;
+	if (PageActiveMapped(page)) {
+		ClearPageActiveMapped(page);
+		zone->nr_active_mapped--;
+	} else if (PageActiveUnmapped(page)) {
+		ClearPageActiveUnmapped(page);
+		zone->nr_active_unmapped--;
 	} else {
 		zone->nr_inactive--;
 	}
diff -puN include/linux/mmzone.h~rollup include/linux/mmzone.h
--- linux-2.6/include/linux/mmzone.h~rollup	2004-03-11 10:59:25.000000000 +1100
+++ linux-2.6-npiggin/include/linux/mmzone.h	2004-03-11 10:59:26.000000000 +1100
@@ -74,11 +74,14 @@ struct zone {
 	ZONE_PADDING(_pad1_)
 
 	spinlock_t		lru_lock;	
-	struct list_head	active_list;
+	struct list_head	active_mapped_list;
+	struct list_head	active_unmapped_list;
 	struct list_head	inactive_list;
-	atomic_t		nr_scan_active;
+	atomic_t		nr_scan_active_mapped;
+	atomic_t		nr_scan_active_unmapped;
 	atomic_t		nr_scan_inactive;
-	unsigned long		nr_active;
+	unsigned long		nr_active_mapped;
+	unsigned long		nr_active_unmapped;
 	unsigned long		nr_inactive;
 	int			all_unreclaimable; /* All pages pinned */
 	unsigned long		pages_scanned;	   /* since last reclaim */
@@ -86,25 +89,6 @@ struct zone {
 	ZONE_PADDING(_pad2_)
 
 	/*
-	 * prev_priority holds the scanning priority for this zone.  It is
-	 * defined as the scanning priority at which we achieved our reclaim
-	 * target at the previous try_to_free_pages() or balance_pgdat()
-	 * invokation.
-	 *
-	 * We use prev_priority as a measure of how much stress page reclaim is
-	 * under - it drives the swappiness decision: whether to unmap mapped
-	 * pages.
-	 *
-	 * temp_priority is used to remember the scanning priority at which
-	 * this zone was successfully refilled to free_pages == pages_high.
-	 *
-	 * Access to both these fields is quite racy even on uniprocessor.  But
-	 * it is expected to average out OK.
-	 */
-	int temp_priority;
-	int prev_priority;
-
-	/*
 	 * free areas of different sizes
 	 */
 	struct free_area	free_area[MAX_ORDER];
diff -puN include/linux/page-flags.h~rollup include/linux/page-flags.h
--- linux-2.6/include/linux/page-flags.h~rollup	2004-03-11 10:59:25.000000000 +1100
+++ linux-2.6-npiggin/include/linux/page-flags.h	2004-03-11 10:59:26.000000000 +1100
@@ -58,23 +58,25 @@
 
 #define PG_dirty	 	 4
 #define PG_lru			 5
-#define PG_active		 6
-#define PG_slab			 7	/* slab debug (Suparna wants this) */
+#define PG_active_mapped	 6
+#define PG_active_unmapped	 7
 
-#define PG_highmem		 8
-#define PG_checked		 9	/* kill me in 2.5.<early>. */
-#define PG_arch_1		10
-#define PG_reserved		11
-
-#define PG_private		12	/* Has something at ->private */
-#define PG_writeback		13	/* Page is under writeback */
-#define PG_nosave		14	/* Used for system suspend/resume */
-#define PG_chainlock		15	/* lock bit for ->pte_chain */
-
-#define PG_direct		16	/* ->pte_chain points directly at pte */
-#define PG_mappedtodisk		17	/* Has blocks allocated on-disk */
-#define PG_reclaim		18	/* To be reclaimed asap */
-#define PG_compound		19	/* Part of a compound page */
+#define PG_slab			 8	/* slab debug (Suparna wants this) */
+#define PG_highmem		 9
+#define PG_checked		10	/* kill me in 2.5.<early>. */
+#define PG_arch_1		11
+
+#define PG_reserved		12
+#define PG_private		13	/* Has something at ->private */
+#define PG_writeback		14	/* Page is under writeback */
+#define PG_nosave		15	/* Used for system suspend/resume */
+
+#define PG_chainlock		16	/* lock bit for ->pte_chain */
+#define PG_direct		17	/* ->pte_chain points directly at pte */
+#define PG_mappedtodisk		18	/* Has blocks allocated on-disk */
+#define PG_reclaim		19	/* To be reclaimed asap */
+
+#define PG_compound		20	/* Part of a compound page */
 
 
 /*
@@ -211,11 +213,17 @@ extern void get_full_page_state(struct p
 #define TestSetPageLRU(page)	test_and_set_bit(PG_lru, &(page)->flags)
 #define TestClearPageLRU(page)	test_and_clear_bit(PG_lru, &(page)->flags)
 
-#define PageActive(page)	test_bit(PG_active, &(page)->flags)
-#define SetPageActive(page)	set_bit(PG_active, &(page)->flags)
-#define ClearPageActive(page)	clear_bit(PG_active, &(page)->flags)
-#define TestClearPageActive(page) test_and_clear_bit(PG_active, &(page)->flags)
-#define TestSetPageActive(page) test_and_set_bit(PG_active, &(page)->flags)
+#define PageActiveMapped(page)		test_bit(PG_active_mapped, &(page)->flags)
+#define SetPageActiveMapped(page)	set_bit(PG_active_mapped, &(page)->flags)
+#define ClearPageActiveMapped(page)	clear_bit(PG_active_mapped, &(page)->flags)
+#define TestClearPageActiveMapped(page) test_and_clear_bit(PG_active_mapped, &(page)->flags)
+#define TestSetPageActiveMapped(page) test_and_set_bit(PG_active_mapped, &(page)->flags)
+
+#define PageActiveUnmapped(page)	test_bit(PG_active_unmapped, &(page)->flags)
+#define SetPageActiveUnmapped(page)	set_bit(PG_active_unmapped, &(page)->flags)
+#define ClearPageActiveUnmapped(page)	clear_bit(PG_active_unmapped, &(page)->flags)
+#define TestClearPageActiveUnmapped(page) test_and_clear_bit(PG_active_unmapped, &(page)->flags)
+#define TestSetPageActiveUnmapped(page) test_and_set_bit(PG_active_unmapped, &(page)->flags)
 
 #define PageSlab(page)		test_bit(PG_slab, &(page)->flags)
 #define SetPageSlab(page)	set_bit(PG_slab, &(page)->flags)
diff -puN include/linux/swap.h~rollup include/linux/swap.h
--- linux-2.6/include/linux/swap.h~rollup	2004-03-11 10:59:26.000000000 +1100
+++ linux-2.6-npiggin/include/linux/swap.h	2004-03-11 10:59:26.000000000 +1100
@@ -175,7 +175,7 @@ extern void swap_setup(void);
 /* linux/mm/vmscan.c */
 extern int try_to_free_pages(struct zone **, unsigned int, unsigned int);
 extern int shrink_all_memory(int);
-extern int vm_swappiness;
+extern int vm_mapped_page_cost;
 
 /* linux/mm/rmap.c */
 #ifdef CONFIG_MMU
diff -puN kernel/sysctl.c~rollup kernel/sysctl.c
--- linux-2.6/kernel/sysctl.c~rollup	2004-03-11 10:59:26.000000000 +1100
+++ linux-2.6-npiggin/kernel/sysctl.c	2004-03-11 10:59:26.000000000 +1100
@@ -621,6 +621,7 @@ static ctl_table kern_table[] = {
 /* Constants for minimum and maximum testing in vm_table.
    We use these as one-element integer vectors. */
 static int zero;
+static int one = 1;
 static int one_hundred = 100;
 
 
@@ -697,13 +698,13 @@ static ctl_table vm_table[] = {
 	},
 	{
 		.ctl_name	= VM_SWAPPINESS,
-		.procname	= "swappiness",
-		.data		= &vm_swappiness,
-		.maxlen		= sizeof(vm_swappiness),
+		.procname	= "mapped_page_cost",
+		.data		= &vm_mapped_page_cost,
+		.maxlen		= sizeof(vm_mapped_page_cost),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec_minmax,
 		.strategy	= &sysctl_intvec,
-		.extra1		= &zero,
+		.extra1		= &one,
 		.extra2		= &one_hundred,
 	},
 #ifdef CONFIG_HUGETLB_PAGE
diff -puN mm/page_alloc.c~rollup mm/page_alloc.c
--- linux-2.6/mm/page_alloc.c~rollup	2004-03-11 10:59:26.000000000 +1100
+++ linux-2.6-npiggin/mm/page_alloc.c	2004-03-11 10:59:26.000000000 +1100
@@ -81,7 +81,7 @@ static void bad_page(const char *functio
 	page->flags &= ~(1 << PG_private	|
 			1 << PG_locked	|
 			1 << PG_lru	|
-			1 << PG_active	|
+			1 << PG_active_mapped	|
 			1 << PG_dirty	|
 			1 << PG_writeback);
 	set_page_count(page, 0);
@@ -217,7 +217,8 @@ static inline void free_pages_check(cons
 			1 << PG_lru	|
 			1 << PG_private |
 			1 << PG_locked	|
-			1 << PG_active	|
+			1 << PG_active_mapped	|
+			1 << PG_active_unmapped	|
 			1 << PG_reclaim	|
 			1 << PG_slab	|
 			1 << PG_writeback )))
@@ -324,7 +325,8 @@ static void prep_new_page(struct page *p
 			1 << PG_private	|
 			1 << PG_locked	|
 			1 << PG_lru	|
-			1 << PG_active	|
+			1 << PG_active_mapped	|
+			1 << PG_active_unmapped	|
 			1 << PG_dirty	|
 			1 << PG_reclaim	|
 			1 << PG_writeback )))
@@ -818,7 +820,8 @@ unsigned int nr_used_zone_pages(void)
 	struct zone *zone;
 
 	for_each_zone(zone)
-		pages += zone->nr_active + zone->nr_inactive;
+		pages += zone->nr_active_mapped + zone->nr_active_unmapped
+			+ zone->nr_inactive;
 
 	return pages;
 }
@@ -955,7 +958,7 @@ void get_zone_counts(unsigned long *acti
 	*inactive = 0;
 	*free = 0;
 	for_each_zone(zone) {
-		*active += zone->nr_active;
+		*active += zone->nr_active_mapped + zone->nr_active_unmapped;
 		*inactive += zone->nr_inactive;
 		*free += zone->free_pages;
 	}
@@ -1068,7 +1071,7 @@ void show_free_areas(void)
 			K(zone->pages_min),
 			K(zone->pages_low),
 			K(zone->pages_high),
-			K(zone->nr_active),
+			K(zone->nr_active_mapped + zone->nr_active_unmapped),
 			K(zone->nr_inactive),
 			K(zone->present_pages)
 			);
@@ -1408,8 +1411,6 @@ static void __init free_area_init_core(s
 		zone->zone_pgdat = pgdat;
 		zone->free_pages = 0;
 
-		zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
-
 		/*
 		 * The per-cpu-pages pools are set to around 1000th of the
 		 * size of the zone.  But no more than 1/4 of a meg - there's
@@ -1443,11 +1444,14 @@ static void __init free_area_init_core(s
 		}
 		printk("  %s zone: %lu pages, LIFO batch:%lu\n",
 				zone_names[j], realsize, batch);
-		INIT_LIST_HEAD(&zone->active_list);
+		INIT_LIST_HEAD(&zone->active_mapped_list);
+		INIT_LIST_HEAD(&zone->active_unmapped_list);
 		INIT_LIST_HEAD(&zone->inactive_list);
-		atomic_set(&zone->nr_scan_active, 0);
+		atomic_set(&zone->nr_scan_active_mapped, 0);
+		atomic_set(&zone->nr_scan_active_unmapped, 0);
 		atomic_set(&zone->nr_scan_inactive, 0);
-		zone->nr_active = 0;
+		zone->nr_active_mapped = 0;
+		zone->nr_active_unmapped = 0;
 		zone->nr_inactive = 0;
 		if (!size)
 			continue;
diff -puN mm/swap.c~rollup mm/swap.c
--- linux-2.6/mm/swap.c~rollup	2004-03-11 10:59:26.000000000 +1100
+++ linux-2.6-npiggin/mm/swap.c	2004-03-11 10:59:26.000000000 +1100
@@ -58,14 +58,18 @@ int rotate_reclaimable_page(struct page 
 		return 1;
 	if (PageDirty(page))
 		return 1;
-	if (PageActive(page))
+	if (PageActiveMapped(page))
+		return 1;
+	if (PageActiveUnmapped(page))
 		return 1;
 	if (!PageLRU(page))
 		return 1;
 
 	zone = page_zone(page);
 	spin_lock_irqsave(&zone->lru_lock, flags);
-	if (PageLRU(page) && !PageActive(page)) {
+	if (PageLRU(page)
+		&& !PageActiveMapped(page) && !PageActiveUnmapped(page)) {
+
 		list_del(&page->lru);
 		list_add_tail(&page->lru, &zone->inactive_list);
 		inc_page_state(pgrotated);
@@ -84,10 +88,18 @@ void fastcall activate_page(struct page 
 	struct zone *zone = page_zone(page);
 
 	spin_lock_irq(&zone->lru_lock);
-	if (PageLRU(page) && !PageActive(page)) {
+	if (PageLRU(page)
+		&& !PageActiveMapped(page) && !PageActiveUnmapped(page)) {
+
 		del_page_from_inactive_list(zone, page);
-		SetPageActive(page);
-		add_page_to_active_list(zone, page);
+
+		if (page_mapped(page)) {
+			SetPageActiveMapped(page);
+			add_page_to_active_mapped_list(zone, page);
+		} else {
+			SetPageActiveUnmapped(page);
+			add_page_to_active_unmapped_list(zone, page);
+		}
 		inc_page_state(pgactivate);
 	}
 	spin_unlock_irq(&zone->lru_lock);
@@ -102,7 +114,8 @@ void fastcall activate_page(struct page 
  */
 void fastcall mark_page_accessed(struct page *page)
 {
-	if (!PageActive(page) && PageReferenced(page) && PageLRU(page)) {
+	if (!PageActiveMapped(page) && !PageActiveUnmapped(page)
+			&& PageReferenced(page) && PageLRU(page)) {
 		activate_page(page);
 		ClearPageReferenced(page);
 	} else if (!PageReferenced(page)) {
@@ -310,9 +323,13 @@ void __pagevec_lru_add_active(struct pag
 		}
 		if (TestSetPageLRU(page))
 			BUG();
-		if (TestSetPageActive(page))
-			BUG();
-		add_page_to_active_list(zone, page);
+		if (page_mapped(page)) {
+			SetPageActiveMapped(page);
+			add_page_to_active_mapped_list(zone, page);
+		} else {
+			SetPageActiveUnmapped(page);
+			add_page_to_active_unmapped_list(zone, page);
+		}
 	}
 	if (zone)
 		spin_unlock_irq(&zone->lru_lock);
diff -puN mm/vmscan.c~rollup mm/vmscan.c
--- linux-2.6/mm/vmscan.c~rollup	2004-03-11 10:59:26.000000000 +1100
+++ linux-2.6-npiggin/mm/vmscan.c	2004-03-11 10:59:26.000000000 +1100
@@ -40,10 +40,11 @@
 #include <linux/swapops.h>
 
 /*
- * From 0 .. 100.  Higher means more swappy.
+ * From 1 .. 100.  Higher means less swappy.
  */
-int vm_swappiness = 60;
-static long total_memory;
+int vm_mapped_page_cost = 8;
+
+#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
 
 #ifdef ARCH_HAS_PREFETCH
 #define prefetch_prev_lru_page(_page, _base, _field)			\
@@ -51,8 +52,7 @@ static long total_memory;
 		if ((_page)->lru.prev != _base) {			\
 			struct page *prev;				\
 									\
-			prev = list_entry(_page->lru.prev,		\
-					struct page, lru);		\
+			prev = lru_to_page(&(_page->lru));		\
 			prefetch(&prev->_field);			\
 		}							\
 	} while (0)
@@ -66,8 +66,7 @@ static long total_memory;
 		if ((_page)->lru.prev != _base) {			\
 			struct page *prev;				\
 									\
-			prev = list_entry(_page->lru.prev,		\
-					struct page, lru);		\
+			prev = lru_to_page(&(_page->lru));			\
 			prefetchw(&prev->_field);			\
 		}							\
 	} while (0)
@@ -262,7 +261,7 @@ shrink_list(struct list_head *page_list,
 		int may_enter_fs;
 		int referenced;
 
-		page = list_entry(page_list->prev, struct page, lru);
+		page = lru_to_page(page_list);
 		list_del(&page->lru);
 
 		if (TestSetPageLocked(page))
@@ -272,7 +271,7 @@ shrink_list(struct list_head *page_list,
 		if (page_mapped(page) || PageSwapCache(page))
 			(*nr_scanned)++;
 
-		BUG_ON(PageActive(page));
+		BUG_ON(PageActiveMapped(page) || PageActiveUnmapped(page));
 
 		if (PageWriteback(page))
 			goto keep_locked;
@@ -450,7 +449,10 @@ free_it:
 		continue;
 
 activate_locked:
-		SetPageActive(page);
+		if (page_mapped(page))
+			SetPageActiveMapped(page);
+		else
+			SetPageActiveUnmapped(page);
 		pgactivate++;
 keep_locked:
 		unlock_page(page);
@@ -496,8 +498,7 @@ shrink_cache(struct zone *zone, unsigned
 
 		while (nr_scan++ < SWAP_CLUSTER_MAX &&
 				!list_empty(&zone->inactive_list)) {
-			page = list_entry(zone->inactive_list.prev,
-						struct page, lru);
+			page = lru_to_page(&zone->inactive_list);
 
 			prefetchw_prev_lru_page(page,
 						&zone->inactive_list, flags);
@@ -542,12 +543,14 @@ shrink_cache(struct zone *zone, unsigned
 		 * Put back any unfreeable pages.
 		 */
 		while (!list_empty(&page_list)) {
-			page = list_entry(page_list.prev, struct page, lru);
+			page = lru_to_page(&page_list);
 			if (TestSetPageLRU(page))
 				BUG();
 			list_del(&page->lru);
-			if (PageActive(page))
-				add_page_to_active_list(zone, page);
+			if (PageActiveMapped(page))
+				add_page_to_active_mapped_list(zone, page);
+			else if (PageActiveUnmapped(page))
+				add_page_to_active_unmapped_list(zone, page);
 			else
 				add_page_to_inactive_list(zone, page);
 			if (!pagevec_add(&pvec, page)) {
@@ -580,36 +583,32 @@ done:
  * The downside is that we have to touch page->count against each page.
  * But we had to alter page->flags anyway.
  */
-static void
-refill_inactive_zone(struct zone *zone, const int nr_pages_in,
-			struct page_state *ps)
+static void shrink_active_list(struct zone *zone, struct list_head *list,
+		unsigned long *list_count, const int nr_scan,
+		struct page_state *ps)
 {
-	int pgmoved;
+	int pgmoved, pgmoved_unmapped;
 	int pgdeactivate = 0;
-	int nr_pages = nr_pages_in;
+	int nr_pages = nr_scan;
 	LIST_HEAD(l_hold);	/* The pages which were snipped off */
 	LIST_HEAD(l_inactive);	/* Pages to go onto the inactive_list */
 	LIST_HEAD(l_active);	/* Pages to go onto the active_list */
 	struct page *page;
 	struct pagevec pvec;
-	int reclaim_mapped = 0;
-	long mapped_ratio;
-	long distress;
-	long swap_tendency;
 
 	lru_add_drain();
 	pgmoved = 0;
 	spin_lock_irq(&zone->lru_lock);
-	while (nr_pages && !list_empty(&zone->active_list)) {
-		page = list_entry(zone->active_list.prev, struct page, lru);
-		prefetchw_prev_lru_page(page, &zone->active_list, flags);
+	while (nr_pages && !list_empty(list)) {
+		page = lru_to_page(list);
+		prefetchw_prev_lru_page(page, list, flags);
 		if (!TestClearPageLRU(page))
 			BUG();
 		list_del(&page->lru);
 		if (page_count(page) == 0) {
 			/* It is currently in pagevec_release() */
 			SetPageLRU(page);
-			list_add(&page->lru, &zone->active_list);
+			list_add(&page->lru, list);
 		} else {
 			page_cache_get(page);
 			list_add(&page->lru, &l_hold);
@@ -617,62 +616,26 @@ refill_inactive_zone(struct zone *zone, 
 		}
 		nr_pages--;
 	}
-	zone->nr_active -= pgmoved;
+	*list_count -= pgmoved;
 	spin_unlock_irq(&zone->lru_lock);
 
-	/*
-	 * `distress' is a measure of how much trouble we're having reclaiming
-	 * pages.  0 -> no problems.  100 -> great trouble.
-	 */
-	distress = 100 >> zone->prev_priority;
-
-	/*
-	 * The point of this algorithm is to decide when to start reclaiming
-	 * mapped memory instead of just pagecache.  Work out how much memory
-	 * is mapped.
-	 */
-	mapped_ratio = (ps->nr_mapped * 100) / total_memory;
-
-	/*
-	 * Now decide how much we really want to unmap some pages.  The mapped
-	 * ratio is downgraded - just because there's a lot of mapped memory
-	 * doesn't necessarily mean that page reclaim isn't succeeding.
-	 *
-	 * The distress ratio is important - we don't want to start going oom.
-	 *
-	 * A 100% value of vm_swappiness overrides this algorithm altogether.
-	 */
-	swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
-
-	/*
-	 * Now use this metric to decide whether to start moving mapped memory
-	 * onto the inactive list.
-	 */
-	if (swap_tendency >= 100)
-		reclaim_mapped = 1;
-
 	while (!list_empty(&l_hold)) {
-		page = list_entry(l_hold.prev, struct page, lru);
+		page = lru_to_page(&l_hold);
 		list_del(&page->lru);
-		if (page_mapped(page)) {
-			if (!reclaim_mapped) {
-				list_add(&page->lru, &l_active);
-				continue;
-			}
-			pte_chain_lock(page);
-			if (page_referenced(page)) {
-				pte_chain_unlock(page);
-				list_add(&page->lru, &l_active);
-				continue;
-			}
+		pte_chain_lock(page);
+		if (page_referenced(page)) {
 			pte_chain_unlock(page);
+			list_add(&page->lru, &l_active);
+			continue;
 		}
+		pte_chain_unlock(page);
+
 		/*
 		 * FIXME: need to consider page_count(page) here if/when we
 		 * reap orphaned pages via the LRU (Daniel's locking stuff)
 		 */
-		if (total_swap_pages == 0 && !page->mapping &&
-						!PagePrivate(page)) {
+		if (unlikely(total_swap_pages == 0 && !page->mapping &&
+						!PagePrivate(page))) {
 			list_add(&page->lru, &l_active);
 			continue;
 		}
@@ -683,11 +646,12 @@ refill_inactive_zone(struct zone *zone, 
 	pgmoved = 0;
 	spin_lock_irq(&zone->lru_lock);
 	while (!list_empty(&l_inactive)) {
-		page = list_entry(l_inactive.prev, struct page, lru);
+		page = lru_to_page(&l_inactive);
 		prefetchw_prev_lru_page(page, &l_inactive, flags);
 		if (TestSetPageLRU(page))
 			BUG();
-		if (!TestClearPageActive(page))
+		if (!TestClearPageActiveMapped(page)
+				&& !TestClearPageActiveUnmapped(page))
 			BUG();
 		list_move(&page->lru, &zone->inactive_list);
 		pgmoved++;
@@ -711,27 +675,41 @@ refill_inactive_zone(struct zone *zone, 
 	}
 
 	pgmoved = 0;
+	pgmoved_unmapped = 0;
 	while (!list_empty(&l_active)) {
-		page = list_entry(l_active.prev, struct page, lru);
+		page = lru_to_page(&l_active);
 		prefetchw_prev_lru_page(page, &l_active, flags);
 		if (TestSetPageLRU(page))
 			BUG();
-		BUG_ON(!PageActive(page));
-		list_move(&page->lru, &zone->active_list);
-		pgmoved++;
+		if (!TestClearPageActiveMapped(page)
+				&& !TestClearPageActiveUnmapped(page))
+			BUG();
+		if (page_mapped(page)) {
+			SetPageActiveMapped(page);
+			list_move(&page->lru, &zone->active_mapped_list);
+			pgmoved++;
+		} else {
+			SetPageActiveUnmapped(page);
+			list_move(&page->lru, &zone->active_unmapped_list);
+			pgmoved_unmapped++;
+		}
+
 		if (!pagevec_add(&pvec, page)) {
-			zone->nr_active += pgmoved;
+			zone->nr_active_mapped += pgmoved;
 			pgmoved = 0;
+			zone->nr_active_unmapped += pgmoved_unmapped;
+			pgmoved_unmapped = 0;
 			spin_unlock_irq(&zone->lru_lock);
 			__pagevec_release(&pvec);
 			spin_lock_irq(&zone->lru_lock);
 		}
 	}
-	zone->nr_active += pgmoved;
+	zone->nr_active_mapped += pgmoved;
+	zone->nr_active_unmapped += pgmoved_unmapped;
 	spin_unlock_irq(&zone->lru_lock);
 	pagevec_release(&pvec);
 
-	mod_page_state_zone(zone, pgrefill, nr_pages_in - nr_pages);
+	mod_page_state_zone(zone, pgrefill, nr_scan - nr_pages);
 	mod_page_state(pgdeactivate, pgdeactivate);
 }
 
@@ -744,6 +722,8 @@ shrink_zone(struct zone *zone, int max_s
 		int *total_scanned, struct page_state *ps)
 {
 	unsigned long ratio;
+	unsigned long long mapped_ratio;
+	unsigned long nr_active;
 	int count;
 
 	/*
@@ -756,14 +736,27 @@ shrink_zone(struct zone *zone, int max_s
 	 * just to make sure that the kernel will slowly sift through the
 	 * active list.
 	 */
-	ratio = (unsigned long)SWAP_CLUSTER_MAX * zone->nr_active /
-				((zone->nr_inactive | 1) * 2);
+	nr_active = zone->nr_active_mapped + zone->nr_active_unmapped;
+	ratio = (unsigned long)SWAP_CLUSTER_MAX * nr_active /
+				(zone->nr_inactive * 2 + 1);
+	mapped_ratio = (unsigned long long)ratio * nr_active;
+	do_div(mapped_ratio, (zone->nr_active_unmapped * vm_mapped_page_cost) + 1);
+
+	ratio = ratio - mapped_ratio;
+	atomic_add(ratio+1, &zone->nr_scan_active_unmapped);
+	count = atomic_read(&zone->nr_scan_active_unmapped);
+	if (count >= SWAP_CLUSTER_MAX) {
+		atomic_set(&zone->nr_scan_active_unmapped, 0);
+		shrink_active_list(zone, &zone->active_unmapped_list,
+					&zone->nr_active_unmapped, count, ps);
+	}
 
-	atomic_add(ratio+1, &zone->nr_scan_active);
-	count = atomic_read(&zone->nr_scan_active);
+	atomic_add(mapped_ratio+1, &zone->nr_scan_active_mapped);
+	count = atomic_read(&zone->nr_scan_active_mapped);
 	if (count >= SWAP_CLUSTER_MAX) {
-		atomic_set(&zone->nr_scan_active, 0);
-		refill_inactive_zone(zone, count, ps);
+		atomic_set(&zone->nr_scan_active_mapped, 0);
+		shrink_active_list(zone, &zone->active_mapped_list,
+					&zone->nr_active_mapped, count, ps);
 	}
 
 	atomic_add(max_scan, &zone->nr_scan_inactive);
@@ -802,9 +795,6 @@ shrink_caches(struct zone **zones, int p
 		struct zone *zone = zones[i];
 		int max_scan;
 
-		if (zone->free_pages < zone->pages_high)
-			zone->temp_priority = priority;
-
 		if (zone->all_unreclaimable && priority != DEF_PRIORITY)
 			continue;	/* Let kswapd poll it */
 
@@ -838,13 +828,9 @@ int try_to_free_pages(struct zone **zone
 	int ret = 0;
 	int nr_reclaimed = 0;
 	struct reclaim_state *reclaim_state = current->reclaim_state;
-	int i;
 
 	inc_page_state(allocstall);
 
-	for (i = 0; zones[i] != 0; i++)
-		zones[i]->temp_priority = DEF_PRIORITY;
-
 	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
 		int total_scanned = 0;
 		struct page_state ps;
@@ -877,8 +863,6 @@ int try_to_free_pages(struct zone **zone
 	if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY))
 		out_of_memory();
 out:
-	for (i = 0; zones[i] != 0; i++)
-		zones[i]->prev_priority = zones[i]->temp_priority;
 	return ret;
 }
 
@@ -916,12 +900,6 @@ static int balance_pgdat(pg_data_t *pgda
 
 	inc_page_state(pageoutrun);
 
-	for (i = 0; i < pgdat->nr_zones; i++) {
-		struct zone *zone = pgdat->node_zones + i;
-
-		zone->temp_priority = DEF_PRIORITY;
-	}
-
 	for (priority = DEF_PRIORITY; priority; priority--) {
 		int all_zones_ok = 1;
 		int pages_scanned = 0;
@@ -972,7 +950,6 @@ scan:
 				if (zone->free_pages <= zone->pages_high)
 					all_zones_ok = 0;
 			}
-			zone->temp_priority = priority;
 			max_scan = zone->nr_inactive >> priority;
 			reclaimed = shrink_zone(zone, max_scan, GFP_KERNEL,
 					&total_scanned, ps);
@@ -998,11 +975,6 @@ scan:
 			blk_congestion_wait(WRITE, HZ/10);
 	}
 out:
-	for (i = 0; i < pgdat->nr_zones; i++) {
-		struct zone *zone = pgdat->node_zones + i;
-
-		zone->prev_priority = zone->temp_priority;
-	}
 	return nr_pages - to_free;
 }
 
@@ -1136,7 +1108,6 @@ static int __init kswapd_init(void)
 	for_each_pgdat(pgdat)
 		pgdat->kswapd
 		= find_task_by_pid(kernel_thread(kswapd, pgdat, CLONE_KERNEL));
-	total_memory = nr_free_pagecache_pages();
 	hotcpu_notifier(cpu_callback, 0);
 	return 0;
 }

_
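
For readers following the new balancing arithmetic in the shrink_zone() hunk
above, here is a small userspace restatement of it (not kernel code). The zone
sizes and the SWAP_CLUSTER_MAX value are illustrative assumptions; the tunable
is the renamed knob, /proc/sys/vm/mapped_page_cost (range 1..100, default 8 in
this patch), and the higher the cost charged to mapped pages, the smaller the
slice of active-list scanning that reaches the mapped list:

#include <stdio.h>

#define SWAP_CLUSTER_MAX 32UL	/* assumed value, for illustration */

static void split_scan(unsigned long nr_active_mapped,
		       unsigned long nr_active_unmapped,
		       unsigned long nr_inactive, unsigned long cost)
{
	unsigned long nr_active = nr_active_mapped + nr_active_unmapped;
	/* scan quota for the active lists, as in the shrink_zone() hunk */
	unsigned long ratio = SWAP_CLUSTER_MAX * nr_active /
				(nr_inactive * 2 + 1);
	/* slice of that quota aimed at the mapped list (the do_div() above) */
	unsigned long long mapped = (unsigned long long)ratio * nr_active /
				(nr_active_unmapped * cost + 1);

	printf("mapped_page_cost=%3lu: scan %llu mapped, %lu unmapped\n",
	       cost, mapped, ratio - (unsigned long)mapped);
}

int main(void)
{
	/* invented zone: 30k mapped-active, 20k unmapped-active, 25k inactive */
	unsigned long costs[] = { 4, 8, 16, 100 };
	unsigned int i;

	for (i = 0; i < sizeof(costs) / sizeof(costs[0]); i++)
		split_scan(30000, 20000, 25000, costs[i]);
	return 0;
}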


* Re: [PATCH] 2.6.4-rc2-mm1: vm-split-active-lists
  2004-03-11  0:04 [PATCH] 2.6.4-rc2-mm1: vm-split-active-lists Nick Piggin
@ 2004-03-11 17:25 ` Marc-Christian Petersen
  2004-03-12  9:09   ` Nick Piggin
  0 siblings, 1 reply; 27+ messages in thread
From: Marc-Christian Petersen @ 2004-03-11 17:25 UTC (permalink / raw)
  To: linux-kernel; +Cc: Nick Piggin, linux-mm, Mike Fedyk, plate

On Thursday 11 March 2004 01:04, Nick Piggin wrote:

Hi Nick,

> Here are my updated patches rolled into one.

hmm, using this in 2.6.4-rc2-mm1 my machine starts to swap very very soon. 
Machine has squid, bind, apache running, X 4.3.0, Windowmaker, so nothing 
special.

Swap grows very easily when starting to untar/gunzip a kernel tree. About an extra
150-200MB goes to swap. Everything is very smooth though, but I just wondered 
because w/o your patches swap isn't used at all, even after some days of 
uptime.

ciao, Marc



* Re: [PATCH] 2.6.4-rc2-mm1: vm-split-active-lists
  2004-03-11 17:25 ` Marc-Christian Petersen
@ 2004-03-12  9:09   ` Nick Piggin
  2004-03-12  9:27     ` Andrew Morton
  0 siblings, 1 reply; 27+ messages in thread
From: Nick Piggin @ 2004-03-12  9:09 UTC (permalink / raw)
  To: Marc-Christian Petersen; +Cc: linux-kernel, linux-mm, Mike Fedyk, plate



Marc-Christian Petersen wrote:

>On Thursday 11 March 2004 01:04, Nick Piggin wrote:
>
>Hi Nick,
>
>
>>Here are my updated patches rolled into one.
>>
>
>hmm, using this in 2.6.4-rc2-mm1 my machine starts to swap very very soon. 
>Machine has squid, bind, apache running, X 4.3.0, Windowmaker, so nothing 
>special.
>
>Swap grows very easily when starting to untar/gunzip a kernel tree. About an extra
>150-200MB goes to swap. Everything is very smooth though, but I just wondered 
>because w/o your patches swap isn't used at all, even after some days of 
>uptime.
>
>

Hmm... I guess it is still smooth because it is swapping out only
inactive pages. If the standard VM isn't being pushed very hard it
doesn't scan mapped pages at all which is why it isn't swapping.

I have a preference for allowing it to scan some mapped pages though.
I'm not sure if there is any attempt at drop-behind logic; that
might help. Adding new unmapped pagecache pages to the inactive list or
something like that might help... hmm, actually that's what it does now
by the looks of it.
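
A toy userspace model of that drop-behind idea (all names and sizes here are
invented, and this is not kernel code): pages are promoted to the active list
only on a second reference, so a one-off streaming pass cannot displace the
established active set.

#include <stdio.h>
#include <string.h>

#define ACTIVE_MAX	8
#define INACTIVE_MAX	8

static int active[ACTIVE_MAX], nr_active;
static int inactive[INACTIVE_MAX], nr_inactive;

static int find(const int *list, int n, int id)
{
	int i;

	for (i = 0; i < n; i++)
		if (list[i] == id)
			return i;
	return -1;
}

static void push(int *list, int *n, int max, int id)
{
	if (*n == max) {			/* evict the oldest entry */
		memmove(list, list + 1, (max - 1) * sizeof(int));
		(*n)--;
	}
	list[(*n)++] = id;
}

static void touch(int id)
{
	int i;

	if (find(active, nr_active, id) >= 0)
		return;				/* already active */
	i = find(inactive, nr_inactive, id);
	if (i >= 0) {				/* second reference: promote */
		memmove(inactive + i, inactive + i + 1,
			(nr_inactive - i - 1) * sizeof(int));
		nr_inactive--;
		push(active, &nr_active, ACTIVE_MAX, id);
	} else {				/* first reference: inactive only */
		push(inactive, &nr_inactive, INACTIVE_MAX, id);
	}
}

int main(void)
{
	int pass, id;

	/* build a small working set by touching pages 0-5 twice */
	for (pass = 0; pass < 2; pass++)
		for (id = 0; id < 6; id++)
			touch(id);

	/* stream through 1000 one-shot file pages; they never go active */
	for (id = 100; id < 1100; id++)
		touch(id);

	printf("working-set pages still on the active list:");
	for (id = 0; id < 6; id++)
		if (find(active, nr_active, id) >= 0)
			printf(" %d", id);
	printf("\n");
	return 0;
}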

I guess you don't have a problem though.



* Re: [PATCH] 2.6.4-rc2-mm1: vm-split-active-lists
  2004-03-12  9:09   ` Nick Piggin
@ 2004-03-12  9:27     ` Andrew Morton
  2004-03-12  9:37       ` Nick Piggin
  2004-03-12 11:08       ` Matthias Urlichs
  0 siblings, 2 replies; 27+ messages in thread
From: Andrew Morton @ 2004-03-12  9:27 UTC (permalink / raw)
  To: Nick Piggin; +Cc: m.c.p, linux-kernel, linux-mm, mfedyk, plate

Nick Piggin <piggin@cyberone.com.au> wrote:
>
> Hmm... I guess it is still smooth because it is swapping out only
>  inactive pages. If the standard VM isn't being pushed very hard it
>  doesn't scan mapped pages at all which is why it isn't swapping.
> 
>  I have a preference for allowing it to scan some mapped pages though.

I haven't looked at the code but if, as I assume, it is always scanning
mapped pages, although at a reduced rate then the effect will be the same
as setting swappiness to 100, except it will take longer.

That effect is to cause the whole world to be swapped out when people
return to their machines in the morning.  Once they're swapped back in the
first thing they do is send bitchy emails to you know who.

From a performance perspective it's the right thing to do, but nobody likes
it.
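
For reference, the heuristic being removed by the patch earlier in the thread
(the refill_inactive_zone() hunk) boils down to the following userspace
restatement; the numbers fed to it below are made up for illustration:

#include <stdio.h>

/* mirrors the removed swap_tendency logic; prev_priority runs from
 * DEF_PRIORITY (12, easy reclaim) down to 0 (desperate) */
static int reclaim_mapped(int prev_priority, long nr_mapped,
			  long total_memory, int vm_swappiness)
{
	long distress = 100 >> prev_priority;
	long mapped_ratio = nr_mapped * 100 / total_memory;
	long swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;

	return swap_tendency >= 100;	/* may we scan mapped pages? */
}

int main(void)
{
	/* idle box, 40% of memory mapped: mapped pages are left alone at the
	 * default swappiness, but always eligible at swappiness 100 */
	printf("idle, swappiness 60:  %d\n", reclaim_mapped(12, 100000, 250000, 60));
	printf("idle, swappiness 100: %d\n", reclaim_mapped(12, 100000, 250000, 100));
	/* under real reclaim pressure even swappiness 60 goes after them */
	printf("pressure, swappiness 60: %d\n", reclaim_mapped(1, 100000, 250000, 60));
	return 0;
}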



* Re: [PATCH] 2.6.4-rc2-mm1: vm-split-active-lists
  2004-03-12  9:27     ` Andrew Morton
@ 2004-03-12  9:37       ` Nick Piggin
  2004-03-12 11:08       ` Matthias Urlichs
  1 sibling, 0 replies; 27+ messages in thread
From: Nick Piggin @ 2004-03-12  9:37 UTC (permalink / raw)
  To: Andrew Morton; +Cc: m.c.p, linux-kernel, linux-mm, mfedyk, plate



Andrew Morton wrote:

>Nick Piggin <piggin@cyberone.com.au> wrote:
>
>>Hmm... I guess it is still smooth because it is swapping out only
>> inactive pages. If the standard VM isn't being pushed very hard it
>> doesn't scan mapped pages at all which is why it isn't swapping.
>>
>> I have a preference for allowing it to scan some mapped pages though.
>>
>
>I haven't looked at the code but if, as I assume, it is always scanning
>mapped pages, although at a reduced rate then the effect will be the same
>as setting swappiness to 100, except it will take longer.
>
>

Yep

>That effect is to cause the whole world to be swapped out when people
>return to their machines in the morning.  Once they're swapped back in the
>first thing they do is send bitchy emails to you know who.
>
>From a performance perspective it's the right thing to do, but nobody likes
>it.
>
>

Yeah. I wonder if there is a way to be smarter about dropping these
used once pages without putting pressure on more permanent pages...
I guess all heuristics will fall down somewhere or other.



* Re: [PATCH] 2.6.4-rc2-mm1: vm-split-active-lists
  2004-03-12  9:27     ` Andrew Morton
  2004-03-12  9:37       ` Nick Piggin
@ 2004-03-12 11:08       ` Matthias Urlichs
  2004-03-12 11:47         ` Jamie Lokier
                           ` (2 more replies)
  1 sibling, 3 replies; 27+ messages in thread
From: Matthias Urlichs @ 2004-03-12 11:08 UTC (permalink / raw)
  To: linux-kernel

Hi, Andrew Morton wrote:

> That effect is to cause the whole world to be swapped out when people
> return to their machines in the morning.

The correct solution to this problem is "suspend-to-disk" --
if the machine isn't doing anything anyway, TURN IT OFF.

One slightly more practical solution from the "you-know-who gets angry
mails" POV anyway, would be to tie the reduced-rate scanning to the load
average -- if nothing at all happens, swap-out doesn't need to happen
either.

-- 
Matthias Urlichs


* Re: [PATCH] 2.6.4-rc2-mm1: vm-split-active-lists
  2004-03-12 11:08       ` Matthias Urlichs
@ 2004-03-12 11:47         ` Jamie Lokier
  2004-03-12 12:44         ` Nick Piggin
  2004-03-12 21:46         ` Pavel Machek
  2 siblings, 0 replies; 27+ messages in thread
From: Jamie Lokier @ 2004-03-12 11:47 UTC (permalink / raw)
  To: Matthias Urlichs; +Cc: linux-kernel

Matthias Urlichs wrote:
> > That effect is to cause the whole world to be swapped out when people
> > return to their machines in the morning.
> 
> The correct solution to this problem is "suspend-to-disk" --
> if the machine isn't doing anything anyway, TURN IT OFF.

How is that better for people complaining that everything needs to be
swapped in in the morning?

Suspend-to-disk will cause everything to be paged in too.  Faster I
suspect (haven't tried it; it doesn't work on my box), but still a
wait especially when you add in the BIOS boot time.

Environmentally turning an unused machine off is good.  But I don't
see how suspend-to-disk will convince people who are annoyed by
swapping in the morning.

> One slightly more practical solution from the "you-know-who gets angry
> mails" POV anyway, would be to tie the reduced-rate scanning to the load
> average -- if nothing at all happens, swap-out doesn't need to happen
> either.

If nothing at all happens, does it matter that pages are written to
swap?  They're still in RAM as well.

-- Jamie


* Re: [PATCH] 2.6.4-rc2-mm1: vm-split-active-lists
  2004-03-12 11:08       ` Matthias Urlichs
  2004-03-12 11:47         ` Jamie Lokier
@ 2004-03-12 12:44         ` Nick Piggin
  2004-03-12 14:15           ` Nick Piggin
  2004-03-12 19:12           ` Bill Davidsen
  2004-03-12 21:46         ` Pavel Machek
  2 siblings, 2 replies; 27+ messages in thread
From: Nick Piggin @ 2004-03-12 12:44 UTC (permalink / raw)
  To: Matthias Urlichs; +Cc: linux-kernel



Matthias Urlichs wrote:

>Hi, Andrew Morton wrote:
>
>
>>That effect is to cause the whole world to be swapped out when people
>>return to their machines in the morning.
>>
>
>The correct solution to this problem is "suspend-to-disk" --
>if the machine isn't doing anything anyway, TURN IT OFF.
>
>

Without arguing that point, the VM also should have a solution
to the problem where people don't turn it off.

>One slightly more practical solution from the "you-know-who gets angry
>mails" POV anyway, would be to tie the reduced-rate scanning to the load
>average -- if nothing at all happens, swap-out doesn't need to happen
>either.
>
>

Well if nothing at all happens we don't swap out, but when something
is happening, desktop users don't want any of their programs to be
swapped out no matter how long they have been sitting idle. They don't
want to wait 10 seconds to page something in even if it means they're
waiting an extra 10 minutes throughout the day for their kernel greps
and diffs to finish.



* Re: [PATCH] 2.6.4-rc2-mm1: vm-split-active-lists
  2004-03-12 12:44         ` Nick Piggin
@ 2004-03-12 14:15           ` Nick Piggin
  2004-03-12 15:05             ` Nikita Danilov
  2004-03-12 19:12             ` Andrew Morton
  2004-03-12 19:12           ` Bill Davidsen
  1 sibling, 2 replies; 27+ messages in thread
From: Nick Piggin @ 2004-03-12 14:15 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Matthias Urlichs, linux-kernel, Andrew Morton

[-- Attachment #1: Type: text/plain, Size: 1336 bytes --]



Nick Piggin wrote:

>
> Well if nothing at all happens we don't swap out, but when something
> is happening, desktop users don't want any of their programs to be
> swapped out no matter how long they have been sitting idle. They don't
> want to wait 10 seconds to page something in even if it means they're
> waiting an extra 10 minutes throughout the day for their kernel greps
> and diffs to finish.
>
>

Just had a try of doing things like updatedb and dd if=/dev/zero of=./blah,
and it is pretty swappy I guess. The following patch, I think, makes things less
swappy. It still isn't true dropbehind because new unmapped pages still do
place some pressure on the more established pagecache, but not as much.

It is unclear whether full dropbehind is actually good or not. If you have
512MB of memory and a 256MB working set of file data (unmapped), with 400MB
of mapped memory doing nothing, after enough thrashing through your 256MB,
you'd expect some of that mapped memory to be swapped out.

By the way, I would be interested to know the rationale behind
mark_page_accessed as it is without this patch, also what is it doing in
rmap.c (I know hardly anything actually uses page_test_and_clear_young, but
still). It seems to me like it only serves to make VM behaviour harder to
understand, but I'm probably missing something. Andrew?


[-- Attachment #2: vm-dropbehind.patch --]
[-- Type: text/x-patch, Size: 1638 bytes --]

 linux-2.6-npiggin/mm/filemap.c |    3 +--
 linux-2.6-npiggin/mm/rmap.c    |    2 +-
 linux-2.6-npiggin/mm/swap.c    |    7 +------
 3 files changed, 3 insertions(+), 9 deletions(-)

diff -puN mm/filemap.c~vm-dropbehind mm/filemap.c
--- linux-2.6/mm/filemap.c~vm-dropbehind	2004-03-13 00:14:56.000000000 +1100
+++ linux-2.6-npiggin/mm/filemap.c	2004-03-13 00:55:17.000000000 +1100
@@ -662,8 +662,7 @@ page_ok:
 		/*
 		 * Mark the page accessed if we read the beginning.
 		 */
-		if (!offset)
-			mark_page_accessed(page);
+		mark_page_accessed(page);
 
 		/*
 		 * Ok, we have the page, and it's up-to-date, so
diff -puN mm/swap.c~vm-dropbehind mm/swap.c
--- linux-2.6/mm/swap.c~vm-dropbehind	2004-03-13 00:17:29.000000000 +1100
+++ linux-2.6-npiggin/mm/swap.c	2004-03-13 00:18:11.000000000 +1100
@@ -111,13 +111,8 @@ void fastcall activate_page(struct page 
  */
 void fastcall mark_page_accessed(struct page *page)
 {
-	if (!PageActiveMapped(page) && !PageActiveUnmapped(page)
-			&& PageReferenced(page) && PageLRU(page)) {
-		activate_page(page);
-		ClearPageReferenced(page);
-	} else if (!PageReferenced(page)) {
+	if (!PageReferenced(page))
 		SetPageReferenced(page);
-	}
 }
 
 EXPORT_SYMBOL(mark_page_accessed);
diff -puN mm/rmap.c~vm-dropbehind mm/rmap.c
--- linux-2.6/mm/rmap.c~vm-dropbehind	2004-03-13 01:08:00.000000000 +1100
+++ linux-2.6-npiggin/mm/rmap.c	2004-03-13 01:08:28.000000000 +1100
@@ -118,7 +118,7 @@ int fastcall page_referenced(struct page
 	int referenced = 0;
 
 	if (page_test_and_clear_young(page))
-		mark_page_accessed(page);
+		referenced++;
 
 	if (TestClearPageReferenced(page))
 		referenced++;

_


* Re: [PATCH] 2.6.4-rc2-mm1: vm-split-active-lists
  2004-03-12 14:15           ` Nick Piggin
@ 2004-03-12 15:05             ` Nikita Danilov
  2004-03-12 15:28               ` Nick Piggin
  2004-03-12 19:12             ` Andrew Morton
  1 sibling, 1 reply; 27+ messages in thread
From: Nikita Danilov @ 2004-03-12 15:05 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Matthias Urlichs, linux-kernel, Andrew Morton

Nick Piggin writes:
 > 

[...]

 > 
 > By the way, I would be interested to know the rationale behind
 > mark_page_accessed as it is without this patch, also what is it doing in
 > rmap.c (I know hardly anything actually uses page_test_and_clear_young, but
 > still). It seems to me like it only serves to make VM behaviour harder to
 > understand, but I'm probably missing something. Andrew?

With your patch, once a page got into inactive list, its PG_referenced
bit will only be checked by VM scanner when page wanders to the tail of
list. In particular, it is impossible to tell pages that were accessed
only once while on inactive list from ones that were accessed multiple
times. Original mark_page_accessed() moves page to the active list on
the second access, thus making it less eligible for the reclaim.

I actually tried quite an opposite modification:
(ftp://ftp.namesys.com/pub/misc-patches/unsupported/extra/2004.03.10-2.6.4-rc3/a_1[5678]*)

/* roughly, modulo locking, etc. */
void fastcall mark_page_accessed(struct page *page)
{
		if (!PageReferenced(page))
			SetPageReferenced(page);
		else if (!PageLRU(page))
			return;
		else if (!PageActive(page)) {
			/* page is on inactive list */
			del_page_from_inactive_list(zone, page);
			SetPageActive(page);
			add_page_to_active_list(zone, page);
			inc_page_state(pgactivate);
			ClearPageReferenced(page);
		} else {
			/* page is on active list, move it to head */
			list_move(&page->lru, &zone->active_list);
			ClearPageReferenced(page);
		}
}

That is, referenced and active page is moved to head of the active
list. While somewhat improving file system performance it badly affects
anonymous memory, because (it seems) file system pages tend to push
mapped ones out of active list. Probably it should have better effect
with your split active lists.

 > 

Nikita.


* Re: [PATCH] 2.6.4-rc2-mm1: vm-split-active-lists
  2004-03-12 15:05             ` Nikita Danilov
@ 2004-03-12 15:28               ` Nick Piggin
  2004-03-12 16:31                 ` Nikita Danilov
  0 siblings, 1 reply; 27+ messages in thread
From: Nick Piggin @ 2004-03-12 15:28 UTC (permalink / raw)
  To: Nikita Danilov; +Cc: Matthias Urlichs, linux-kernel, Andrew Morton



Nikita Danilov wrote:

>Nick Piggin writes:
> > 
>
>[...]
>
> > 
> > By the way, I would be interested to know the rationale behind
> > mark_page_accessed as it is without this patch, also what is it doing in
> > rmap.c (I know hardly anything actually uses page_test_and_clear_young, but
> > still). It seems to me like it only serves to make VM behaviour harder to
> > understand, but I'm probably missing something. Andrew?
>
>With your patch, once a page got into inactive list, its PG_referenced
>bit will only be checked by VM scanner when page wanders to the tail of
>list. In particular, it is impossible to tell pages that were accessed
>only once while on inactive list from ones that were accessed multiple
>times. Original mark_page_accessed() moves page to the active list on
>the second access, thus making it less eligible for the reclaim.
>
>

With my patch though, it gives unmapped pages the same treatment as
mapped pages. Without my patch, pages getting a lot of mark_page_accessed
activity can easily be promoted unfairly past mapped ones which are simply
getting activity through the pte.

I say just set the bit and let the scanner handle it.

>I actually tried quite an opposite modification:
>(ftp://ftp.namesys.com/pub/misc-patches/unsupported/extra/2004.03.10-2.6.4-rc3/a_1[5678]*)
>
>/* roughly, modulo locking, etc. */
>void fastcall mark_page_accessed(struct page *page)
>{
>		if (!PageReferenced(page))
>			SetPageReferenced(page);
>		else if (!PageLRU(page))
>			return;
>		else if (!PageActive(page)) {
>			/* page is on inactive list */
>			del_page_from_inactive_list(zone, page);
>			SetPageActive(page);
>			add_page_to_active_list(zone, page);
>			inc_page_state(pgactivate);
>			ClearPageReferenced(page);
>		} else {
>			/* page is on active list, move it to head */
>			list_move(&page->lru, &zone->active_list);
>			ClearPageReferenced(page);
>		}
>}
>
>That is, referenced and active page is moved to head of the active
>list. While somewhat improving file system performance it badly affects
>anonymous memory, because (it seems) file system pages tend to push
>mapped ones out of active list. Probably it should have better effect
>with your split active lists.
>

Yeah. Hmm, I think it might be a good idea to do this sorting for
unmapped pages on the active list. It shouldn't do ClearPageReferenced
though, because your !PageReferenced pages now get their referenced
bit set, and next time around the scanner they come in above the
PageReferenced page.
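
A rough sketch of that variant, untested and in the same modulo-locking style
as the snippet quoted above, using the flag and list names from the split-list
patch:

/* roughly, modulo locking, etc. -- sort only the unmapped active list,
 * and keep PG_referenced set so the page still ranks ahead of
 * once-referenced pages the next time the scanner comes around */
void fastcall mark_page_accessed(struct page *page)
{
	if (!PageReferenced(page))
		SetPageReferenced(page);
	else if (PageLRU(page) && PageActiveUnmapped(page))
		list_move(&page->lru, &page_zone(page)->active_unmapped_list);
	/* no inactive -> active promotion, and no ClearPageReferenced */
}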

I don't like the inactive->active promotion here though, as I explained.



* Re: [PATCH] 2.6.4-rc2-mm1: vm-split-active-lists
  2004-03-12 15:28               ` Nick Piggin
@ 2004-03-12 16:31                 ` Nikita Danilov
  2004-03-12 23:05                   ` Nick Piggin
  0 siblings, 1 reply; 27+ messages in thread
From: Nikita Danilov @ 2004-03-12 16:31 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Matthias Urlichs, linux-kernel, Andrew Morton

Nick Piggin writes:
 > 
 > 
 > Nikita Danilov wrote:
 > 
 > >Nick Piggin writes:
 > > > 
 > >
 > >[...]
 > >
 > > > 
 > > > By the way, I would be interested to know the rationale behind
 > > > mark_page_accessed as it is without this patch, also what is it doing in
 > > > rmap.c (I know hardly anything actually uses page_test_and_clear_young, but
 > > > still). It seems to me like it only serves to make VM behaviour harder to
 > > > understand, but I'm probably missing something. Andrew?
 > >
 > >With your patch, once a page got into inactive list, its PG_referenced
 > >bit will only be checked by VM scanner when page wanders to the tail of
 > >list. In particular, it is impossible to tell pages that were accessed
 > >only once while on inactive list from ones that were accessed multiple
 > >times. Original mark_page_accessed() moves page to the active list on
 > >the second access, thus making it less eligible for the reclaim.
 > >
 > >
 > 
 > With my patch though, it gives unmapped pages the same treatment as
 > mapped pages. Without my patch, pages getting a lot of mark_page_accessed
 > activity can easily be promoted unfairly past mapped ones which are simply
 > getting activity through the pte.

Another way to put it is that treatment of file system pages is dumbed
down to the level of mapped ones: information about access patterns is
just discarded.

 > 
 > I say just set the bit and let the scanner handle it.

I think that decisions about balancing VM and file system caches should
be made at a higher level, rather than by forcing the file system to use
low-level mechanisms designed for VM, where only limited information is
provided by hardware. Splitting page queues is a step in the right
direction, as it allows implementing more precise replacement for the
file system cache.

Nikita.


* Re: [PATCH] 2.6.4-rc2-mm1: vm-split-active-lists
  2004-03-12 12:44         ` Nick Piggin
  2004-03-12 14:15           ` Nick Piggin
@ 2004-03-12 19:12           ` Bill Davidsen
  2004-03-12 23:50             ` Nick Piggin
  1 sibling, 1 reply; 27+ messages in thread
From: Bill Davidsen @ 2004-03-12 19:12 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Matthias Urlichs, linux-kernel

Nick Piggin wrote:
> 
> 
> Matthias Urlichs wrote:
> 
>> Hi, Andrew Morton wrote:
>>
>>
>>> That effect is to cause the whole world to be swapped out when people
>>> return to their machines in the morning.
>>>
>>
>> The correct solution to this problem is "suspend-to-disk" --
>> if the machine isn't doing anything anyway, TURN IT OFF.
>>
>>
> 
> Without arguing that point, the VM also should have a solution
> to the problem where people don't turn it off.
> 
>> One slightly more practical solution from the "you-now-who gets angry
>> mails" POV anyway, would be to tie the reduced-rate scanning to the load
>> average -- if nothing at all happens, swap-out doesn't need to happen
>> either.
>>
>>
> 
> Well if nothing at all happens we don't swap out, but when something
> is happening, desktop users don't want any of their programs to be
> swapped out no matter how long they have been sitting idle. They don't
> want to wait 10 seconds to page something in even if it means they're
> waiting an extra 10 minutes throughout the day for their kernel greps
> and diffs to finish.

I have noticed that 2.6 seems to clear memory (any version I've run for 
a while) and a lunch break results in a burst of disk activity before 
the screen saver even gets in to unlock the screen. I know this box has 
no cron activity during the day, so the pages were not forced out.

It's a good thing IMHO to write dirty pages to swap so the space can be 
reclaimed if needed, but shouldn't the page be marked as clean and left 
in memory for use without swap-in nif it's needed? I see this on backup 
servers, and a machine with 3GB of free memory, no mail, no cron and no 
app running isn't getting much memory pressure ;-)

I am not saying the behaviour is wrong, I just fail to see why the last 
application run isn't still in memory an hour later, absent memory pressure.

-- 
		-bill


* Re: [PATCH] 2.6.4-rc2-mm1: vm-split-active-lists
  2004-03-12 14:15           ` Nick Piggin
  2004-03-12 15:05             ` Nikita Danilov
@ 2004-03-12 19:12             ` Andrew Morton
  2004-03-12 23:23               ` Nick Piggin
  1 sibling, 1 reply; 27+ messages in thread
From: Andrew Morton @ 2004-03-12 19:12 UTC (permalink / raw)
  To: Nick Piggin; +Cc: piggin, smurf, linux-kernel

Nick Piggin <piggin@cyberone.com.au> wrote:
>
> Just had a try of doing things like updatedb and dd if=/dev/zero of=./blah,
> and it is pretty swappy I guess.

You'll need to bring the scanning priority back into the picture: don't
move mapped pages down onto the inactive list at low scanning priorities. 
And that means retaining the remember-the-priority-from-last-time logic.

Otherwise it's inevitable that even a `cat monster_file > /dev/null' will
eventually swap out everything it can.
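
As an untested sketch of that gate (the helper name and threshold are
invented, and keeping it would mean retaining the zone->prev_priority field
the split-list patch removes):

/* priority counts down from DEF_PRIORITY (easy reclaim) towards 0
 * (desperate), so a low remembered value means reclaim has been
 * struggling; only then let shrink_zone() feed the mapped active list */
static int should_scan_mapped(struct zone *zone)
{
	return zone->prev_priority < DEF_PRIORITY - 2;
}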

> By the way, I would be interested to know the rationale behind
> mark_page_accessed as it is without this patch, also what is it doing in
> rmap.c (I know hardly anything actually uses page_test_and_clear_young, but
> still). It seems to me like it only serves to make VM behaviour harder to
> understand, but I'm probably missing something. Andrew?

hm, that's left-over code which is pretty pointless now.


	if (page_test_and_clear_young(page))
		mark_page_accessed(page);

	if (TestClearPageReferenced(page))
		referenced++;

The pages in here are never on the LRU, so all the mark_page_accessed()
will do is to set PG_Referenced.  And we immediately clear it again.  So
the mark_page_accessed() can be replaced with referenced++.




* Re: [PATCH] 2.6.4-rc2-mm1: vm-split-active-lists
  2004-03-12 11:08       ` Matthias Urlichs
  2004-03-12 11:47         ` Jamie Lokier
  2004-03-12 12:44         ` Nick Piggin
@ 2004-03-12 21:46         ` Pavel Machek
  2 siblings, 0 replies; 27+ messages in thread
From: Pavel Machek @ 2004-03-12 21:46 UTC (permalink / raw)
  To: Matthias Urlichs; +Cc: linux-kernel

Hi!
> > That effect is to cause the whole world to be swapped out when people
> > return to their machines in the morning.
> 
> The correct solution to this problem is "suspend-to-disk" --
> if the machine isn't doing anything anyway, TURN IT OFF.

Try it.

With the current design, the machine swaps *a lot* after resume.

Suspend-to-ram is probably better.

But if you don't run your updatedb overnight, you are going to
run it while you are logged in, and that is going to suck.
-- 
64 bytes from 195.113.31.123: icmp_seq=28 ttl=51 time=448769.1 ms         



* Re: [PATCH] 2.6.4-rc2-mm1: vm-split-active-lists
  2004-03-12 16:31                 ` Nikita Danilov
@ 2004-03-12 23:05                   ` Nick Piggin
  0 siblings, 0 replies; 27+ messages in thread
From: Nick Piggin @ 2004-03-12 23:05 UTC (permalink / raw)
  To: Nikita Danilov; +Cc: Matthias Urlichs, linux-kernel, Andrew Morton



Nikita Danilov wrote:

>Nick Piggin writes:
>
> > With my patch though, it gives unmapped pages the same treatment as
> > mapped pages. Without my patch, pages getting a lot of mark_page_accessed
> > activity can easily be promoted unfairly past mapped ones which are simply
> > getting activity through the pte.
>
>Another way to put it is that treatment of file system pages is dumbed
>down to the level of mapped ones: information about access patterns is
>just discarded.
>
>

In a way, yes.

> > 
> > I say just set the bit and let the scanner handle it.
>
>I think that decisions about balancing VM and file system caches should
>be made at a higher level, rather than by forcing the file system to use
>low-level mechanisms designed for VM, where only limited information is
>provided by hardware. Splitting page queues is a step in the right
>direction, as it allows implementing more precise replacement for the
>file system cache.
>
>

It makes it that much harder to calculate the pressure you are putting
on mapped vs unmapped pages though.



* Re: [PATCH] 2.6.4-rc2-mm1: vm-split-active-lists
  2004-03-12 19:12             ` Andrew Morton
@ 2004-03-12 23:23               ` Nick Piggin
  0 siblings, 0 replies; 27+ messages in thread
From: Nick Piggin @ 2004-03-12 23:23 UTC (permalink / raw)
  To: Andrew Morton; +Cc: smurf, linux-kernel



Andrew Morton wrote:

>Nick Piggin <piggin@cyberone.com.au> wrote:
>
>>Just had a try of doing things like updatedb and dd if=/dev/zero of=./blah
>>It is pretty swappy I guess.
>>
>
>You'll need to bring the scanning priority back into the picture: don't
>move mapped pages down onto the inactive list at low scanning priorities. 
>And that means retaining the remember-the-priority-from-last-time logic.
>
>Otherwise it's inevitable that even a `cat monster_file > /dev/null' will
>eventually swap out everything it can.
>
>

Hmm I dunno. At mapped_page_cost 8, I don't think it is swappy enough
that your desktop users will be running into problems. I need to write
4GB of file to push out 70MB of swap here (256MB RAM). And not much of
that swap has come back in, by the way...

>>By the way, I would be interested to know the rationale behind
>>mark_page_accessed as it is without this patch, also what is it doing in
>>rmap.c (I know hardly anything actually uses page_test_and_clear_young, but
>>still). It seems to me like it only serves to make VM behaviour harder to
>>understand, but I'm probably missing something. Andrew?
>>
>
>hm, that's left-over code which is pretty pointless now.
>
>
>	if (page_test_and_clear_young(page))
>		mark_page_accessed(page);
>
>	if (TestClearPageReferenced(page))
>		referenced++;
>
>The pages in here are never on the LRU, so all the mark_page_accessed()
>will do is to set PG_referenced.  And we immediately clear it again.  So
>the mark_page_accessed() can be replaced with referenced++.
>
>
>

Yep, see the patch I'd attached before.


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] 2.6.4-rc2-mm1: vm-split-active-lists
  2004-03-12 19:12           ` Bill Davidsen
@ 2004-03-12 23:50             ` Nick Piggin
  0 siblings, 0 replies; 27+ messages in thread
From: Nick Piggin @ 2004-03-12 23:50 UTC (permalink / raw)
  To: Bill Davidsen; +Cc: Matthias Urlichs, linux-kernel



Bill Davidsen wrote:

>
> I have noticed that 2.6 seems to clear memory (any version I've run 
> for a while) and a lunch break results in a burst of disk activity 
> before the screen saver even gets to the unlock prompt. I know this 
> box has no cron activity during the day, so the pages were not forced 
> out.
>


It shouldn't. Perhaps something else is using memory in the background?


> It's a good thing IMHO to write dirty pages to swap so the space can 
> be reclaimed if needed, but shouldn't the page be marked as clean and 
> left in memory for use without swap-in if it's needed? I see this on 
> backup servers, and a machine with 3GB of free memory, no mail, no 
> cron and no app running isn't getting much memory pressure ;-)
>

Well, it is basically just written out and reclaimed when it is needed;
it won't just be swapped out without memory pressure.

There were, though, some highmem balancing problems in 2.6, including
2.6.4 (now fixed in -bk). These cause too much pressure to be put on
ZONE_NORMAL mapped and file cache memory in favour of slab cache, which
could easily be causing the misbehaviour.

> I am not saying the behaviour is wrong; I just fail to see why the 
> last application run isn't still in memory an hour later, absent 
> memory pressure.
>

There would have to be *some* memory pressure... honestly, try 2.6-bk,
or if they are production machines and you can't, then wait for 2.6.5.



^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] 2.6.4-rc2-mm1: vm-split-active-lists
  2004-03-12 22:21       ` Jamie Lokier
@ 2004-03-12 22:36         ` Mike Fedyk
  0 siblings, 0 replies; 27+ messages in thread
From: Mike Fedyk @ 2004-03-12 22:36 UTC (permalink / raw)
  To: Jamie Lokier
  Cc: Nick Piggin, Mark_H_Johnson, Andrew Morton, linux-kernel,
	linux-mm, m.c.p, owner-linux-mm, plate, William Lee Irwin III

Jamie Lokier wrote:
> Mike Fedyk wrote:
> 
>>That would have other side benefits.  If the anon page matches (I'm not 
>>calling it "!dirty" since that might have other semantics in the current 
>>VM) what is in swap, it can be cleaned without performing any IO.  Also, 
>> suspending will have much less IO to perform before completion.
> 
> 
> Exactly those sort of benefits.

:)

> 
> Btw, when you say "You're saying all anon memory should become
> swap_cache eventually", it's worth noting that there are benefits to
> doing it the other way too: speculatively pulling in pages that are
> thought likely to be good for interactive response, at the expense of
> pages which have been used more recently and must remain in RAM for a
> short while, while they are considered in use, but aren't ranked so
> highly by some interactivity heuristics.
> 

IIUC, the current VM loses the aging information as soon as a page is 
swapped out.  You might be asking for an LFU list instead of an LRU list.
Though, a reverse LFU (MFU -- most frequently used?) used only for swap 
might do what you want also...

> I.e. fixing the "everything swapped out in the morning" problem by
> having a long term slow rebalancing in favour of pages which seem to
> be requested for interactive purposes, competing against the short
> term balance of whichever pages have been used recently or are
> predicted by short term readahead.
> 

There was talk in Andrea's objrmap thread about using two LRU lists, but 
I forget what the benefits of that were.

> Both replicating RAM pages to swap, and replicating swap or
> file-backed pages to RAM can be speculative and down slowly, over the
> long term, and when there is little other activity or I/O.

In short, that probably would require some major surgery in the VM.

Mike

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] 2.6.4-rc2-mm1: vm-split-active-lists
  2004-03-12 21:17     ` Mike Fedyk
@ 2004-03-12 22:21       ` Jamie Lokier
  2004-03-12 22:36         ` Mike Fedyk
  0 siblings, 1 reply; 27+ messages in thread
From: Jamie Lokier @ 2004-03-12 22:21 UTC (permalink / raw)
  To: Mike Fedyk
  Cc: Nick Piggin, Mark_H_Johnson, Andrew Morton, linux-kernel,
	linux-mm, m.c.p, owner-linux-mm, plate

Mike Fedyk wrote:
> That would have other side benefits.  If the anon page matches (I'm not 
> calling it "!dirty" since that might have other semantics in the current 
> VM) what is in swap, it can be cleaned without performing any IO.  Also, 
>  suspending will have much less IO to perform before completion.

Exactly those sort of benefits.

Btw, when you say "You're saying all anon memory should become
swap_cache eventually", it's worth noting that there are benefits to
doing it the other way too: speculatively pulling in pages that are
thought likely to be good for interactive response, at the expense of
pages which have been used more recently and must remain in RAM for a
short while, while they are considered in use, but aren't ranked so
highly by some interactivity heuristics.

I.e. fixing the "everything swapped out in the morning" problem by
having a long term slow rebalancing in favour of pages which seem to
be requested for interactive purposes, competing against the short
term balance of whichever pages have been used recently or are
predicted by short term readahead.

Both replicating RAM pages to swap, and replicating swap or
file-backed pages to RAM can be speculative and done slowly, over the
long term, and when there is little other activity or I/O.

-- Jamie

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] 2.6.4-rc2-mm1: vm-split-active-lists
  2004-03-12 19:35   ` Jamie Lokier
@ 2004-03-12 21:17     ` Mike Fedyk
  2004-03-12 22:21       ` Jamie Lokier
  0 siblings, 1 reply; 27+ messages in thread
From: Mike Fedyk @ 2004-03-12 21:17 UTC (permalink / raw)
  To: Jamie Lokier
  Cc: Nick Piggin, Mark_H_Johnson, Andrew Morton, linux-kernel,
	linux-mm, m.c.p, owner-linux-mm, plate

Jamie Lokier wrote:
> Nick Piggin wrote:
> 
>>In Linux, all reclaim is driven by a memory shortage. Often it
>>is just because more memory is being requested for more file
>>cache.
> 
> 
> Is reclaim the same as swapping, though?  I'd expect pages to be
> written to the swapfile speculatively, before they are needed for
> reclaim.  Is that one of those behaviours which everyone agrees is
> sensible, but it's yet to be implemented in the 2.6 VM?
> 

Nobody has mentioned the swap cache yet.  If a page is in RAM and in
swap, and is not dirty, it's counted in the swap cache.
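
In kernel terms, the cheap case being described looks roughly like the
following check (a sketch only, not a quote of the actual 2.6 reclaim
path):

	#include <linux/mm.h>
	#include <linux/page-flags.h>

	/*
	 * An up-to-date copy of the page already lives in swap and the
	 * in-RAM copy is clean, so reclaiming it requires no writeback
	 * I/O at all.
	 */
	static int can_reclaim_without_io(struct page *page)
	{
		return PageSwapCache(page) && !PageDirty(page) &&
			!PageWriteback(page);
	}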

> 
>>But presumably if you are running into memory pressure, you really
>>will need to free those free list pages, requiring the page to be
>>read from disk when it is used again.
> 
> 
> The idea is that you write pages to swap _before_ the memory pressure
> arrives, which makes those pages available immediately when memory
> pressure does arrive, provided they are still clean.  It's speculative.
> 
> I thought Linux did this already, but I don't know the current VM well.
> 

You're saying all anon memory should become swap_cache eventually 
(though, it should be a background "task" so it doesn't block userspace 
memory requests).

That would have other side benefits.  If the anon page matches (I'm not 
calling it "!dirty" since that might have other semantics in the current 
VM) what is in swap, it can be cleaned without performing any IO.  Also, 
  suspending will have much less IO to perform before completion.

Though there would have to be a swap-recycling algorithm if swap size < RAM.

Mike

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] 2.6.4-rc2-mm1: vm-split-active-lists
  2004-03-12 14:27 ` Nick Piggin
@ 2004-03-12 19:46   ` Jamie Lokier
  0 siblings, 0 replies; 27+ messages in thread
From: Jamie Lokier @ 2004-03-12 19:46 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Mark_H_Johnson, Andrew Morton, linux-kernel, linux-mm, mfedyk,
	m.c.p, owner-linux-mm, plate

Nick Piggin wrote:
> One thing you could do is re-read swapped pages when you have
> plenty of free memory and the disks are idle.

Better: re-read swapped pages _and_ file-backed pages that are likely
to be used in future, when you have plenty of free memory and the
disks are idle.

updatedb would push plenty of memory out overnight.  But after the
cron jobs and before people wake up in the morning, the kernel would
gradually re-read the pages corresponding to mapped regions in
processes.  Possibly with emphasis on some processes more than others.
Possibly remembering some of that likelihood information even when a
particular executable isn't currently running.
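
A userspace approximation of this (not the in-kernel mechanism being
described, just an illustration of the effect, with all names made up
for the sketch) could walk /proc/<pid>/maps while the box is idle and
ask the kernel to pull the file-backed mappings back into the page
cache:

	#include <sys/types.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	/* Pre-warm the page cache for every file mapped by a process. */
	static void prewarm_mapped_files(pid_t pid)
	{
		char path[64], line[4096];
		FILE *maps;

		snprintf(path, sizeof(path), "/proc/%d/maps", (int)pid);
		maps = fopen(path, "r");
		if (!maps)
			return;

		while (fgets(line, sizeof(line), maps)) {
			/* the backing file name, if any, starts at the first '/' */
			char *file = strchr(line, '/');
			int fd;

			if (!file)
				continue;
			file[strcspn(file, "\n")] = '\0';

			fd = open(file, O_RDONLY);
			if (fd < 0)
				continue;
			/* ask for asynchronous readahead of the whole file */
			posix_fadvise(fd, 0, 0, POSIX_FADV_WILLNEED);
			close(fd);
		}
		fclose(maps);
	}

This only covers file-backed pages (libraries, executables, mapped data
files); anon pages that went to swap would still need a kernel-side
re-read of the sort discussed elsewhere in this thread.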

During the day, after a big compile the kernel would gradually re-read
pages for processes which are running on your desktop but which you're
not actively using.  The editor you were using during the compile will
still be responsive because it wasn't swapped out.  The Nautilus or
Mozilla that you weren't using will appear responsive when you switch
to it, because the kernel was re-reading their mapped pages after the
compile, while you didn't notice because you were still using the
editor.

The intention is to avoid those long stalls where you switch to a
Mozilla window and it takes 30 seconds to page in all those libraries
randomly.  It's not necessary to keep Mozilla in memory all the time,
even when that memory would be more useful to a compile, in order to
provide that illusion of snappy response most of the time.

-- Jamie

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] 2.6.4-rc2-mm1: vm-split-active-lists
  2004-03-12 15:13 ` Nick Piggin
@ 2004-03-12 19:35   ` Jamie Lokier
  2004-03-12 21:17     ` Mike Fedyk
  0 siblings, 1 reply; 27+ messages in thread
From: Jamie Lokier @ 2004-03-12 19:35 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Mark_H_Johnson, Andrew Morton, linux-kernel, linux-mm, mfedyk,
	m.c.p, owner-linux-mm, plate

Nick Piggin wrote:
> In Linux, all reclaim is driven by a memory shortage. Often it
> is just because more memory is being requested for more file
> cache.

Is reclaim the same as swapping, though?  I'd expect pages to be
written to the swapfile speculatively, before they are needed for
reclaim.  Is that one of those behaviours which everyone agrees is
sensible, but it's yet to be implemented in the 2.6 VM?

> But presumably if you are running into memory pressure, you really
> will need to free those free list pages, requiring the page to be
> read from disk when it is used again.

The idea is that you write pages to swap _before_ the memory pressure
arrives, which makes those pages available immediately when memory
pressure does arrive, provided they are still clean.  It's speculative.

I thought Linux did this already, but I don't know the current VM well.

-- Jamie

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] 2.6.4-rc2-mm1: vm-split-active-lists
  2004-03-12 15:00 Mark_H_Johnson
@ 2004-03-12 15:13 ` Nick Piggin
  2004-03-12 19:35   ` Jamie Lokier
  0 siblings, 1 reply; 27+ messages in thread
From: Nick Piggin @ 2004-03-12 15:13 UTC (permalink / raw)
  To: Mark_H_Johnson
  Cc: Andrew Morton, linux-kernel, linux-mm, mfedyk, m.c.p,
	owner-linux-mm, plate



Mark_H_Johnson@raytheon.com wrote:

>
>
>
>Nick Piggin <piggin@cyberone.com.au> wrote:
>
>>Not too sure what you mean. If we've swapped out the pages, it is
>>because we need the memory for something else. So no.
>>
>
>Actually - no, from what Andrew said, the system was not under memory
>pressure and did not need the memory for something else. The swapping
>occurred "just because". In that case, it would be better to keep track
>of where the pages came from (i.e., swap them in from the free list).
>
>

In Linux, all reclaim is driven by a memory shortage. Often it
is just because more memory is being requested for more file
cache.

My patch does make it a bit more probable that process memory will
be swapped out before file cache is discarded.

>Don't get me wrong - that behavior may be the "right thing" from an
>overall performance standpoint. A little extra disk I/O when the system
>is relatively idle may provide needed reserve (free pages) for when the
>system gets busy again.
>
>
>>One thing you could do is re-read swapped pages when you have
>>plenty of free memory and the disks are idle.
>>
>That may also be a good idea. However, if you keep a mapping between
>pages on the "free list" and those in the swap file / partition, you
>do not actually have to do the disk I/O to accomplish that.
>
>

But presumably if you are running into memory pressure, you really
will need to free those free list pages, requiring the page to be
read from disk when it is used again.


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] 2.6.4-rc2-mm1: vm-split-active-lists
@ 2004-03-12 15:00 Mark_H_Johnson
  2004-03-12 15:13 ` Nick Piggin
  0 siblings, 1 reply; 27+ messages in thread
From: Mark_H_Johnson @ 2004-03-12 15:00 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Andrew Morton, linux-kernel, linux-mm, mfedyk, m.c.p,
	owner-linux-mm, plate





Nick Piggin <piggin@cyberone.com.au> wrote:
>Mark_H_Johnson@Raytheon.com wrote:
>>Nick Piggin <piggin@cyberone.com.au> wrote:
>>
>>>Andrew Morton wrote:
>>>
>>
>>>>That effect is to cause the whole world to be swapped out when people
>>>>return to their machines in the morning.  Once they're swapped back in
>>>>
[this is the symptom being reported]
>>Just a question, but I remember from VMS a long time ago that
>>as part of the working set limits, the "free list" was used to keep
>>pages that could be freely used but could be put back into the working
>>set quite easily (a "fast" page fault). Could you keep track of the
>>swapped pages in a similar manner so you don't have to go to disk to
>>get these pages [or is this already being done]? You would pull them
>>back from the free list and avoid the disk I/O in the morning.
>
>Not too sure what you mean. If we've swapped out the pages, it is
>because we need the memory for something else. So no.

Actually - no, from what Andrew said, the system was not under memory
pressure and did not need the memory for something else. The swapping
occurred "just because". In that case, it would be better to keep track
of where the pages came from (i.e., swap them in from the free list).

Don't get me wrong - that behavior may be the "right thing" from an
overall performance standpoint. A little extra disk I/O when the system
is relatively idle may provide needed reserve (free pages) for when the
system gets busy again.

>One thing you could do is re-read swapped pages when you have
>plenty of free memory and the disks are idle.
That may also be a good idea. However, if you keep a mapping between
pages on the "free list" and those in the swap file / partition, you
do not actually have to do the disk I/O to accomplish that.

--Mark H Johnson
  <mailto:Mark_H_Johnson@raytheon.com>


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] 2.6.4-rc2-mm1: vm-split-active-lists
  2004-03-12 14:18 Mark_H_Johnson
@ 2004-03-12 14:27 ` Nick Piggin
  2004-03-12 19:46   ` Jamie Lokier
  0 siblings, 1 reply; 27+ messages in thread
From: Nick Piggin @ 2004-03-12 14:27 UTC (permalink / raw)
  To: Mark_H_Johnson
  Cc: Andrew Morton, linux-kernel, linux-mm, mfedyk, m.c.p,
	owner-linux-mm, plate



Mark_H_Johnson@Raytheon.com wrote:

>
>
>
>Nick Piggin <piggin@cyberone.com.au> wrote:
>
>>Andrew Morton wrote:
>>
>
>>>That effect is to cause the whole world to be swapped out when people
>>>return to their machines in the morning.  Once they're swapped back in the
>>>first thing they do is send bitchy emails to you know who.
>>>
>>>>From a performance perspective it's the right thing to do, but nobody likes
>>>it.
>>>
>>>
>>Yeah. I wonder if there is a way to be smarter about dropping these
>>used once pages without putting pressure on more permanent pages...
>>I guess all heuristics will fall down somewhere or other.
>>
>
>Just a question, but I remember from VMS a long time ago that
>as part of the working set limits, the "free list" was used to keep
>pages that could be freely used but could be put back into the working
>set quite easily (a "fast" page fault). Could you keep track of the
>swapped pages in a similar manner so you don't have to go to disk to
>get these pages [or is this already being done]? You would pull them
>back from the free list and avoid the disk I/O in the morning.
>
>

Not too sure what you mean. If we've swapped out the pages, it is
because we need the memory for something else. So no.

One thing you could do is re-read swapped pages when you have
plenty of free memory and the disks are idle.

>By the way - with 2.4.24 I see a similar behavior anyway [slow to get
>going in the morning]. I believe it is due to our nightly backup walking
>through the disks. If you could FIX the retention of sequentially read
>disk blocks from the various caches - that would help a lot more in
>my mind.
>
>

updatedb really wants to be able to provide better hints to the VM
that it is never going to use these pages again. I hate to cater for
the worst possible case that only happens because everyone has it as
a 2am cron job.
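
For file data there is already a hint of roughly this shape in the form
of posix_fadvise(); here is a sketch (illustrative only, not a claim
about how well 2.6.4 honours the advice) of how an updatedb-like scanner
could drop its read-once cache behind itself:

	#include <fcntl.h>
	#include <unistd.h>

	static void scan_file(const char *path)
	{
		char buf[8192];
		int fd = open(path, O_RDONLY);

		if (fd < 0)
			return;
		while (read(fd, buf, sizeof(buf)) > 0)
			;	/* index the contents here */

		/* tell the kernel these cached pages won't be needed again */
		posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);
		close(fd);
	}

Much of updatedb's footprint is presumably dentry/inode metadata rather
than file data, so a hint like this only covers part of the problem.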


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] 2.6.4-rc2-mm1: vm-split-active-lists
@ 2004-03-12 14:18 Mark_H_Johnson
  2004-03-12 14:27 ` Nick Piggin
  0 siblings, 1 reply; 27+ messages in thread
From: Mark_H_Johnson @ 2004-03-12 14:18 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Andrew Morton, linux-kernel, linux-mm, mfedyk, m.c.p,
	owner-linux-mm, plate





Nick Piggin <piggin@cyberone.com.au> wrote:
>Andrew Morton wrote:

>>That effect is to cause the whole world to be swapped out when people
>>return to their machines in the morning.  Once they're swapped back in the
>>first thing they do is send bitchy emails to you know who.
>>
>>>From a performance perspective it's the right thing to do, but nobody likes
>>it.
>>
>>
>
>Yeah. I wonder if there is a way to be smarter about dropping these
>used once pages without putting pressure on more permanent pages...
>I guess all heuristics will fall down somewhere or other.

Just a question, but I remember from VMS a long time ago that
as part of the working set limits, the "free list" was used to keep
pages that could be freely used but could be put back into the working
set quite easily (a "fast" page fault). Could you keep track of the
swapped pages in a similar manner so you don't have to go to disk to
get these pages [or is this already being done]? You would pull them
back from the free list and avoid the disk I/O in the morning.

By the way - with 2.4.24 I see a similar behavior anyway [slow to get
going in the morning]. I believe it is due to our nightly backup walking
through the disks. If you could FIX the retention of sequentially read
disk blocks from the various caches - that would help a lot more in
my mind.

--Mark H Johnson
  <mailto:Mark_H_Johnson@raytheon.com>


^ permalink raw reply	[flat|nested] 27+ messages in thread

end of thread, other threads:[~2004-03-13 12:35 UTC | newest]

Thread overview: 27+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2004-03-11  0:04 [PATCH] 2.6.4-rc2-mm1: vm-split-active-lists Nick Piggin
2004-03-11 17:25 ` Marc-Christian Petersen
2004-03-12  9:09   ` Nick Piggin
2004-03-12  9:27     ` Andrew Morton
2004-03-12  9:37       ` Nick Piggin
2004-03-12 11:08       ` Matthias Urlichs
2004-03-12 11:47         ` Jamie Lokier
2004-03-12 12:44         ` Nick Piggin
2004-03-12 14:15           ` Nick Piggin
2004-03-12 15:05             ` Nikita Danilov
2004-03-12 15:28               ` Nick Piggin
2004-03-12 16:31                 ` Nikita Danilov
2004-03-12 23:05                   ` Nick Piggin
2004-03-12 19:12             ` Andrew Morton
2004-03-12 23:23               ` Nick Piggin
2004-03-12 19:12           ` Bill Davidsen
2004-03-12 23:50             ` Nick Piggin
2004-03-12 21:46         ` Pavel Machek
2004-03-12 14:18 Mark_H_Johnson
2004-03-12 14:27 ` Nick Piggin
2004-03-12 19:46   ` Jamie Lokier
2004-03-12 15:00 Mark_H_Johnson
2004-03-12 15:13 ` Nick Piggin
2004-03-12 19:35   ` Jamie Lokier
2004-03-12 21:17     ` Mike Fedyk
2004-03-12 22:21       ` Jamie Lokier
2004-03-12 22:36         ` Mike Fedyk

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).