* [PATCH 1/2] Refactor zone_reclaim code (v4)
@ 2011-01-25  5:10 ` Balbir Singh
  0 siblings, 0 replies; 47+ messages in thread
From: Balbir Singh @ 2011-01-25  5:10 UTC (permalink / raw)
  To: linux-mm, akpm
  Cc: npiggin, kvm, linux-kernel, kosaki.motohiro, cl, kamezawa.hiroyu

Changelog v3
1. Renamed zone_reclaim_unmapped_pages to zone_reclaim_pages

Refactor zone_reclaim: move the reusable reclaim loop out of
__zone_reclaim() into a helper, zone_reclaim_pages(), so it can be
shared with other reclaim paths.
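
As an illustration only (not part of this patch), a later caller in
mm/vmscan.c could reuse the helper along these lines; the function name
below is made up:

/*
 * Illustration only: a hypothetical second user of the helper. It sets
 * up its own scan_control and lets zone_reclaim_pages() walk the
 * priorities via shrink_zone().
 */
static unsigned long example_reclaim_zone_cache(struct zone *zone,
						unsigned long nr_pages)
{
	struct scan_control sc = {
		.gfp_mask	= GFP_KERNEL,
		.may_writepage	= 0,	/* no writeback from this path */
		.may_unmap	= 0,	/* leave mapped pages alone */
		.may_swap	= 0,
		.swappiness	= 0,
		.order		= 0,
	};

	zone_reclaim_pages(zone, &sc, nr_pages);
	return sc.nr_reclaimed;
}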

Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Reviewed-by: Christoph Lameter <cl@linux.com>
---
 mm/vmscan.c |   35 +++++++++++++++++++++++------------
 1 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5899f2f..02cc82e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2943,6 +2943,27 @@ static long zone_pagecache_reclaimable(struct zone *zone)
 }
 
 /*
+ * Helper function to reclaim unmapped pages, we might add something
+ * similar to this for slab cache as well. Currently this function
+ * is shared with __zone_reclaim()
+ */
+static inline void
+zone_reclaim_pages(struct zone *zone, struct scan_control *sc,
+				unsigned long nr_pages)
+{
+	int priority;
+	/*
+	 * Free memory by calling shrink zone with increasing
+	 * priorities until we have enough memory freed.
+	 */
+	priority = ZONE_RECLAIM_PRIORITY;
+	do {
+		shrink_zone(priority, zone, sc);
+		priority--;
+	} while (priority >= 0 && sc->nr_reclaimed < nr_pages);
+}
+
+/*
  * Try to free up some pages from this zone through reclaim.
  */
 static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
@@ -2951,7 +2972,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	const unsigned long nr_pages = 1 << order;
 	struct task_struct *p = current;
 	struct reclaim_state reclaim_state;
-	int priority;
 	struct scan_control sc = {
 		.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
 		.may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
@@ -2975,17 +2995,8 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	reclaim_state.reclaimed_slab = 0;
 	p->reclaim_state = &reclaim_state;
 
-	if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) {
-		/*
-		 * Free memory by calling shrink zone with increasing
-		 * priorities until we have enough memory freed.
-		 */
-		priority = ZONE_RECLAIM_PRIORITY;
-		do {
-			shrink_zone(priority, zone, &sc);
-			priority--;
-		} while (priority >= 0 && sc.nr_reclaimed < nr_pages);
-	}
+	if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages)
+		zone_reclaim_pages(zone, &sc, nr_pages);
 
 	nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
 	if (nr_slab_pages0 > zone->min_slab_pages) {


^ permalink raw reply related	[flat|nested] 47+ messages in thread


* [PATCH 3/3] Provide control over unmapped pages (v4)
  2011-01-25  5:10 ` Balbir Singh
@ 2011-01-25  5:10   ` Balbir Singh
  -1 siblings, 0 replies; 47+ messages in thread
From: Balbir Singh @ 2011-01-25  5:10 UTC (permalink / raw)
  To: linux-mm, akpm
  Cc: npiggin, kvm, linux-kernel, kosaki.motohiro, cl, kamezawa.hiroyu

Changelog v4
1. Add max_unmapped_ratio and use that as the upper limit
to check when to shrink the unmapped page cache (Christoph
Lameter)

Changelog v2
1. Use a config option to enable the code (Andrew Morton)
2. Explain the magic tunables in the code, or at least attempt
   to explain them (General comment)
3. Hint uses of the boot parameter with unlikely (Andrew Morton)
4. Use better names (balanced is not a good naming convention)

Provide control over unmapped page cache using zone_reclaim()
functionality and a boot parameter. The code reuses functionality from
zone_reclaim() to isolate unmapped pages and reclaim them as a
priority, ahead of mapped pages. A new sysctl, max_unmapped_ratio, is
provided and defaults to 16: once unmapped pages exceed 16% of the
total zone pages, we start shrinking the unmapped page cache.
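
To make the defaults concrete, here is a tiny userspace sketch
(illustration only, not kernel code) of the per-zone arithmetic; the
numbers assume a zone with 1,000,000 present pages:

#include <stdio.h>

int main(void)
{
	unsigned long present_pages = 1000000;
	unsigned long min_ratio = 1;	/* vm.min_unmapped_ratio default */
	unsigned long max_ratio = 16;	/* vm.max_unmapped_ratio default */

	/* Same derivation as free_area_init_core() below */
	unsigned long min_unmapped = present_pages * min_ratio / 100;
	unsigned long max_unmapped = present_pages * max_ratio / 100;
	printf("min_unmapped_pages = %lu\n", min_unmapped);	/* 10000  */
	printf("max_unmapped_pages = %lu\n", max_unmapped);	/* 160000 */

	/*
	 * should_reclaim_unmapped_pages() wakes kswapd once unmapped file
	 * pages exceed max_unmapped_pages; reclaim_unmapped_pages() then
	 * targets one eighth of the excess over min_unmapped_pages.
	 */
	unsigned long unmapped = 200000;	/* example current value */
	printf("reclaim target     = %lu\n",
	       (unmapped - min_unmapped) >> 3);		/* 23750 */
	return 0;
}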

Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
---
 Documentation/kernel-parameters.txt |    8 +++
 include/linux/mmzone.h              |    5 ++
 include/linux/swap.h                |   23 ++++++++-
 init/Kconfig                        |   12 +++++
 kernel/sysctl.c                     |   11 ++++
 mm/page_alloc.c                     |   25 ++++++++++
 mm/vmscan.c                         |   87 +++++++++++++++++++++++++++++++++++
 7 files changed, 166 insertions(+), 5 deletions(-)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index fee5f57..65a4ee6 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2500,6 +2500,14 @@ and is between 256 and 4096 characters. It is defined in the file
 			[X86]
 			Set unknown_nmi_panic=1 early on boot.
 
+	unmapped_page_control
+			[KNL] Available if CONFIG_UNMAPPED_PAGECACHE_CONTROL
+			is enabled. It controls the amount of unmapped memory
+			that is present in the system. This boot option plus
+			vm.min_unmapped_ratio (sysctl) provide granular control
+			over how much unmapped page cache can exist in the system
+			before kswapd starts reclaiming unmapped page cache pages.
+
 	usbcore.autosuspend=
 			[USB] The autosuspend time delay (in seconds) used
 			for newly-detected USB devices (default 2).  This
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 2485acc..18f0f09 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -306,7 +306,10 @@ struct zone {
 	/*
 	 * zone reclaim becomes active if more unmapped pages exist.
 	 */
+#if defined(CONFIG_UNMAPPED_PAGE_CONTROL) || defined(CONFIG_NUMA)
 	unsigned long		min_unmapped_pages;
+	unsigned long		max_unmapped_pages;
+#endif
 #ifdef CONFIG_NUMA
 	int node;
 	unsigned long		min_slab_pages;
@@ -773,6 +776,8 @@ int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int,
 					void __user *, size_t *, loff_t *);
 int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int,
 			void __user *, size_t *, loff_t *);
+int sysctl_max_unmapped_ratio_sysctl_handler(struct ctl_table *, int,
+			void __user *, size_t *, loff_t *);
 int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int,
 			void __user *, size_t *, loff_t *);
 
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 7b75626..ae62a03 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -255,19 +255,34 @@ extern int vm_swappiness;
 extern int remove_mapping(struct address_space *mapping, struct page *page);
 extern long vm_total_pages;
 
+#if defined(CONFIG_UNMAPPED_PAGECACHE_CONTROL) || defined(CONFIG_NUMA)
 extern int sysctl_min_unmapped_ratio;
+extern int sysctl_max_unmapped_ratio;
+
 extern int zone_reclaim(struct zone *, gfp_t, unsigned int);
-#ifdef CONFIG_NUMA
-extern int zone_reclaim_mode;
-extern int sysctl_min_slab_ratio;
 #else
-#define zone_reclaim_mode 0
 static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order)
 {
 	return 0;
 }
 #endif
 
+#if defined(CONFIG_UNMAPPED_PAGECACHE_CONTROL)
+extern bool should_reclaim_unmapped_pages(struct zone *zone);
+#else
+static inline bool should_reclaim_unmapped_pages(struct zone *zone)
+{
+	return false;
+}
+#endif
+
+#ifdef CONFIG_NUMA
+extern int zone_reclaim_mode;
+extern int sysctl_min_slab_ratio;
+#else
+#define zone_reclaim_mode 0
+#endif
+
 extern int page_evictable(struct page *page, struct vm_area_struct *vma);
 extern void scan_mapping_unevictable_pages(struct address_space *);
 
diff --git a/init/Kconfig b/init/Kconfig
index 4f6cdbf..2dfbc09 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -828,6 +828,18 @@ config SCHED_AUTOGROUP
 config MM_OWNER
 	bool
 
+config UNMAPPED_PAGECACHE_CONTROL
+	bool "Provide control over unmapped page cache"
+	default n
+	help
+	  This option adds support for controlling unmapped page cache
+	  via a boot parameter (unmapped_page_control). The boot parameter
+	  with sysctl (vm.min_unmapped_ratio) control the total number
+	  of unmapped pages in the system. This feature is useful if
+	  you want to limit the amount of unmapped page cache or want
+	  to reduce page cache duplication in a virtualized environment.
+	  If unsure say 'N'
+
 config SYSFS_DEPRECATED
 	bool "enable deprecated sysfs features to support old userspace tools"
 	depends on SYSFS
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 12e8f26..63dbba6 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1224,6 +1224,7 @@ static struct ctl_table vm_table[] = {
 		.extra1		= &zero,
 	},
 #endif
+#if defined(CONFIG_UNMAPPED_PAGE_CONTROL) || defined(CONFIG_NUMA)
 	{
 		.procname	= "min_unmapped_ratio",
 		.data		= &sysctl_min_unmapped_ratio,
@@ -1233,6 +1234,16 @@ static struct ctl_table vm_table[] = {
 		.extra1		= &zero,
 		.extra2		= &one_hundred,
 	},
+	{
+		.procname	= "max_unmapped_ratio",
+		.data		= &sysctl_max_unmapped_ratio,
+		.maxlen		= sizeof(sysctl_max_unmapped_ratio),
+		.mode		= 0644,
+		.proc_handler	= sysctl_max_unmapped_ratio_sysctl_handler,
+		.extra1		= &zero,
+		.extra2		= &one_hundred,
+	},
+#endif
 #ifdef CONFIG_NUMA
 	{
 		.procname	= "zone_reclaim_mode",
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7b56473..2ac8549 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1660,6 +1660,9 @@ zonelist_scan:
 			unsigned long mark;
 			int ret;
 
+			if (should_reclaim_unmapped_pages(zone))
+				wakeup_kswapd(zone, order, classzone_idx);
+
 			mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
 			if (zone_watermark_ok(zone, order, mark,
 				    classzone_idx, alloc_flags))
@@ -4167,8 +4170,12 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 
 		zone->spanned_pages = size;
 		zone->present_pages = realsize;
+#if defined(CONFIG_UNMAPPED_PAGE_CONTROL) || defined(CONFIG_NUMA)
 		zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
 						/ 100;
+		zone->max_unmapped_pages = (realsize*sysctl_max_unmapped_ratio)
+						/ 100;
+#endif
 #ifdef CONFIG_NUMA
 		zone->node = nid;
 		zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
@@ -5084,6 +5091,7 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
 	return 0;
 }
 
+#if defined(CONFIG_UNMAPPED_PAGE_CONTROL) || defined(CONFIG_NUMA)
 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
 	void __user *buffer, size_t *length, loff_t *ppos)
 {
@@ -5100,6 +5108,23 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
 	return 0;
 }
 
+int sysctl_max_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
+	void __user *buffer, size_t *length, loff_t *ppos)
+{
+	struct zone *zone;
+	int rc;
+
+	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
+	if (rc)
+		return rc;
+
+	for_each_zone(zone)
+		zone->max_unmapped_pages = (zone->present_pages *
+				sysctl_max_unmapped_ratio) / 100;
+	return 0;
+}
+#endif
+
 #ifdef CONFIG_NUMA
 int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
 	void __user *buffer, size_t *length, loff_t *ppos)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 02cc82e..6377411 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -159,6 +159,29 @@ static DECLARE_RWSEM(shrinker_rwsem);
 #define scanning_global_lru(sc)	(1)
 #endif
 
+#if defined(CONFIG_UNMAPPED_PAGECACHE_CONTROL)
+static unsigned long reclaim_unmapped_pages(int priority, struct zone *zone,
+						struct scan_control *sc);
+static int unmapped_page_control __read_mostly;
+
+static int __init unmapped_page_control_parm(char *str)
+{
+	unmapped_page_control = 1;
+	/*
+	 * XXX: Should we tweak swappiness here?
+	 */
+	return 1;
+}
+__setup("unmapped_page_control", unmapped_page_control_parm);
+
+#else /* !CONFIG_UNMAPPED_PAGECACHE_CONTROL */
+static inline unsigned long reclaim_unmapped_pages(int priority,
+				struct zone *zone, struct scan_control *sc)
+{
+	return 0;
+}
+#endif
+
 static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone,
 						  struct scan_control *sc)
 {
@@ -2359,6 +2382,12 @@ loop_again:
 				shrink_active_list(SWAP_CLUSTER_MAX, zone,
 							&sc, priority, 0);
 
+			/*
+			 * We do unmapped page reclaim once here and once
+			 * below, so that we don't lose out
+			 */
+			reclaim_unmapped_pages(priority, zone, &sc);
+
 			if (!zone_watermark_ok_safe(zone, order,
 					high_wmark_pages(zone), 0, 0)) {
 				end_zone = i;
@@ -2396,6 +2425,11 @@ loop_again:
 				continue;
 
 			sc.nr_scanned = 0;
+			/*
+			 * Reclaim unmapped pages upfront, this should be
+			 * really cheap
+			 */
+			reclaim_unmapped_pages(priority, zone, &sc);
 
 			/*
 			 * Call soft limit reclaim before calling shrink_zone.
@@ -2715,7 +2749,8 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
 	}
 	if (!waitqueue_active(&pgdat->kswapd_wait))
 		return;
-	if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0))
+	if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0) &&
+		!should_reclaim_unmapped_pages(zone))
 		return;
 
 	trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
@@ -2868,6 +2903,7 @@ static int __init kswapd_init(void)
 
 module_init(kswapd_init)
 
+#if defined(CONFIG_UNMAPPED_PAGECACHE_CONTROL) || defined(CONFIG_NUMA)
 /*
  * Zone reclaim mode
  *
@@ -2893,6 +2929,7 @@ int zone_reclaim_mode __read_mostly;
  * occur.
  */
 int sysctl_min_unmapped_ratio = 1;
+int sysctl_max_unmapped_ratio = 16;
 
 /*
  * If the number of slab pages in a zone grows beyond this percentage then
@@ -3088,6 +3125,54 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 
 	return ret;
 }
+#endif
+
+#if defined(CONFIG_UNMAPPED_PAGECACHE_CONTROL)
+/*
+ * Routine to reclaim unmapped pages, inspired from the code under
+ * CONFIG_NUMA that does unmapped page and slab page control by keeping
+ * min_unmapped_pages in the zone. We currently reclaim just unmapped
+ * pages, slab control will come in soon, at which point this routine
+ * should be called reclaim cached pages
+ */
+unsigned long reclaim_unmapped_pages(int priority, struct zone *zone,
+						struct scan_control *sc)
+{
+	if (unlikely(unmapped_page_control) &&
+		(zone_unmapped_file_pages(zone) > zone->min_unmapped_pages)) {
+		struct scan_control nsc;
+		unsigned long nr_pages;
+
+		nsc = *sc;
+
+		nsc.swappiness = 0;
+		nsc.may_writepage = 0;
+		nsc.may_unmap = 0;
+		nsc.nr_reclaimed = 0;
+
+		nr_pages = zone_unmapped_file_pages(zone) -
+				zone->min_unmapped_pages;
+		/*
+		 * We don't want to be too aggressive with our
+		 * reclaim, it is our best effort to control
+		 * unmapped pages
+		 */
+		nr_pages >>= 3;
+
+		zone_reclaim_pages(zone, &nsc, nr_pages);
+		return nsc.nr_reclaimed;
+	}
+	return 0;
+}
+
+bool should_reclaim_unmapped_pages(struct zone *zone)
+{
+	if (unlikely(unmapped_page_control) &&
+		(zone_unmapped_file_pages(zone) > zone->max_unmapped_pages))
+		return true;
+	return false;
+}
+#endif
 
 /*
  * page_evictable - test whether a page is evictable


^ permalink raw reply related	[flat|nested] 47+ messages in thread


* Re: [PATCH 1/2] Refactor zone_reclaim code (v4)
  2011-01-25  5:10 ` Balbir Singh
@ 2011-01-25  5:15   ` Balbir Singh
  -1 siblings, 0 replies; 47+ messages in thread
From: Balbir Singh @ 2011-01-25  5:15 UTC (permalink / raw)
  To: linux-mm, akpm
  Cc: npiggin, kvm, linux-kernel, kosaki.motohiro, cl, kamezawa.hiroyu

* Balbir Singh <balbir@linux.vnet.ibm.com> [2011-01-25 10:40:09]:

> Changelog v3
> 1. Renamed zone_reclaim_unmapped_pages to zone_reclaim_pages
> 
> Refactor zone_reclaim, move reusable functionality outside
> of zone_reclaim. Make zone_reclaim_unmapped_pages modular
> 
> Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
> Reviewed-by: Christoph Lameter <cl@linux.com>

I got the patch numbering wrong: an internet connection went down in
the middle of stg mail, and restarting with the specified patches
goofed up the numbering. I can resend the patches with the correct
numbering if desired. This patch should be numbered 2/3.

-- 
	Three Cheers,
	Balbir

^ permalink raw reply	[flat|nested] 47+ messages in thread


* Re: [PATCH 3/3] Provide control over unmapped pages (v4)
  2011-01-25  5:10   ` Balbir Singh
@ 2011-01-26 16:57     ` Christoph Lameter
  -1 siblings, 0 replies; 47+ messages in thread
From: Christoph Lameter @ 2011-01-26 16:57 UTC (permalink / raw)
  To: Balbir Singh
  Cc: linux-mm, akpm, npiggin, kvm, linux-kernel, kosaki.motohiro,
	kamezawa.hiroyu


Reviewed-by: Christoph Lameter <cl@linux.com>



^ permalink raw reply	[flat|nested] 47+ messages in thread


* Re: [PATCH 3/3] Provide control over unmapped pages (v4)
  2011-01-26 16:57     ` Christoph Lameter
@ 2011-01-26 17:43       ` Balbir Singh
  -1 siblings, 0 replies; 47+ messages in thread
From: Balbir Singh @ 2011-01-26 17:43 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: linux-mm, akpm, npiggin, kvm, linux-kernel, kosaki.motohiro,
	kamezawa.hiroyu

* Christoph Lameter <cl@linux.com> [2011-01-26 10:57:37]:

> 
> Reviewed-by: Christoph Lameter <cl@linux.com>
>

Thanks for the review! 

-- 
	Three Cheers,
	Balbir

^ permalink raw reply	[flat|nested] 47+ messages in thread


* Re: [PATCH 3/3] Provide control over unmapped pages (v4)
  2011-01-25  5:10   ` Balbir Singh
@ 2011-01-26 23:12     ` Minchan Kim
  -1 siblings, 0 replies; 47+ messages in thread
From: Minchan Kim @ 2011-01-26 23:12 UTC (permalink / raw)
  To: Balbir Singh
  Cc: linux-mm, akpm, npiggin, kvm, linux-kernel, kosaki.motohiro, cl,
	kamezawa.hiroyu

Hi Balbir,

On Tue, Jan 25, 2011 at 2:10 PM, Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> Changelog v4
> 1. Add max_unmapped_ratio and use that as the upper limit
> to check when to shrink the unmapped page cache (Christoph
> Lameter)
>
> Changelog v2
> 1. Use a config option to enable the code (Andrew Morton)
> 2. Explain the magic tunables in the code or at-least attempt
>   to explain them (General comment)
> 3. Hint uses of the boot parameter with unlikely (Andrew Morton)
> 4. Use better names (balanced is not a good naming convention)
>
> Provide control using zone_reclaim() and a boot parameter. The
> code reuses functionality from zone_reclaim() to isolate unmapped
> pages and reclaim them as a priority, ahead of other mapped pages.
> A new sysctl for max_unmapped_ratio is provided and set to 16,
> indicating 16% of the total zone pages are unmapped, we start
> shrinking unmapped page cache.
>
> Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
> ---
>  Documentation/kernel-parameters.txt |    8 +++
>  include/linux/mmzone.h              |    5 ++
>  include/linux/swap.h                |   23 ++++++++-
>  init/Kconfig                        |   12 +++++
>  kernel/sysctl.c                     |   11 ++++
>  mm/page_alloc.c                     |   25 ++++++++++
>  mm/vmscan.c                         |   87 +++++++++++++++++++++++++++++++++++
>  7 files changed, 166 insertions(+), 5 deletions(-)
>
> diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
> index fee5f57..65a4ee6 100644
> --- a/Documentation/kernel-parameters.txt
> +++ b/Documentation/kernel-parameters.txt
> @@ -2500,6 +2500,14 @@ and is between 256 and 4096 characters. It is defined in the file
>                        [X86]
>                        Set unknown_nmi_panic=1 early on boot.
>
> +       unmapped_page_control
> +                       [KNL] Available if CONFIG_UNMAPPED_PAGECACHE_CONTROL
> +                       is enabled. It controls the amount of unmapped memory
> +                       that is present in the system. This boot option plus
> +                       vm.min_unmapped_ratio (sysctl) provide granular control
> +                       over how much unmapped page cache can exist in the system
> +                       before kswapd starts reclaiming unmapped page cache pages.
> +
>        usbcore.autosuspend=
>                        [USB] The autosuspend time delay (in seconds) used
>                        for newly-detected USB devices (default 2).  This
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 2485acc..18f0f09 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -306,7 +306,10 @@ struct zone {
>        /*
>         * zone reclaim becomes active if more unmapped pages exist.
>         */
> +#if defined(CONFIG_UNMAPPED_PAGE_CONTROL) || defined(CONFIG_NUMA)
>        unsigned long           min_unmapped_pages;
> +       unsigned long           max_unmapped_pages;
> +#endif
>  #ifdef CONFIG_NUMA
>        int node;
>        unsigned long           min_slab_pages;
> @@ -773,6 +776,8 @@ int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int,
>                                        void __user *, size_t *, loff_t *);
>  int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int,
>                        void __user *, size_t *, loff_t *);
> +int sysctl_max_unmapped_ratio_sysctl_handler(struct ctl_table *, int,
> +                       void __user *, size_t *, loff_t *);
>  int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int,
>                        void __user *, size_t *, loff_t *);
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 7b75626..ae62a03 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -255,19 +255,34 @@ extern int vm_swappiness;
>  extern int remove_mapping(struct address_space *mapping, struct page *page);
>  extern long vm_total_pages;
>
> +#if defined(CONFIG_UNMAPPED_PAGECACHE_CONTROL) || defined(CONFIG_NUMA)
>  extern int sysctl_min_unmapped_ratio;
> +extern int sysctl_max_unmapped_ratio;
> +
>  extern int zone_reclaim(struct zone *, gfp_t, unsigned int);
> -#ifdef CONFIG_NUMA
> -extern int zone_reclaim_mode;
> -extern int sysctl_min_slab_ratio;
>  #else
> -#define zone_reclaim_mode 0
>  static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order)
>  {
>        return 0;
>  }
>  #endif
>
> +#if defined(CONFIG_UNMAPPED_PAGECACHE_CONTROL)
> +extern bool should_reclaim_unmapped_pages(struct zone *zone);
> +#else
> +static inline bool should_reclaim_unmapped_pages(struct zone *zone)
> +{
> +       return false;
> +}
> +#endif
> +
> +#ifdef CONFIG_NUMA
> +extern int zone_reclaim_mode;
> +extern int sysctl_min_slab_ratio;
> +#else
> +#define zone_reclaim_mode 0
> +#endif
> +
>  extern int page_evictable(struct page *page, struct vm_area_struct *vma);
>  extern void scan_mapping_unevictable_pages(struct address_space *);
>
> diff --git a/init/Kconfig b/init/Kconfig
> index 4f6cdbf..2dfbc09 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -828,6 +828,18 @@ config SCHED_AUTOGROUP
>  config MM_OWNER
>        bool
>
> +config UNMAPPED_PAGECACHE_CONTROL
> +       bool "Provide control over unmapped page cache"
> +       default n
> +       help
> +         This option adds support for controlling unmapped page cache
> +         via a boot parameter (unmapped_page_control). The boot parameter
> +         with sysctl (vm.min_unmapped_ratio) control the total number
> +         of unmapped pages in the system. This feature is useful if
> +         you want to limit the amount of unmapped page cache or want
> +         to reduce page cache duplication in a virtualized environment.
> +         If unsure say 'N'
> +
>  config SYSFS_DEPRECATED
>        bool "enable deprecated sysfs features to support old userspace tools"
>        depends on SYSFS
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index 12e8f26..63dbba6 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -1224,6 +1224,7 @@ static struct ctl_table vm_table[] = {
>                .extra1         = &zero,
>        },
>  #endif
> +#if defined(CONFIG_UNMAPPED_PAGE_CONTROL) || defined(CONFIG_NUMA)
>        {
>                .procname       = "min_unmapped_ratio",
>                .data           = &sysctl_min_unmapped_ratio,
> @@ -1233,6 +1234,16 @@ static struct ctl_table vm_table[] = {
>                .extra1         = &zero,
>                .extra2         = &one_hundred,
>        },
> +       {
> +               .procname       = "max_unmapped_ratio",
> +               .data           = &sysctl_max_unmapped_ratio,
> +               .maxlen         = sizeof(sysctl_max_unmapped_ratio),
> +               .mode           = 0644,
> +               .proc_handler   = sysctl_max_unmapped_ratio_sysctl_handler,
> +               .extra1         = &zero,
> +               .extra2         = &one_hundred,
> +       },
> +#endif
>  #ifdef CONFIG_NUMA
>        {
>                .procname       = "zone_reclaim_mode",
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 7b56473..2ac8549 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1660,6 +1660,9 @@ zonelist_scan:
>                        unsigned long mark;
>                        int ret;
>
> +                       if (should_reclaim_unmapped_pages(zone))
> +                               wakeup_kswapd(zone, order, classzone_idx);
> +

Do we really need the check in the fast path?
There are lots of callers of alloc_pages, and many of them are not
related to mapped pages.
Could we move the check into add_to_page_cache_locked?
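
Something like this rough, untested sketch is what I mean; the helper
name is made up, and the exact call site inside
add_to_page_cache_locked() would need checking:

/*
 * Rough sketch only, untested: a helper that add_to_page_cache_locked()
 * could call instead of checking in the allocator fast path. Page cache
 * insertion is the only place the unmapped page cache grows, so the
 * threshold test would run far less often here.
 */
static inline void maybe_reclaim_unmapped_page_cache(struct page *page)
{
	struct zone *zone = page_zone(page);

	if (should_reclaim_unmapped_pages(zone))
		wakeup_kswapd(zone, 0, zone_idx(zone));
}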

>                        mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
>                        if (zone_watermark_ok(zone, order, mark,
>                                    classzone_idx, alloc_flags))
> @@ -4167,8 +4170,12 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
>
>                zone->spanned_pages = size;
>                zone->present_pages = realsize;
> +#if defined(CONFIG_UNMAPPED_PAGE_CONTROL) || defined(CONFIG_NUMA)
>                zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
>                                                / 100;
> +               zone->max_unmapped_pages = (realsize*sysctl_max_unmapped_ratio)
> +                                               / 100;
> +#endif
>  #ifdef CONFIG_NUMA
>                zone->node = nid;
>                zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
> @@ -5084,6 +5091,7 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
>        return 0;
>  }
>
> +#if defined(CONFIG_UNMAPPED_PAGE_CONTROL) || defined(CONFIG_NUMA)
>  int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
>        void __user *buffer, size_t *length, loff_t *ppos)
>  {
> @@ -5100,6 +5108,23 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
>        return 0;
>  }
>
> +int sysctl_max_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
> +       void __user *buffer, size_t *length, loff_t *ppos)
> +{
> +       struct zone *zone;
> +       int rc;
> +
> +       rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
> +       if (rc)
> +               return rc;
> +
> +       for_each_zone(zone)
> +               zone->max_unmapped_pages = (zone->present_pages *
> +                               sysctl_max_unmapped_ratio) / 100;
> +       return 0;
> +}
> +#endif
> +
>  #ifdef CONFIG_NUMA
>  int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
>        void __user *buffer, size_t *length, loff_t *ppos)
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 02cc82e..6377411 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -159,6 +159,29 @@ static DECLARE_RWSEM(shrinker_rwsem);
>  #define scanning_global_lru(sc)        (1)
>  #endif
>
> +#if defined(CONFIG_UNMAPPED_PAGECACHE_CONTROL)
> +static unsigned long reclaim_unmapped_pages(int priority, struct zone *zone,
> +                                               struct scan_control *sc);
> +static int unmapped_page_control __read_mostly;
> +
> +static int __init unmapped_page_control_parm(char *str)
> +{
> +       unmapped_page_control = 1;
> +       /*
> +        * XXX: Should we tweak swappiness here?
> +        */
> +       return 1;
> +}
> +__setup("unmapped_page_control", unmapped_page_control_parm);
> +
> +#else /* !CONFIG_UNMAPPED_PAGECACHE_CONTROL */
> +static inline unsigned long reclaim_unmapped_pages(int priority,
> +                               struct zone *zone, struct scan_control *sc)
> +{
> +       return 0;
> +}
> +#endif
> +
>  static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone,
>                                                  struct scan_control *sc)
>  {
> @@ -2359,6 +2382,12 @@ loop_again:
>                                shrink_active_list(SWAP_CLUSTER_MAX, zone,
>                                                        &sc, priority, 0);
>
> +                       /*
> +                        * We do unmapped page reclaim once here and once
> +                        * below, so that we don't lose out
> +                        */
> +                       reclaim_unmapped_pages(priority, zone, &sc);
> +
>                        if (!zone_watermark_ok_safe(zone, order,
>                                        high_wmark_pages(zone), 0, 0)) {
>                                end_zone = i;
> @@ -2396,6 +2425,11 @@ loop_again:
>                                continue;
>
>                        sc.nr_scanned = 0;
> +                       /*
> +                        * Reclaim unmapped pages upfront, this should be
> +                        * really cheap
> +                        */
> +                       reclaim_unmapped_pages(priority, zone, &sc);

Why should we do this in two phases?
This is not the direct reclaim path, so it doesn't need to reclaim
tightly. If we can't reclaim enough, the next allocation will wake up
kswapd again and kswapd will try again.

And I have a concern that I already pointed out.
If memory pressure is heavy and unmapped pages exceed our threshold,
reclaim_unmapped_pages can move mapped pages from the tail of the
inactive list to its head. That confuses the LRU order, so the working
set can be evicted.

zone_reclaim has been used only by NUMA until now, but you are opening
it up to the whole world. I think it would be a good feature for
embedded systems, too. I hope we take care of the working set eviction
problem.

-- 
Kind regards,
Minchan Kim

^ permalink raw reply	[flat|nested] 47+ messages in thread

>
>                        sc.nr_scanned = 0;
> +                       /*
> +                        * Reclaim unmapped pages upfront, this should be
> +                        * really cheap
> +                        */
> +                       reclaim_unmapped_pages(priority, zone, &sc);

Why should we do this in two phases?
It's not the direct reclaim path, so it doesn't need to reclaim tightly.
If we can't reclaim enough, the next allocation will wake up kswapd again
and kswapd will try again.

And I have a concern I already pointed out.
If memory pressure is heavy and unmapped_pages is above our
threshold, reclaim_unmapped_pages can move mapped pages from the
tail of the inactive list to its head. That confuses the LRU order,
so the working set can be evicted.

zone_reclaim has been used only on NUMA until now, but you are opening it up to the whole world.
I think it would be a good feature for embedded systems, too.
I hope we take care of the working set eviction problem.

-- 
Kind regards,
Minchan Kim

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH 3/3] Provide control over unmapped pages (v4)
  2011-01-26 23:12     ` Minchan Kim
@ 2011-01-28  2:56       ` Balbir Singh
  -1 siblings, 0 replies; 47+ messages in thread
From: Balbir Singh @ 2011-01-28  2:56 UTC (permalink / raw)
  To: Minchan Kim
  Cc: linux-mm, akpm, npiggin, kvm, linux-kernel, kosaki.motohiro, cl,
	kamezawa.hiroyu

On Thu, Jan 27, 2011 at 4:42 AM, Minchan Kim <minchan.kim@gmail.com> wrote:
[snip]

>> index 7b56473..2ac8549 100644
>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -1660,6 +1660,9 @@ zonelist_scan:
>>                        unsigned long mark;
>>                        int ret;
>>
>> +                       if (should_reclaim_unmapped_pages(zone))
>> +                               wakeup_kswapd(zone, order, classzone_idx);
>> +
>
> Do we really need this check in the fast path?
> There are lots of callers of alloc_pages, and
> many of them are not related to mapped pages.
> Could we move the check into add_to_page_cache_locked?

The check is a simple one to see if the unmapped pages need
balancing; the reason I placed it here is to allow other
allocations to benefit as well, if there are some unmapped pages to
be freed. Checking in add_to_page_cache_locked (under a critical
section) is even worse, IMHO.
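For readers without patch 3/3 at hand, the "simple check" is, in spirit, a
comparison of a zone's unmapped file pages against its max threshold. The
body below is only a sketch of that idea (the real implementation is not
quoted in this part of the thread), reusing names the patches introduce:

/*
 * Sketch only -- not the posted implementation.  unmapped_page_control
 * and zone->max_unmapped_pages come from this patch set; the exact
 * comparison is an assumption based on how the check is described.
 */
bool should_reclaim_unmapped_pages(struct zone *zone)
{
	if (!unmapped_page_control)
		return false;

	return zone_unmapped_file_pages(zone) > zone->max_unmapped_pages;
}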


>
>>                        mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
>>                        if (zone_watermark_ok(zone, order, mark,
>>                                    classzone_idx, alloc_flags))
>> @@ -4167,8 +4170,12 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
>>
>>                zone->spanned_pages = size;
>>                zone->present_pages = realsize;
>> +#if defined(CONFIG_UNMAPPED_PAGE_CONTROL) || defined(CONFIG_NUMA)
>>                zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
>>                                                / 100;
>> +               zone->max_unmapped_pages = (realsize*sysctl_max_unmapped_ratio)
>> +                                               / 100;
>> +#endif
>>  #ifdef CONFIG_NUMA
>>                zone->node = nid;
>>                zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
>> @@ -5084,6 +5091,7 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
>>        return 0;
>>  }
>>
>> +#if defined(CONFIG_UNMAPPED_PAGE_CONTROL) || defined(CONFIG_NUMA)
>>  int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
>>        void __user *buffer, size_t *length, loff_t *ppos)
>>  {
>> @@ -5100,6 +5108,23 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
>>        return 0;
>>  }
>>
>> +int sysctl_max_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
>> +       void __user *buffer, size_t *length, loff_t *ppos)
>> +{
>> +       struct zone *zone;
>> +       int rc;
>> +
>> +       rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
>> +       if (rc)
>> +               return rc;
>> +
>> +       for_each_zone(zone)
>> +               zone->max_unmapped_pages = (zone->present_pages *
>> +                               sysctl_max_unmapped_ratio) / 100;
>> +       return 0;
>> +}
>> +#endif
>> +
>>  #ifdef CONFIG_NUMA
>>  int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
>>        void __user *buffer, size_t *length, loff_t *ppos)
>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>> index 02cc82e..6377411 100644
>> --- a/mm/vmscan.c
>> +++ b/mm/vmscan.c
>> @@ -159,6 +159,29 @@ static DECLARE_RWSEM(shrinker_rwsem);
>>  #define scanning_global_lru(sc)        (1)
>>  #endif
>>
>> +#if defined(CONFIG_UNMAPPED_PAGECACHE_CONTROL)
>> +static unsigned long reclaim_unmapped_pages(int priority, struct zone *zone,
>> +                                               struct scan_control *sc);
>> +static int unmapped_page_control __read_mostly;
>> +
>> +static int __init unmapped_page_control_parm(char *str)
>> +{
>> +       unmapped_page_control = 1;
>> +       /*
>> +        * XXX: Should we tweak swappiness here?
>> +        */
>> +       return 1;
>> +}
>> +__setup("unmapped_page_control", unmapped_page_control_parm);
>> +
>> +#else /* !CONFIG_UNMAPPED_PAGECACHE_CONTROL */
>> +static inline unsigned long reclaim_unmapped_pages(int priority,
>> +                               struct zone *zone, struct scan_control *sc)
>> +{
>> +       return 0;
>> +}
>> +#endif
>> +
>>  static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone,
>>                                                  struct scan_control *sc)
>>  {
>> @@ -2359,6 +2382,12 @@ loop_again:
>>                                shrink_active_list(SWAP_CLUSTER_MAX, zone,
>>                                                        &sc, priority, 0);
>>
>> +                       /*
>> +                        * We do unmapped page reclaim once here and once
>> +                        * below, so that we don't lose out
>> +                        */
>> +                       reclaim_unmapped_pages(priority, zone, &sc);
>> +
>>                        if (!zone_watermark_ok_safe(zone, order,
>>                                        high_wmark_pages(zone), 0, 0)) {
>>                                end_zone = i;
>> @@ -2396,6 +2425,11 @@ loop_again:
>>                                continue;
>>
>>                        sc.nr_scanned = 0;
>> +                       /*
>> +                        * Reclaim unmapped pages upfront, this should be
>> +                        * really cheap
>> +                        */
>> +                       reclaim_unmapped_pages(priority, zone, &sc);
>
> Why should we do this in two phases?
> It's not the direct reclaim path, so it doesn't need to reclaim tightly.
> If we can't reclaim enough, the next allocation will wake up kswapd again
> and kswapd will try again.
>

I am not sure I understand; the wakeup will occur only if the unmapped
pages are still above max_unmapped_ratio. These are tunable control
points.
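For a sense of scale, the quoted free_area_init_core() hunk computes
max_unmapped_pages = present_pages * sysctl_max_unmapped_ratio / 100, so
with a ratio of, say, 16 (an arbitrary example value, not a default from
the patches), a 1 GB zone of 262144 4 KiB pages gets a threshold of
262144 * 16 / 100 = 41943 pages, about 164 MiB of unmapped page cache
before the wakeup fires.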

> And I have a concern I already pointed out.
> If memory pressure is heavy and unmapped_pages is above our
> threshold, reclaim_unmapped_pages can move mapped pages from the
> tail of the inactive list to its head. That confuses the LRU order,
> so the working set can be evicted.
>

Sorry, I am not sure I understand completely. Is the LRU order disrupted
because we selectively scan unmapped pages, and shrink_page_list() will
ignore mapped pages and put them back at the head of the LRU? Here is a
quick take on what happens.

zone_reclaim() will be invoked as a result of these patches, and the
number of pages it tries to reclaim is very small (1 << order). The
active list will be shrunk only when the inactive anon or inactive file
list is low in size. I don't see major churn happening unless we keep
failing to reclaim unmapped pages. In any case we isolate inactive pages
and try to reclaim minimal memory, so the churn is mostly in the
inactive list when a page is not reclaimed (am I missing anything?).
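(For scale: with a target of 1 << order pages, an order-0 page cache
allocation asks zone_reclaim() to free a single page, and even an order-3
request asks for only 8 pages.)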


> zone_reclaim has been used only on NUMA until now, but you are opening it up to the whole world.
> I think it would be a good feature for embedded systems, too.
> I hope we take care of the working set eviction problem.
>

I hope the above answers your questions

Balbir

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH 3/3] Provide control over unmapped pages (v4)
  2011-01-28  2:56       ` Balbir Singh
@ 2011-01-28  5:44         ` Minchan Kim
  -1 siblings, 0 replies; 47+ messages in thread
From: Minchan Kim @ 2011-01-28  5:44 UTC (permalink / raw)
  To: Balbir Singh
  Cc: linux-mm, akpm, npiggin, kvm, linux-kernel, kosaki.motohiro, cl,
	kamezawa.hiroyu

On Fri, Jan 28, 2011 at 11:56 AM, Balbir Singh
<balbir@linux.vnet.ibm.com> wrote:
> On Thu, Jan 27, 2011 at 4:42 AM, Minchan Kim <minchan.kim@gmail.com> wrote:
> [snip]
>
>>> index 7b56473..2ac8549 100644
>>> --- a/mm/page_alloc.c
>>> +++ b/mm/page_alloc.c
>>> @@ -1660,6 +1660,9 @@ zonelist_scan:
>>>                        unsigned long mark;
>>>                        int ret;
>>>
>>> +                       if (should_reclaim_unmapped_pages(zone))
>>> +                               wakeup_kswapd(zone, order, classzone_idx);
>>> +
>>
>> Do we really need this check in the fast path?
>> There are lots of callers of alloc_pages, and
>> many of them are not related to mapped pages.
>> Could we move the check into add_to_page_cache_locked?
>
> The check is a simple one to see if the unmapped pages need
> balancing; the reason I placed it here is to allow other
> allocations to benefit as well, if there are some unmapped pages to
> be freed. Checking in add_to_page_cache_locked (under a critical
> section) is even worse, IMHO.

It just moves the overhead from the general case into a specific one
(i.e., allocating a page for the page cache).
Other cases (i.e., pages allocated for other purposes, such as device
drivers or internal filesystem allocations) aren't affected.
So it would be better.

The goal of this patch is to reclaim only page cache pages, isn't it?
So I think we could do the balance check in add_to_page_cache and trigger
reclaim there. If we do so, what's the problem?
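As a rough illustration of that alternative (not code from the posted
patches; the helper name, its placement and the zone_idx() argument are
assumptions), the hook could look something like this:

/*
 * Illustrative only: wake kswapd from the page cache insertion path
 * instead of the allocator fast path.  should_reclaim_unmapped_pages()
 * is the helper added by this series.
 */
static void balance_unmapped_pages(struct page *page)
{
	struct zone *zone = page_zone(page);

	if (should_reclaim_unmapped_pages(zone))
		wakeup_kswapd(zone, 0, zone_idx(zone));
}

/* ... called from add_to_page_cache_locked() once the page is accounted. */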

>
>
>>
>>>                        mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
>>>                        if (zone_watermark_ok(zone, order, mark,
>>>                                    classzone_idx, alloc_flags))
>>> @@ -4167,8 +4170,12 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
>>>
>>>                zone->spanned_pages = size;
>>>                zone->present_pages = realsize;
>>> +#if defined(CONFIG_UNMAPPED_PAGE_CONTROL) || defined(CONFIG_NUMA)
>>>                zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
>>>                                                / 100;
>>> +               zone->max_unmapped_pages = (realsize*sysctl_max_unmapped_ratio)
>>> +                                               / 100;
>>> +#endif
>>>  #ifdef CONFIG_NUMA
>>>                zone->node = nid;
>>>                zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
>>> @@ -5084,6 +5091,7 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
>>>        return 0;
>>>  }
>>>
>>> +#if defined(CONFIG_UNMAPPED_PAGE_CONTROL) || defined(CONFIG_NUMA)
>>>  int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
>>>        void __user *buffer, size_t *length, loff_t *ppos)
>>>  {
>>> @@ -5100,6 +5108,23 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
>>>        return 0;
>>>  }
>>>
>>> +int sysctl_max_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
>>> +       void __user *buffer, size_t *length, loff_t *ppos)
>>> +{
>>> +       struct zone *zone;
>>> +       int rc;
>>> +
>>> +       rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
>>> +       if (rc)
>>> +               return rc;
>>> +
>>> +       for_each_zone(zone)
>>> +               zone->max_unmapped_pages = (zone->present_pages *
>>> +                               sysctl_max_unmapped_ratio) / 100;
>>> +       return 0;
>>> +}
>>> +#endif
>>> +
>>>  #ifdef CONFIG_NUMA
>>>  int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
>>>        void __user *buffer, size_t *length, loff_t *ppos)
>>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>>> index 02cc82e..6377411 100644
>>> --- a/mm/vmscan.c
>>> +++ b/mm/vmscan.c
>>> @@ -159,6 +159,29 @@ static DECLARE_RWSEM(shrinker_rwsem);
>>>  #define scanning_global_lru(sc)        (1)
>>>  #endif
>>>
>>> +#if defined(CONFIG_UNMAPPED_PAGECACHE_CONTROL)
>>> +static unsigned long reclaim_unmapped_pages(int priority, struct zone *zone,
>>> +                                               struct scan_control *sc);
>>> +static int unmapped_page_control __read_mostly;
>>> +
>>> +static int __init unmapped_page_control_parm(char *str)
>>> +{
>>> +       unmapped_page_control = 1;
>>> +       /*
>>> +        * XXX: Should we tweak swappiness here?
>>> +        */
>>> +       return 1;
>>> +}
>>> +__setup("unmapped_page_control", unmapped_page_control_parm);
>>> +
>>> +#else /* !CONFIG_UNMAPPED_PAGECACHE_CONTROL */
>>> +static inline unsigned long reclaim_unmapped_pages(int priority,
>>> +                               struct zone *zone, struct scan_control *sc)
>>> +{
>>> +       return 0;
>>> +}
>>> +#endif
>>> +
>>>  static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone,
>>>                                                  struct scan_control *sc)
>>>  {
>>> @@ -2359,6 +2382,12 @@ loop_again:
>>>                                shrink_active_list(SWAP_CLUSTER_MAX, zone,
>>>                                                        &sc, priority, 0);
>>>
>>> +                       /*
>>> +                        * We do unmapped page reclaim once here and once
>>> +                        * below, so that we don't lose out
>>> +                        */
>>> +                       reclaim_unmapped_pages(priority, zone, &sc);
>>> +
>>>                        if (!zone_watermark_ok_safe(zone, order,
>>>                                        high_wmark_pages(zone), 0, 0)) {
>>>                                end_zone = i;
>>> @@ -2396,6 +2425,11 @@ loop_again:
>>>                                continue;
>>>
>>>                        sc.nr_scanned = 0;
>>> +                       /*
>>> +                        * Reclaim unmapped pages upfront, this should be
>>> +                        * really cheap
>>> +                        */
>>> +                       reclaim_unmapped_pages(priority, zone, &sc);
>>
>> Why should we do this in two phases?
>> It's not the direct reclaim path, so it doesn't need to reclaim tightly.
>> If we can't reclaim enough, the next allocation will wake up kswapd again
>> and kswapd will try again.
>>
>
> I am not sure I understand; the wakeup will occur only if the unmapped
> pages are still above max_unmapped_ratio. These are tunable control
> points.

I mean you try to reclaim twice in one path:
once when selecting the highest zone to reclaim, and
once when the VM reclaims the zone.

What's your intention?


>
>> And I have a concern I already pointed out.
>> If memory pressure is heavy and unmapped_pages is above our
>> threshold, reclaim_unmapped_pages can move mapped pages from the
>> tail of the inactive list to its head. That confuses the LRU order,
>> so the working set can be evicted.
>>
>
> Sorry, I am not sure I understand completely. Is the LRU order disrupted
> because we selectively scan unmapped pages, and shrink_page_list() will
> ignore mapped pages and put them back at the head of the LRU? Here is a
> quick take on what happens.
>
> zone_reclaim() will be invoked as a result of these patches, and the
> number of pages it tries to reclaim is very small (1 << order). The
> active list will be shrunk only when the inactive anon or inactive file
> list is low in size. I don't see major churn happening unless we keep
> failing to reclaim unmapped pages. In any case we isolate inactive pages
> and try to reclaim minimal memory, so the churn is mostly in the
> inactive list when a page is not reclaimed (am I missing anything?).

You understand my question completely. :)
In the inactive list the ordering of pages matters, too, although lumpy
reclaim and compaction weaken it as time goes by.
If the threshold goes up and down frequently, victim pages in the
inactive list could move to the head, and that's not good.
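Spelled out, the rotation being worried about is the putback path; a
condensed, assumed view (not code quoted in this thread):

/*
 *   shrink_inactive_list()
 *     isolate pages from the tail of the inactive LRU
 *     shrink_page_list()
 *       mapped pages we are told to skip  -> kept
 *       pages that fail to be freed       -> kept
 *     putback_lru_pages()
 *       kept pages re-enter the LRU at the head
 *
 * so a mapped page that was near the tail, about to expire, can be
 * rotated back to the head every time the unmapped threshold triggers
 * a scan.
 */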


>
>
>> zone_reclaim has been used only on NUMA until now, but you are opening it up to the whole world.
>> I think it would be a good feature for embedded systems, too.
>> I hope we take care of the working set eviction problem.
>>
>
> I hope the above answers your questions
>
> Balbir
>



-- 
Kind regards,
Minchan Kim

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH 3/3] Provide control over unmapped pages (v4)
  2011-01-28  5:44         ` Minchan Kim
  (?)
@ 2011-01-28  6:48           ` Balbir Singh
  -1 siblings, 0 replies; 47+ messages in thread
From: Balbir Singh @ 2011-01-28  6:48 UTC (permalink / raw)
  To: Minchan Kim
  Cc: linux-mm, akpm, npiggin, kvm, linux-kernel, kosaki.motohiro, cl,
	kamezawa.hiroyu

* Minchan Kim <minchan.kim@gmail.com> [2011-01-28 14:44:50]:

> On Fri, Jan 28, 2011 at 11:56 AM, Balbir Singh
> <balbir@linux.vnet.ibm.com> wrote:
> > On Thu, Jan 27, 2011 at 4:42 AM, Minchan Kim <minchan.kim@gmail.com> wrote:
> > [snip]
> >
> >>> index 7b56473..2ac8549 100644
> >>> --- a/mm/page_alloc.c
> >>> +++ b/mm/page_alloc.c
> >>> @@ -1660,6 +1660,9 @@ zonelist_scan:
> >>>                        unsigned long mark;
> >>>                        int ret;
> >>>
> >>> +                       if (should_reclaim_unmapped_pages(zone))
> >>> +                               wakeup_kswapd(zone, order, classzone_idx);
> >>> +
> >>
> >> Do we really need this check in the fast path?
> >> There are lots of callers of alloc_pages, and
> >> many of them are not related to mapped pages.
> >> Could we move the check into add_to_page_cache_locked?
> >
> > The check is a simple one to see if the unmapped pages need
> > balancing; the reason I placed it here is to allow other
> > allocations to benefit as well, if there are some unmapped pages to
> > be freed. Checking in add_to_page_cache_locked (under a critical
> > section) is even worse, IMHO.
> 
> It just moves the overhead from the general case into a specific one
> (i.e., allocating a page for the page cache).
> Other cases (i.e., pages allocated for other purposes, such as device
> drivers or internal filesystem allocations) aren't affected.
> So it would be better.
> 
> The goal of this patch is to reclaim only page cache pages, isn't it?
> So I think we could do the balance check in add_to_page_cache and trigger
> reclaim there. If we do so, what's the problem?
>

I see it as a tradeoff of when to check: in add_to_page_cache, or when
we want more free memory (due to allocation). It is OK to wake up
kswapd while allocating memory; somehow, for this purpose (global page
cache), add_to_page_cache or add_to_page_cache_locked does not seem
the right place to hook into. I'd be open to comments/suggestions
from others as well.
 
> >
> >
> >>
> >>>                        mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
> >>>                        if (zone_watermark_ok(zone, order, mark,
> >>>                                    classzone_idx, alloc_flags))
> >>> @@ -4167,8 +4170,12 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
> >>>
> >>>                zone->spanned_pages = size;
> >>>                zone->present_pages = realsize;
> >>> +#if defined(CONFIG_UNMAPPED_PAGE_CONTROL) || defined(CONFIG_NUMA)
> >>>                zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
> >>>                                                / 100;
> >>> +               zone->max_unmapped_pages = (realsize*sysctl_max_unmapped_ratio)
> >>> +                                               / 100;
> >>> +#endif
> >>>  #ifdef CONFIG_NUMA
> >>>                zone->node = nid;
> >>>                zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
> >>> @@ -5084,6 +5091,7 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
> >>>        return 0;
> >>>  }
> >>>
> >>> +#if defined(CONFIG_UNMAPPED_PAGE_CONTROL) || defined(CONFIG_NUMA)
> >>>  int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
> >>>        void __user *buffer, size_t *length, loff_t *ppos)
> >>>  {
> >>> @@ -5100,6 +5108,23 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
> >>>        return 0;
> >>>  }
> >>>
> >>> +int sysctl_max_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
> >>> +       void __user *buffer, size_t *length, loff_t *ppos)
> >>> +{
> >>> +       struct zone *zone;
> >>> +       int rc;
> >>> +
> >>> +       rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
> >>> +       if (rc)
> >>> +               return rc;
> >>> +
> >>> +       for_each_zone(zone)
> >>> +               zone->max_unmapped_pages = (zone->present_pages *
> >>> +                               sysctl_max_unmapped_ratio) / 100;
> >>> +       return 0;
> >>> +}
> >>> +#endif
> >>> +
> >>>  #ifdef CONFIG_NUMA
> >>>  int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
> >>>        void __user *buffer, size_t *length, loff_t *ppos)
> >>> diff --git a/mm/vmscan.c b/mm/vmscan.c
> >>> index 02cc82e..6377411 100644
> >>> --- a/mm/vmscan.c
> >>> +++ b/mm/vmscan.c
> >>> @@ -159,6 +159,29 @@ static DECLARE_RWSEM(shrinker_rwsem);
> >>>  #define scanning_global_lru(sc)        (1)
> >>>  #endif
> >>>
> >>> +#if defined(CONFIG_UNMAPPED_PAGECACHE_CONTROL)
> >>> +static unsigned long reclaim_unmapped_pages(int priority, struct zone *zone,
> >>> +                                               struct scan_control *sc);
> >>> +static int unmapped_page_control __read_mostly;
> >>> +
> >>> +static int __init unmapped_page_control_parm(char *str)
> >>> +{
> >>> +       unmapped_page_control = 1;
> >>> +       /*
> >>> +        * XXX: Should we tweak swappiness here?
> >>> +        */
> >>> +       return 1;
> >>> +}
> >>> +__setup("unmapped_page_control", unmapped_page_control_parm);
> >>> +
> >>> +#else /* !CONFIG_UNMAPPED_PAGECACHE_CONTROL */
> >>> +static inline unsigned long reclaim_unmapped_pages(int priority,
> >>> +                               struct zone *zone, struct scan_control *sc)
> >>> +{
> >>> +       return 0;
> >>> +}
> >>> +#endif
> >>> +
> >>>  static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone,
> >>>                                                  struct scan_control *sc)
> >>>  {
> >>> @@ -2359,6 +2382,12 @@ loop_again:
> >>>                                shrink_active_list(SWAP_CLUSTER_MAX, zone,
> >>>                                                        &sc, priority, 0);
> >>>
> >>> +                       /*
> >>> +                        * We do unmapped page reclaim once here and once
> >>> +                        * below, so that we don't lose out
> >>> +                        */
> >>> +                       reclaim_unmapped_pages(priority, zone, &sc);
> >>> +
> >>>                        if (!zone_watermark_ok_safe(zone, order,
> >>>                                        high_wmark_pages(zone), 0, 0)) {
> >>>                                end_zone = i;
> >>> @@ -2396,6 +2425,11 @@ loop_again:
> >>>                                continue;
> >>>
> >>>                        sc.nr_scanned = 0;
> >>> +                       /*
> >>> +                        * Reclaim unmapped pages upfront, this should be
> >>> +                        * really cheap
> >>> +                        */
> >>> +                       reclaim_unmapped_pages(priority, zone, &sc);
> >>
> >> Why should we do this in two phases?
> >> It's not the direct reclaim path, so it doesn't need to reclaim tightly.
> >> If we can't reclaim enough, the next allocation will wake up kswapd again
> >> and kswapd will try again.
> >>
> >
> > I am not sure I understand; the wakeup will occur only if the unmapped
> > pages are still above max_unmapped_ratio. These are tunable control
> > points.
> 
> I mean you try to reclaim twice in one path:
> once when selecting the highest zone to reclaim, and
> once when the VM reclaims the zone.
> 
> What's your intention?
> 

That is because some zones can be skipped; we need to ensure we go
through all zones rather than only selected ones (limited via the
search for end_zone).
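In other words, the two call sites cover kswapd's two passes over the
zones; a simplified, assumed outline of balance_pgdat() (details trimmed):

/*
 * for (priority = DEF_PRIORITY; priority >= 0; priority--) {
 *         // pass 1: walk zones top-down to pick end_zone
 *         for (i = pgdat->nr_zones - 1; i >= 0; i--) {
 *                 reclaim_unmapped_pages(priority, zone, &sc);    <- 1st site
 *                 if (!zone_watermark_ok_safe(...)) {
 *                         end_zone = i;
 *                         break;
 *                 }
 *         }
 *         // pass 2: reclaim zones 0..end_zone
 *         for (i = 0; i <= end_zone; i++) {
 *                 sc.nr_scanned = 0;
 *                 reclaim_unmapped_pages(priority, zone, &sc);    <- 2nd site
 *                 shrink_zone(priority, zone, &sc);
 *         }
 * }
 *
 * Zones above end_zone never reach pass 2, which is why the check is
 * repeated in pass 1.
 */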

> 
> >
> >> And I have a concern I already pointed out.
> >> If memory pressure is heavy and unmapped_pages is above our
> >> threshold, reclaim_unmapped_pages can move mapped pages from the
> >> tail of the inactive list to its head. That confuses the LRU order,
> >> so the working set can be evicted.
> >>
> >
> > Sorry, I am not sure I understand completely. Is the LRU order disrupted
> > because we selectively scan unmapped pages, and shrink_page_list() will
> > ignore mapped pages and put them back at the head of the LRU? Here is a
> > quick take on what happens.
> >
> > zone_reclaim() will be invoked as a result of these patches, and the
> > number of pages it tries to reclaim is very small (1 << order). The
> > active list will be shrunk only when the inactive anon or inactive file
> > list is low in size. I don't see major churn happening unless we keep
> > failing to reclaim unmapped pages. In any case we isolate inactive pages
> > and try to reclaim minimal memory, so the churn is mostly in the
> > inactive list when a page is not reclaimed (am I missing anything?).
> 
> You understand my question completely. :)
> In the inactive list the ordering of pages matters, too, although lumpy
> reclaim and compaction weaken it as time goes by.
> If the threshold goes up and down frequently, victim pages in the
> inactive list could move to the head, and that's not good.

But the assumption is that the LRU order changes only if the page
cannot be successfully freed, which means it is in some way active
and arguably needs to be moved, no?

Thanks for the detailed review!

-- 
	Three Cheers,
	Balbir

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH 3/3] Provide control over unmapped pages (v4)
@ 2011-01-28  6:48           ` Balbir Singh
  0 siblings, 0 replies; 47+ messages in thread
From: Balbir Singh @ 2011-01-28  6:48 UTC (permalink / raw)
  To: Minchan Kim
  Cc: linux-mm, akpm, npiggin, kvm, linux-kernel, kosaki.motohiro, cl,
	kamezawa.hiroyu

* MinChan Kim <minchan.kim@gmail.com> [2011-01-28 14:44:50]:

> On Fri, Jan 28, 2011 at 11:56 AM, Balbir Singh
> <balbir@linux.vnet.ibm.com> wrote:
> > On Thu, Jan 27, 2011 at 4:42 AM, Minchan Kim <minchan.kim@gmail.com> wrote:
> > [snip]
> >
> >>> index 7b56473..2ac8549 100644
> >>> --- a/mm/page_alloc.c
> >>> +++ b/mm/page_alloc.c
> >>> @@ -1660,6 +1660,9 @@ zonelist_scan:
> >>>                        unsigned long mark;
> >>>                        int ret;
> >>>
> >>> +                       if (should_reclaim_unmapped_pages(zone))
> >>> +                               wakeup_kswapd(zone, order, classzone_idx);
> >>> +
> >>
> >> Do we really need the check in fastpath?
> >> There are lost of caller of alloc_pages.
> >> Many of them are not related to mapped pages.
> >> Could we move the check into add_to_page_cache_locked?
> >
> > The check is a simple check to see if the unmapped pages need
> > balancing, the reason I placed this check here is to allow other
> > allocations to benefit as well, if there are some unmapped pages to be
> > freed. add_to_page_cache_locked (check under a critical section) is
> > even worse, IMHO.
> 
> It just moves the overhead from general into specific case(ie,
> allocates page for just page cache).
> Another cases(ie, allocates pages for other purpose except page cache,
> ex device drivers or fs allocation for internal using) aren't
> affected.
> So, It would be better.
> 
> The goal in this patch is to remove only page cache page, isn't it?
> So I think we could the balance check in add_to_page_cache and trigger reclaim.
> If we do so, what's the problem?
>

I see it as a tradeoff of when to check? add_to_page_cache or when we
are want more free memory (due to allocation). It is OK to wakeup
kswapd while allocating memory, somehow for this purpose (global page
cache), add_to_page_cache or add_to_page_cache_locked does not seem
the right place to hook into. I'd be open to comments/suggestions
though from others as well.
 
> >
> >
> >>
> >>>                        mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
> >>>                        if (zone_watermark_ok(zone, order, mark,
> >>>                                    classzone_idx, alloc_flags))
> >>> @@ -4167,8 +4170,12 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
> >>>
> >>>                zone->spanned_pages = size;
> >>>                zone->present_pages = realsize;
> >>> +#if defined(CONFIG_UNMAPPED_PAGE_CONTROL) || defined(CONFIG_NUMA)
> >>>                zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
> >>>                                                / 100;
> >>> +               zone->max_unmapped_pages = (realsize*sysctl_max_unmapped_ratio)
> >>> +                                               / 100;
> >>> +#endif
> >>>  #ifdef CONFIG_NUMA
> >>>                zone->node = nid;
> >>>                zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
> >>> @@ -5084,6 +5091,7 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
> >>>        return 0;
> >>>  }
> >>>
> >>> +#if defined(CONFIG_UNMAPPED_PAGE_CONTROL) || defined(CONFIG_NUMA)
> >>>  int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
> >>>        void __user *buffer, size_t *length, loff_t *ppos)
> >>>  {
> >>> @@ -5100,6 +5108,23 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
> >>>        return 0;
> >>>  }
> >>>
> >>> +int sysctl_max_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
> >>> +       void __user *buffer, size_t *length, loff_t *ppos)
> >>> +{
> >>> +       struct zone *zone;
> >>> +       int rc;
> >>> +
> >>> +       rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
> >>> +       if (rc)
> >>> +               return rc;
> >>> +
> >>> +       for_each_zone(zone)
> >>> +               zone->max_unmapped_pages = (zone->present_pages *
> >>> +                               sysctl_max_unmapped_ratio) / 100;
> >>> +       return 0;
> >>> +}
> >>> +#endif
> >>> +
> >>>  #ifdef CONFIG_NUMA
> >>>  int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
> >>>        void __user *buffer, size_t *length, loff_t *ppos)
> >>> diff --git a/mm/vmscan.c b/mm/vmscan.c
> >>> index 02cc82e..6377411 100644
> >>> --- a/mm/vmscan.c
> >>> +++ b/mm/vmscan.c
> >>> @@ -159,6 +159,29 @@ static DECLARE_RWSEM(shrinker_rwsem);
> >>>  #define scanning_global_lru(sc)        (1)
> >>>  #endif
> >>>
> >>> +#if defined(CONFIG_UNMAPPED_PAGECACHE_CONTROL)
> >>> +static unsigned long reclaim_unmapped_pages(int priority, struct zone *zone,
> >>> +                                               struct scan_control *sc);
> >>> +static int unmapped_page_control __read_mostly;
> >>> +
> >>> +static int __init unmapped_page_control_parm(char *str)
> >>> +{
> >>> +       unmapped_page_control = 1;
> >>> +       /*
> >>> +        * XXX: Should we tweak swappiness here?
> >>> +        */
> >>> +       return 1;
> >>> +}
> >>> +__setup("unmapped_page_control", unmapped_page_control_parm);
> >>> +
> >>> +#else /* !CONFIG_UNMAPPED_PAGECACHE_CONTROL */
> >>> +static inline unsigned long reclaim_unmapped_pages(int priority,
> >>> +                               struct zone *zone, struct scan_control *sc)
> >>> +{
> >>> +       return 0;
> >>> +}
> >>> +#endif
> >>> +
> >>>  static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone,
> >>>                                                  struct scan_control *sc)
> >>>  {
> >>> @@ -2359,6 +2382,12 @@ loop_again:
> >>>                                shrink_active_list(SWAP_CLUSTER_MAX, zone,
> >>>                                                        &sc, priority, 0);
> >>>
> >>> +                       /*
> >>> +                        * We do unmapped page reclaim once here and once
> >>> +                        * below, so that we don't lose out
> >>> +                        */
> >>> +                       reclaim_unmapped_pages(priority, zone, &sc);
> >>> +
> >>>                        if (!zone_watermark_ok_safe(zone, order,
> >>>                                        high_wmark_pages(zone), 0, 0)) {
> >>>                                end_zone = i;
> >>> @@ -2396,6 +2425,11 @@ loop_again:
> >>>                                continue;
> >>>
> >>>                        sc.nr_scanned = 0;
> >>> +                       /*
> >>> +                        * Reclaim unmapped pages upfront, this should be
> >>> +                        * really cheap
> >>> +                        */
> >>> +                       reclaim_unmapped_pages(priority, zone, &sc);
> >>
> >> Why should we do by two phase?
> >> It's not a direct reclaim path. I mean it doesn't need to reclaim tighly
> >> If we can't reclaim enough, next allocation would wake up kswapd again
> >> and kswapd try it again.
> >>
> >
> > I am not sure I understand, the wakeup will occur only if the unmapped
> > pages are still above the max_unmapped_ratio. They are tunable control
> > points.
> 
> I mean you try to reclaim twice in one path.
> one is when select highest zone to reclaim.
> one is when VM reclaim the zone.
> 
> What's your intention?
> 

That is because some zones can be skipped, we need to ensure we go
through all zones, rather than selective zones (limited via search for
end_zone).

> 
> >
> >> And I have a concern. I already pointed out.
> >> If memory pressure is heavy and unmappd_pages is more than our
> >> threshold, this can move inactive's tail pages which are mapped into
> >> heads by reclaim_unmapped_pages. It can make confusing LRU order so
> >> working set can be evicted.
> >>
> >
> > Sorry, not sure  I understand completely? The LRU order is disrupted
> > because we selectively scan unmapped pages. shrink_page_list() will
> > ignore mapped pages and put them back in the LRU at head? Here is a
> > quick take on what happens
> >
> > zone_reclaim() will be invoked as a result of these patches and the
> > pages it tries to reclaim is very few (1 << order). Active list will
> > be shrunk only when the inactive anon or inactive list is low in size.
> > I don't see a major churn happening unless we keep failing to reclaim
> > unmapped pages. In any case we isolate inactive pages and try to
> > reclaim minimal memory, the churn is mostly in the inactive list if
> > the page is not reclaimed (am I missing anything?).
> 
> You understand my question completely. :)
> In inactive list, page order is important, too although it's weak
> lumpy and compaction as time goes by.
> If threshold up and down happens  frequently, victim pages in inactive
> list could move into head and it's not good.

But the assumption for LRU order to change happens only if the page
cannot be successfully freed, which means it is in some way active..
and needs to be moved no?

Thanks for the detailed review!

-- 
	Three Cheers,
	Balbir

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom policy in Canada: sign http://dissolvethecrtc.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH 3/3] Provide control over unmapped pages (v4)
  2011-01-28  6:48           ` Balbir Singh
@ 2011-01-28  7:24             ` Minchan Kim
  -1 siblings, 0 replies; 47+ messages in thread
From: Minchan Kim @ 2011-01-28  7:24 UTC (permalink / raw)
  To: balbir
  Cc: linux-mm, akpm, npiggin, kvm, linux-kernel, kosaki.motohiro, cl,
	kamezawa.hiroyu

On Fri, Jan 28, 2011 at 3:48 PM, Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> * MinChan Kim <minchan.kim@gmail.com> [2011-01-28 14:44:50]:
>
>> On Fri, Jan 28, 2011 at 11:56 AM, Balbir Singh
>> <balbir@linux.vnet.ibm.com> wrote:
>> > On Thu, Jan 27, 2011 at 4:42 AM, Minchan Kim <minchan.kim@gmail.com> wrote:
>> > [snip]
>> >
>> >>> index 7b56473..2ac8549 100644
>> >>> --- a/mm/page_alloc.c
>> >>> +++ b/mm/page_alloc.c
>> >>> @@ -1660,6 +1660,9 @@ zonelist_scan:
>> >>>                        unsigned long mark;
>> >>>                        int ret;
>> >>>
>> >>> +                       if (should_reclaim_unmapped_pages(zone))
>> >>> +                               wakeup_kswapd(zone, order, classzone_idx);
>> >>> +
>> >>
>> >> Do we really need the check in fastpath?
>> >> There are lost of caller of alloc_pages.
>> >> Many of them are not related to mapped pages.
>> >> Could we move the check into add_to_page_cache_locked?
>> >
>> > The check is a simple check to see if the unmapped pages need
>> > balancing, the reason I placed this check here is to allow other
>> > allocations to benefit as well, if there are some unmapped pages to be
>> > freed. add_to_page_cache_locked (check under a critical section) is
>> > even worse, IMHO.
>>
>> It just moves the overhead from general into specific case(ie,
>> allocates page for just page cache).
>> Another cases(ie, allocates pages for other purpose except page cache,
>> ex device drivers or fs allocation for internal using) aren't
>> affected.
>> So, It would be better.
>>
>> The goal in this patch is to remove only page cache page, isn't it?
>> So I think we could the balance check in add_to_page_cache and trigger reclaim.
>> If we do so, what's the problem?
>>
>
> I see it as a tradeoff of when to check? add_to_page_cache or when we
> are want more free memory (due to allocation). It is OK to wakeup
> kswapd while allocating memory, somehow for this purpose (global page
> cache), add_to_page_cache or add_to_page_cache_locked does not seem
> the right place to hook into. I'd be open to comments/suggestions
> though from others as well.
>
>> >
>> >
>> >>
>> >>>                        mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
>> >>>                        if (zone_watermark_ok(zone, order, mark,
>> >>>                                    classzone_idx, alloc_flags))
>> >>> @@ -4167,8 +4170,12 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
>> >>>
>> >>>                zone->spanned_pages = size;
>> >>>                zone->present_pages = realsize;
>> >>> +#if defined(CONFIG_UNMAPPED_PAGE_CONTROL) || defined(CONFIG_NUMA)
>> >>>                zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
>> >>>                                                / 100;
>> >>> +               zone->max_unmapped_pages = (realsize*sysctl_max_unmapped_ratio)
>> >>> +                                               / 100;
>> >>> +#endif
>> >>>  #ifdef CONFIG_NUMA
>> >>>                zone->node = nid;
>> >>>                zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
>> >>> @@ -5084,6 +5091,7 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
>> >>>        return 0;
>> >>>  }
>> >>>
>> >>> +#if defined(CONFIG_UNMAPPED_PAGE_CONTROL) || defined(CONFIG_NUMA)
>> >>>  int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
>> >>>        void __user *buffer, size_t *length, loff_t *ppos)
>> >>>  {
>> >>> @@ -5100,6 +5108,23 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
>> >>>        return 0;
>> >>>  }
>> >>>
>> >>> +int sysctl_max_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
>> >>> +       void __user *buffer, size_t *length, loff_t *ppos)
>> >>> +{
>> >>> +       struct zone *zone;
>> >>> +       int rc;
>> >>> +
>> >>> +       rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
>> >>> +       if (rc)
>> >>> +               return rc;
>> >>> +
>> >>> +       for_each_zone(zone)
>> >>> +               zone->max_unmapped_pages = (zone->present_pages *
>> >>> +                               sysctl_max_unmapped_ratio) / 100;
>> >>> +       return 0;
>> >>> +}
>> >>> +#endif
>> >>> +
>> >>>  #ifdef CONFIG_NUMA
>> >>>  int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
>> >>>        void __user *buffer, size_t *length, loff_t *ppos)
>> >>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>> >>> index 02cc82e..6377411 100644
>> >>> --- a/mm/vmscan.c
>> >>> +++ b/mm/vmscan.c
>> >>> @@ -159,6 +159,29 @@ static DECLARE_RWSEM(shrinker_rwsem);
>> >>>  #define scanning_global_lru(sc)        (1)
>> >>>  #endif
>> >>>
>> >>> +#if defined(CONFIG_UNMAPPED_PAGECACHE_CONTROL)
>> >>> +static unsigned long reclaim_unmapped_pages(int priority, struct zone *zone,
>> >>> +                                               struct scan_control *sc);
>> >>> +static int unmapped_page_control __read_mostly;
>> >>> +
>> >>> +static int __init unmapped_page_control_parm(char *str)
>> >>> +{
>> >>> +       unmapped_page_control = 1;
>> >>> +       /*
>> >>> +        * XXX: Should we tweak swappiness here?
>> >>> +        */
>> >>> +       return 1;
>> >>> +}
>> >>> +__setup("unmapped_page_control", unmapped_page_control_parm);
>> >>> +
>> >>> +#else /* !CONFIG_UNMAPPED_PAGECACHE_CONTROL */
>> >>> +static inline unsigned long reclaim_unmapped_pages(int priority,
>> >>> +                               struct zone *zone, struct scan_control *sc)
>> >>> +{
>> >>> +       return 0;
>> >>> +}
>> >>> +#endif
>> >>> +
>> >>>  static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone,
>> >>>                                                  struct scan_control *sc)
>> >>>  {
>> >>> @@ -2359,6 +2382,12 @@ loop_again:
>> >>>                                shrink_active_list(SWAP_CLUSTER_MAX, zone,
>> >>>                                                        &sc, priority, 0);
>> >>>
>> >>> +                       /*
>> >>> +                        * We do unmapped page reclaim once here and once
>> >>> +                        * below, so that we don't lose out
>> >>> +                        */
>> >>> +                       reclaim_unmapped_pages(priority, zone, &sc);
>> >>> +
>> >>>                        if (!zone_watermark_ok_safe(zone, order,
>> >>>                                        high_wmark_pages(zone), 0, 0)) {
>> >>>                                end_zone = i;
>> >>> @@ -2396,6 +2425,11 @@ loop_again:
>> >>>                                continue;
>> >>>
>> >>>                        sc.nr_scanned = 0;
>> >>> +                       /*
>> >>> +                        * Reclaim unmapped pages upfront, this should be
>> >>> +                        * really cheap
>> >>> +                        */
>> >>> +                       reclaim_unmapped_pages(priority, zone, &sc);
>> >>
>> >> Why should we do by two phase?
>> >> It's not a direct reclaim path. I mean it doesn't need to reclaim tighly
>> >> If we can't reclaim enough, next allocation would wake up kswapd again
>> >> and kswapd try it again.
>> >>
>> >
>> > I am not sure I understand, the wakeup will occur only if the unmapped
>> > pages are still above the max_unmapped_ratio. They are tunable control
>> > points.
>>
>> I mean you try to reclaim twice in one path.
>> one is when select highest zone to reclaim.
>> one is when VM reclaim the zone.
>>
>> What's your intention?
>>
>
> That is because some zones can be skipped; we need to ensure we go
> through all zones rather than only selected zones (limited via the
> search for end_zone).

If kswapd is woken up because of the unmapped memory of some zone, we
have to include that zone while selecting victim zones so that we do
not miss it.
I think that would be better than reclaiming twice.
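
A minimal sketch of that alternative (hypothetical, not a posted
patch): fold the unmapped check into victim selection in the first
balance_pgdat() pass, so a zone with too many unmapped pages becomes
end_zone and is covered by the normal single reclaim pass;
should_reclaim_unmapped_pages() is the check already quoted from the
patch.

==
	if (!zone_watermark_ok_safe(zone, order,
				high_wmark_pages(zone), 0, 0) ||
	    should_reclaim_unmapped_pages(zone)) {
		end_zone = i;
		break;
	}
==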

>
>>
>> >
>> >> And I have a concern. I already pointed out.
>> >> If memory pressure is heavy and unmappd_pages is more than our
>> >> threshold, this can move inactive's tail pages which are mapped into
>> >> heads by reclaim_unmapped_pages. It can make confusing LRU order so
>> >> working set can be evicted.
>> >>
>> >
>> > Sorry, not sure  I understand completely? The LRU order is disrupted
>> > because we selectively scan unmapped pages. shrink_page_list() will
>> > ignore mapped pages and put them back in the LRU at head? Here is a
>> > quick take on what happens
>> >
>> > zone_reclaim() will be invoked as a result of these patches and the
>> > pages it tries to reclaim is very few (1 << order). Active list will
>> > be shrunk only when the inactive anon or inactive list is low in size.
>> > I don't see a major churn happening unless we keep failing to reclaim
>> > unmapped pages. In any case we isolate inactive pages and try to
>> > reclaim minimal memory, the churn is mostly in the inactive list if
>> > the page is not reclaimed (am I missing anything?).
>>
>> You understand my question completely. :)
>> In inactive list, page order is important, too although it's weak
>> lumpy and compaction as time goes by.
>> If threshold up and down happens  frequently, victim pages in inactive
>> list could move into head and it's not good.
>
> But the assumption is that the LRU order changes only if the page
> cannot be successfully freed, which means it is in some way active
> and needs to be moved, no?

1. pages held (pinned) by someone
2. mapped pages
3. active pages

Case 1 is rare, so it isn't the problem.
In case 3 we have to activate the page anyway, so that is no problem either.
The problem is case 2.

>
> Thanks for the detailed review!

Thanks for making it fun for me. :)

>
> --
>        Three Cheers,
>        Balbir
>



-- 
Kind regards,
Minchan Kim

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH 3/3] Provide control over unmapped pages (v4)
  2011-01-28  7:24             ` Minchan Kim
  (?)
@ 2011-01-28  7:56               ` KAMEZAWA Hiroyuki
  -1 siblings, 0 replies; 47+ messages in thread
From: KAMEZAWA Hiroyuki @ 2011-01-28  7:56 UTC (permalink / raw)
  To: Minchan Kim
  Cc: balbir, linux-mm, akpm, npiggin, kvm, linux-kernel, kosaki.motohiro, cl

On Fri, 28 Jan 2011 16:24:19 +0900
Minchan Kim <minchan.kim@gmail.com> wrote:

> On Fri, Jan 28, 2011 at 3:48 PM, Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> > * MinChan Kim <minchan.kim@gmail.com> [2011-01-28 14:44:50]:
> >
> >> On Fri, Jan 28, 2011 at 11:56 AM, Balbir Singh
> >> <balbir@linux.vnet.ibm.com> wrote:
> >> > On Thu, Jan 27, 2011 at 4:42 AM, Minchan Kim <minchan.kim@gmail.com> wrote:
> >> > [snip]
> >> >
> >> >>> index 7b56473..2ac8549 100644
> >> >>> --- a/mm/page_alloc.c
> >> >>> +++ b/mm/page_alloc.c
> >> >>> @@ -1660,6 +1660,9 @@ zonelist_scan:
> >> >>>                        unsigned long mark;
> >> >>>                        int ret;
> >> >>>
> >> >>> +                       if (should_reclaim_unmapped_pages(zone))
> >> >>> +                               wakeup_kswapd(zone, order, classzone_idx);
> >> >>> +
> >> >>
> >> >> Do we really need the check in fastpath?
> >> >> There are lost of caller of alloc_pages.
> >> >> Many of them are not related to mapped pages.
> >> >> Could we move the check into add_to_page_cache_locked?
> >> >
> >> > The check is a simple check to see if the unmapped pages need
> >> > balancing, the reason I placed this check here is to allow other
> >> > allocations to benefit as well, if there are some unmapped pages to be
> >> > freed. add_to_page_cache_locked (check under a critical section) is
> >> > even worse, IMHO.
> >>
> >> It just moves the overhead from general into specific case(ie,
> >> allocates page for just page cache).
> >> Another cases(ie, allocates pages for other purpose except page cache,
> >> ex device drivers or fs allocation for internal using) aren't
> >> affected.
> >> So, It would be better.
> >>
> >> The goal in this patch is to remove only page cache page, isn't it?
> >> So I think we could the balance check in add_to_page_cache and trigger reclaim.
> >> If we do so, what's the problem?
> >>
> >
> > I see it as a tradeoff of when to check? add_to_page_cache or when we
> > are want more free memory (due to allocation). It is OK to wakeup
> > kswapd while allocating memory, somehow for this purpose (global page
> > cache), add_to_page_cache or add_to_page_cache_locked does not seem
> > the right place to hook into. I'd be open to comments/suggestions
> > though from others as well.

I don't like adding a hook here.
AND I don't want to run kswapd, because 'kswapd' has been a sign that
there is a memory shortage. (Reusing the code is ok.)

How about adding a new daemon? Recently, khugepaged and ksmd were
introduced to manage memory. Adding one more daemon for a special
purpose is not very bad, I think. Then you could do the following
(sketched below):
 - wake up without a hook
 - throttle its work
 - balance the whole system rather than a single zone.
   I think per-node balance is enough...
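
A rough sketch of such a daemon, purely illustrative: the name kunmapd,
the 10 second interval and the helper balance_unmapped_pages() are made
up here; the real work would reuse the reclaim machinery from these
patches (should_reclaim_unmapped_pages() is the check quoted above).

==
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/mmzone.h>
#include <linux/sched.h>
#include <linux/jiffies.h>
#include <linux/err.h>
#include <linux/init.h>

static int kunmapd(void *unused)
{
	set_freezable();

	while (!kthread_should_stop()) {
		struct zone *zone;

		try_to_freeze();

		for_each_populated_zone(zone) {
			/* check from the patch under discussion */
			if (should_reclaim_unmapped_pages(zone))
				balance_unmapped_pages(zone); /* hypothetical */
		}

		/* throttle: periodic scan instead of an allocation hook */
		schedule_timeout_interruptible(msecs_to_jiffies(10 * 1000));
	}
	return 0;
}

static int __init kunmapd_init(void)
{
	struct task_struct *t = kthread_run(kunmapd, NULL, "kunmapd");

	return IS_ERR(t) ? PTR_ERR(t) : 0;
}
late_initcall(kunmapd_init);
==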








> >
> >> >
> >> >
> >> >>
> >> >>>                        mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
> >> >>>                        if (zone_watermark_ok(zone, order, mark,
> >> >>>                                    classzone_idx, alloc_flags))
> >> >>> @@ -4167,8 +4170,12 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
> >> >>>
> >> >>>                zone->spanned_pages = size;
> >> >>>                zone->present_pages = realsize;
> >> >>> +#if defined(CONFIG_UNMAPPED_PAGE_CONTROL) || defined(CONFIG_NUMA)
> >> >>>                zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
> >> >>>                                                / 100;
> >> >>> +               zone->max_unmapped_pages = (realsize*sysctl_max_unmapped_ratio)
> >> >>> +                                               / 100;
> >> >>> +#endif
> >> >>>  #ifdef CONFIG_NUMA
> >> >>>                zone->node = nid;
> >> >>>                zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
> >> >>> @@ -5084,6 +5091,7 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
> >> >>>        return 0;
> >> >>>  }
> >> >>>
> >> >>> +#if defined(CONFIG_UNMAPPED_PAGE_CONTROL) || defined(CONFIG_NUMA)
> >> >>>  int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
> >> >>>        void __user *buffer, size_t *length, loff_t *ppos)
> >> >>>  {
> >> >>> @@ -5100,6 +5108,23 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
> >> >>>        return 0;
> >> >>>  }
> >> >>>
> >> >>> +int sysctl_max_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
> >> >>> +       void __user *buffer, size_t *length, loff_t *ppos)
> >> >>> +{
> >> >>> +       struct zone *zone;
> >> >>> +       int rc;
> >> >>> +
> >> >>> +       rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
> >> >>> +       if (rc)
> >> >>> +               return rc;
> >> >>> +
> >> >>> +       for_each_zone(zone)
> >> >>> +               zone->max_unmapped_pages = (zone->present_pages *
> >> >>> +                               sysctl_max_unmapped_ratio) / 100;
> >> >>> +       return 0;
> >> >>> +}
> >> >>> +#endif
> >> >>> +
> >> >>>  #ifdef CONFIG_NUMA
> >> >>>  int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
> >> >>>        void __user *buffer, size_t *length, loff_t *ppos)
> >> >>> diff --git a/mm/vmscan.c b/mm/vmscan.c
> >> >>> index 02cc82e..6377411 100644
> >> >>> --- a/mm/vmscan.c
> >> >>> +++ b/mm/vmscan.c
> >> >>> @@ -159,6 +159,29 @@ static DECLARE_RWSEM(shrinker_rwsem);
> >> >>>  #define scanning_global_lru(sc)        (1)
> >> >>>  #endif
> >> >>>
> >> >>> +#if defined(CONFIG_UNMAPPED_PAGECACHE_CONTROL)
> >> >>> +static unsigned long reclaim_unmapped_pages(int priority, struct zone *zone,
> >> >>> +                                               struct scan_control *sc);
> >> >>> +static int unmapped_page_control __read_mostly;
> >> >>> +
> >> >>> +static int __init unmapped_page_control_parm(char *str)
> >> >>> +{
> >> >>> +       unmapped_page_control = 1;
> >> >>> +       /*
> >> >>> +        * XXX: Should we tweak swappiness here?
> >> >>> +        */
> >> >>> +       return 1;
> >> >>> +}
> >> >>> +__setup("unmapped_page_control", unmapped_page_control_parm);
> >> >>> +
> >> >>> +#else /* !CONFIG_UNMAPPED_PAGECACHE_CONTROL */
> >> >>> +static inline unsigned long reclaim_unmapped_pages(int priority,
> >> >>> +                               struct zone *zone, struct scan_control *sc)
> >> >>> +{
> >> >>> +       return 0;
> >> >>> +}
> >> >>> +#endif
> >> >>> +
> >> >>>  static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone,
> >> >>>                                                  struct scan_control *sc)
> >> >>>  {
> >> >>> @@ -2359,6 +2382,12 @@ loop_again:
> >> >>>                                shrink_active_list(SWAP_CLUSTER_MAX, zone,
> >> >>>                                                        &sc, priority, 0);
> >> >>>
> >> >>> +                       /*
> >> >>> +                        * We do unmapped page reclaim once here and once
> >> >>> +                        * below, so that we don't lose out
> >> >>> +                        */
> >> >>> +                       reclaim_unmapped_pages(priority, zone, &sc);
> >> >>> +
> >> >>>                        if (!zone_watermark_ok_safe(zone, order,
> >> >>>                                        high_wmark_pages(zone), 0, 0)) {
> >> >>>                                end_zone = i;
> >> >>> @@ -2396,6 +2425,11 @@ loop_again:
> >> >>>                                continue;
> >> >>>
> >> >>>                        sc.nr_scanned = 0;
> >> >>> +                       /*
> >> >>> +                        * Reclaim unmapped pages upfront, this should be
> >> >>> +                        * really cheap
> >> >>> +                        */
> >> >>> +                       reclaim_unmapped_pages(priority, zone, &sc);
> >> >>
> >> >> Why should we do by two phase?
> >> >> It's not a direct reclaim path. I mean it doesn't need to reclaim tighly
> >> >> If we can't reclaim enough, next allocation would wake up kswapd again
> >> >> and kswapd try it again.
> >> >>
> >> >
> >> > I am not sure I understand, the wakeup will occur only if the unmapped
> >> > pages are still above the max_unmapped_ratio. They are tunable control
> >> > points.
> >>
> >> I mean you try to reclaim twice in one path.
> >> one is when select highest zone to reclaim.
> >> one is when VM reclaim the zone.
> >>
> >> What's your intention?
> >>
> >
> > That is because some zones can be skipped; we need to ensure we go
> > through all zones rather than only selected zones (limited via the
> > search for end_zone).
> 
> If kswapd is woken up because of the unmapped memory of some zone, we
> have to include that zone while selecting victim zones so that we do
> not miss it.
> I think that would be better than reclaiming twice.
> 

That sounds like checking all zones and looping again would be enough.


BTW, it seems this doesn't work when some apps use huge shmem.
How do you handle that issue?

Thanks,
-Kame



^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH 3/3] Provide control over unmapped pages (v4)
@ 2011-01-28  7:56               ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 47+ messages in thread
From: KAMEZAWA Hiroyuki @ 2011-01-28  7:56 UTC (permalink / raw)
  To: Minchan Kim
  Cc: balbir, linux-mm, akpm, npiggin, kvm, linux-kernel, kosaki.motohiro, cl

On Fri, 28 Jan 2011 16:24:19 +0900
Minchan Kim <minchan.kim@gmail.com> wrote:

> On Fri, Jan 28, 2011 at 3:48 PM, Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> > * MinChan Kim <minchan.kim@gmail.com> [2011-01-28 14:44:50]:
> >
> >> On Fri, Jan 28, 2011 at 11:56 AM, Balbir Singh
> >> <balbir@linux.vnet.ibm.com> wrote:
> >> > On Thu, Jan 27, 2011 at 4:42 AM, Minchan Kim <minchan.kim@gmail.com> wrote:
> >> > [snip]
> >> >
> >> >>> index 7b56473..2ac8549 100644
> >> >>> --- a/mm/page_alloc.c
> >> >>> +++ b/mm/page_alloc.c
> >> >>> @@ -1660,6 +1660,9 @@ zonelist_scan:
> >> >>>                        unsigned long mark;
> >> >>>                        int ret;
> >> >>>
> >> >>> +                       if (should_reclaim_unmapped_pages(zone))
> >> >>> +                               wakeup_kswapd(zone, order, classzone_idx);
> >> >>> +
> >> >>
> >> >> Do we really need the check in fastpath?
> >> >> There are lost of caller of alloc_pages.
> >> >> Many of them are not related to mapped pages.
> >> >> Could we move the check into add_to_page_cache_locked?
> >> >
> >> > The check is a simple check to see if the unmapped pages need
> >> > balancing, the reason I placed this check here is to allow other
> >> > allocations to benefit as well, if there are some unmapped pages to be
> >> > freed. add_to_page_cache_locked (check under a critical section) is
> >> > even worse, IMHO.
> >>
> >> It just moves the overhead from general into specific case(ie,
> >> allocates page for just page cache).
> >> Another cases(ie, allocates pages for other purpose except page cache,
> >> ex device drivers or fs allocation for internal using) aren't
> >> affected.
> >> So, It would be better.
> >>
> >> The goal in this patch is to remove only page cache page, isn't it?
> >> So I think we could the balance check in add_to_page_cache and trigger reclaim.
> >> If we do so, what's the problem?
> >>
> >
> > I see it as a tradeoff of when to check? add_to_page_cache or when we
> > are want more free memory (due to allocation). It is OK to wakeup
> > kswapd while allocating memory, somehow for this purpose (global page
> > cache), add_to_page_cache or add_to_page_cache_locked does not seem
> > the right place to hook into. I'd be open to comments/suggestions
> > though from others as well.

I don't like add hook here.
AND I don't want to run kswapd because 'kswapd' has been a sign as
there are memory shortage. (reusing code is ok.)

How about adding new daemon ? Recently, khugepaged, ksmd works for
managing memory. Adding one more daemon for special purpose is not
very bad, I think. Then, you can do
 - wake up without hook
 - throttle its work.
 - balance the whole system rather than zone.
   I think per-node balance is enough...








> >
> >> >
> >> >
> >> >>
> >> >>>                        mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
> >> >>>                        if (zone_watermark_ok(zone, order, mark,
> >> >>>                                    classzone_idx, alloc_flags))
> >> >>> @@ -4167,8 +4170,12 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
> >> >>>
> >> >>>                zone->spanned_pages = size;
> >> >>>                zone->present_pages = realsize;
> >> >>> +#if defined(CONFIG_UNMAPPED_PAGE_CONTROL) || defined(CONFIG_NUMA)
> >> >>>                zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
> >> >>>                                                / 100;
> >> >>> +               zone->max_unmapped_pages = (realsize*sysctl_max_unmapped_ratio)
> >> >>> +                                               / 100;
> >> >>> +#endif
> >> >>>  #ifdef CONFIG_NUMA
> >> >>>                zone->node = nid;
> >> >>>                zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
> >> >>> @@ -5084,6 +5091,7 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
> >> >>>        return 0;
> >> >>>  }
> >> >>>
> >> >>> +#if defined(CONFIG_UNMAPPED_PAGE_CONTROL) || defined(CONFIG_NUMA)
> >> >>>  int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
> >> >>>        void __user *buffer, size_t *length, loff_t *ppos)
> >> >>>  {
> >> >>> @@ -5100,6 +5108,23 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
> >> >>>        return 0;
> >> >>>  }
> >> >>>
> >> >>> +int sysctl_max_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
> >> >>> +       void __user *buffer, size_t *length, loff_t *ppos)
> >> >>> +{
> >> >>> +       struct zone *zone;
> >> >>> +       int rc;
> >> >>> +
> >> >>> +       rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
> >> >>> +       if (rc)
> >> >>> +               return rc;
> >> >>> +
> >> >>> +       for_each_zone(zone)
> >> >>> +               zone->max_unmapped_pages = (zone->present_pages *
> >> >>> +                               sysctl_max_unmapped_ratio) / 100;
> >> >>> +       return 0;
> >> >>> +}
> >> >>> +#endif
> >> >>> +
> >> >>>  #ifdef CONFIG_NUMA
> >> >>>  int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
> >> >>>        void __user *buffer, size_t *length, loff_t *ppos)
> >> >>> diff --git a/mm/vmscan.c b/mm/vmscan.c
> >> >>> index 02cc82e..6377411 100644
> >> >>> --- a/mm/vmscan.c
> >> >>> +++ b/mm/vmscan.c
> >> >>> @@ -159,6 +159,29 @@ static DECLARE_RWSEM(shrinker_rwsem);
> >> >>>  #define scanning_global_lru(sc)        (1)
> >> >>>  #endif
> >> >>>
> >> >>> +#if defined(CONFIG_UNMAPPED_PAGECACHE_CONTROL)
> >> >>> +static unsigned long reclaim_unmapped_pages(int priority, struct zone *zone,
> >> >>> +                                               struct scan_control *sc);
> >> >>> +static int unmapped_page_control __read_mostly;
> >> >>> +
> >> >>> +static int __init unmapped_page_control_parm(char *str)
> >> >>> +{
> >> >>> +       unmapped_page_control = 1;
> >> >>> +       /*
> >> >>> +        * XXX: Should we tweak swappiness here?
> >> >>> +        */
> >> >>> +       return 1;
> >> >>> +}
> >> >>> +__setup("unmapped_page_control", unmapped_page_control_parm);
> >> >>> +
> >> >>> +#else /* !CONFIG_UNMAPPED_PAGECACHE_CONTROL */
> >> >>> +static inline unsigned long reclaim_unmapped_pages(int priority,
> >> >>> +                               struct zone *zone, struct scan_control *sc)
> >> >>> +{
> >> >>> +       return 0;
> >> >>> +}
> >> >>> +#endif
> >> >>> +
> >> >>>  static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone,
> >> >>>                                                  struct scan_control *sc)
> >> >>>  {
> >> >>> @@ -2359,6 +2382,12 @@ loop_again:
> >> >>>                                shrink_active_list(SWAP_CLUSTER_MAX, zone,
> >> >>>                                                        &sc, priority, 0);
> >> >>>
> >> >>> +                       /*
> >> >>> +                        * We do unmapped page reclaim once here and once
> >> >>> +                        * below, so that we don't lose out
> >> >>> +                        */
> >> >>> +                       reclaim_unmapped_pages(priority, zone, &sc);
> >> >>> +
> >> >>>                        if (!zone_watermark_ok_safe(zone, order,
> >> >>>                                        high_wmark_pages(zone), 0, 0)) {
> >> >>>                                end_zone = i;
> >> >>> @@ -2396,6 +2425,11 @@ loop_again:
> >> >>>                                continue;
> >> >>>
> >> >>>                        sc.nr_scanned = 0;
> >> >>> +                       /*
> >> >>> +                        * Reclaim unmapped pages upfront, this should be
> >> >>> +                        * really cheap
> >> >>> +                        */
> >> >>> +                       reclaim_unmapped_pages(priority, zone, &sc);
> >> >>
> >> >> Why should we do by two phase?
> >> >> It's not a direct reclaim path. I mean it doesn't need to reclaim tighly
> >> >> If we can't reclaim enough, next allocation would wake up kswapd again
> >> >> and kswapd try it again.
> >> >>
> >> >
> >> > I am not sure I understand, the wakeup will occur only if the unmapped
> >> > pages are still above the max_unmapped_ratio. They are tunable control
> >> > points.
> >>
> >> I mean you try to reclaim twice in one path.
> >> one is when select highest zone to reclaim.
> >> one is when VM reclaim the zone.
> >>
> >> What's your intention?
> >>
> >
> > That is because some zones can be skipped, we need to ensure we go
> > through all zones, rather than selective zones (limited via search for
> > end_zone).
> 
> If kswapd is wake up by unmapped memory of some zone, we have to
> include the zone while selective victim zones to prevent miss the
> zone.
> I think it would be better than reclaiming twice
> 

That sounds checking all zones and loop again is enough.


BTW, it seems this doesn't work when some apps use huge shmem.
How to handle the issue ?

Thanks,
-Kame


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom policy in Canada: sign http://dissolvethecrtc.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH 3/3] Provide control over unmapped pages (v4)
@ 2011-01-28  7:56               ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 47+ messages in thread
From: KAMEZAWA Hiroyuki @ 2011-01-28  7:56 UTC (permalink / raw)
  To: Minchan Kim
  Cc: balbir, linux-mm, akpm, npiggin, kvm, linux-kernel, kosaki.motohiro, cl

On Fri, 28 Jan 2011 16:24:19 +0900
Minchan Kim <minchan.kim@gmail.com> wrote:

> On Fri, Jan 28, 2011 at 3:48 PM, Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> > * MinChan Kim <minchan.kim@gmail.com> [2011-01-28 14:44:50]:
> >
> >> On Fri, Jan 28, 2011 at 11:56 AM, Balbir Singh
> >> <balbir@linux.vnet.ibm.com> wrote:
> >> > On Thu, Jan 27, 2011 at 4:42 AM, Minchan Kim <minchan.kim@gmail.com> wrote:
> >> > [snip]
> >> >
> >> >>> index 7b56473..2ac8549 100644
> >> >>> --- a/mm/page_alloc.c
> >> >>> +++ b/mm/page_alloc.c
> >> >>> @@ -1660,6 +1660,9 @@ zonelist_scan:
> >> >>> A  A  A  A  A  A  A  A  A  A  A  A unsigned long mark;
> >> >>> A  A  A  A  A  A  A  A  A  A  A  A int ret;
> >> >>>
> >> >>> + A  A  A  A  A  A  A  A  A  A  A  if (should_reclaim_unmapped_pages(zone))
> >> >>> + A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  wakeup_kswapd(zone, order, classzone_idx);
> >> >>> +
> >> >>
> >> >> Do we really need the check in fastpath?
> >> >> There are lost of caller of alloc_pages.
> >> >> Many of them are not related to mapped pages.
> >> >> Could we move the check into add_to_page_cache_locked?
> >> >
> >> > The check is a simple check to see if the unmapped pages need
> >> > balancing, the reason I placed this check here is to allow other
> >> > allocations to benefit as well, if there are some unmapped pages to be
> >> > freed. add_to_page_cache_locked (check under a critical section) is
> >> > even worse, IMHO.
> >>
> >> It just moves the overhead from general into specific case(ie,
> >> allocates page for just page cache).
> >> Another cases(ie, allocates pages for other purpose except page cache,
> >> ex device drivers or fs allocation for internal using) aren't
> >> affected.
> >> So, It would be better.
> >>
> >> The goal in this patch is to remove only page cache page, isn't it?
> >> So I think we could the balance check in add_to_page_cache and trigger reclaim.
> >> If we do so, what's the problem?
> >>
> >
> > I see it as a tradeoff of when to check? add_to_page_cache or when we
> > are want more free memory (due to allocation). It is OK to wakeup
> > kswapd while allocating memory, somehow for this purpose (global page
> > cache), add_to_page_cache or add_to_page_cache_locked does not seem
> > the right place to hook into. I'd be open to comments/suggestions
> > though from others as well.

I don't like add hook here.
AND I don't want to run kswapd because 'kswapd' has been a sign as
there are memory shortage. (reusing code is ok.)

How about adding new daemon ? Recently, khugepaged, ksmd works for
managing memory. Adding one more daemon for special purpose is not
very bad, I think. Then, you can do
 - wake up without hook
 - throttle its work.
 - balance the whole system rather than zone.
   I think per-node balance is enough...








> >
> >> >
> >> >
> >> >>
> >> >>> A  A  A  A  A  A  A  A  A  A  A  A mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
> >> >>> A  A  A  A  A  A  A  A  A  A  A  A if (zone_watermark_ok(zone, order, mark,
> >> >>> A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A classzone_idx, alloc_flags))
> >> >>> @@ -4167,8 +4170,12 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
> >> >>>
> >> >>> A  A  A  A  A  A  A  A zone->spanned_pages = size;
> >> >>> A  A  A  A  A  A  A  A zone->present_pages = realsize;
> >> >>> +#if defined(CONFIG_UNMAPPED_PAGE_CONTROL) || defined(CONFIG_NUMA)
> >> >>> A  A  A  A  A  A  A  A zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
> >> >>> A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A / 100;
> >> >>> + A  A  A  A  A  A  A  zone->max_unmapped_pages = (realsize*sysctl_max_unmapped_ratio)
> >> >>> + A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  / 100;
> >> >>> +#endif
> >> >>> A #ifdef CONFIG_NUMA
> >> >>> A  A  A  A  A  A  A  A zone->node = nid;
> >> >>> A  A  A  A  A  A  A  A zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
> >> >>> @@ -5084,6 +5091,7 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
> >> >>> A  A  A  A return 0;
> >> >>> A }
> >> >>>
> >> >>> +#if defined(CONFIG_UNMAPPED_PAGE_CONTROL) || defined(CONFIG_NUMA)
> >> >>> A int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
> >> >>> A  A  A  A void __user *buffer, size_t *length, loff_t *ppos)
> >> >>> A {
> >> >>> @@ -5100,6 +5108,23 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
> >> >>> A  A  A  A return 0;
> >> >>> A }
> >> >>>
> >> >>> +int sysctl_max_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
> >> >>> + A  A  A  void __user *buffer, size_t *length, loff_t *ppos)
> >> >>> +{
> >> >>> + A  A  A  struct zone *zone;
> >> >>> + A  A  A  int rc;
> >> >>> +
> >> >>> + A  A  A  rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
> >> >>> + A  A  A  if (rc)
> >> >>> + A  A  A  A  A  A  A  return rc;
> >> >>> +
> >> >>> + A  A  A  for_each_zone(zone)
> >> >>> + A  A  A  A  A  A  A  zone->max_unmapped_pages = (zone->present_pages *
> >> >>> + A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  sysctl_max_unmapped_ratio) / 100;
> >> >>> + A  A  A  return 0;
> >> >>> +}
> >> >>> +#endif
> >> >>> +
> >> >>> A #ifdef CONFIG_NUMA
> >> >>> A int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
> >> >>> A  A  A  A void __user *buffer, size_t *length, loff_t *ppos)
> >> >>> diff --git a/mm/vmscan.c b/mm/vmscan.c
> >> >>> index 02cc82e..6377411 100644
> >> >>> --- a/mm/vmscan.c
> >> >>> +++ b/mm/vmscan.c
> >> >>> @@ -159,6 +159,29 @@ static DECLARE_RWSEM(shrinker_rwsem);
> >> >>> A #define scanning_global_lru(sc) A  A  A  A (1)
> >> >>> A #endif
> >> >>>
> >> >>> +#if defined(CONFIG_UNMAPPED_PAGECACHE_CONTROL)
> >> >>> +static unsigned long reclaim_unmapped_pages(int priority, struct zone *zone,
> >> >>> + A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  struct scan_control *sc);
> >> >>> +static int unmapped_page_control __read_mostly;
> >> >>> +
> >> >>> +static int __init unmapped_page_control_parm(char *str)
> >> >>> +{
> >> >>> + A  A  A  unmapped_page_control = 1;
> >> >>> + A  A  A  /*
> >> >>> + A  A  A  A * XXX: Should we tweak swappiness here?
> >> >>> + A  A  A  A */
> >> >>> + A  A  A  return 1;
> >> >>> +}
> >> >>> +__setup("unmapped_page_control", unmapped_page_control_parm);
> >> >>> +
> >> >>> +#else /* !CONFIG_UNMAPPED_PAGECACHE_CONTROL */
> >> >>> +static inline unsigned long reclaim_unmapped_pages(int priority,
> >> >>> + A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  struct zone *zone, struct scan_control *sc)
> >> >>> +{
> >> >>> + A  A  A  return 0;
> >> >>> +}
> >> >>> +#endif
> >> >>> +
> >> >>> A static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone,
> >> >>> A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A struct scan_control *sc)
> >> >>> A {
> >> >>> @@ -2359,6 +2382,12 @@ loop_again:
> >> >>> A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A shrink_active_list(SWAP_CLUSTER_MAX, zone,
> >> >>> A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A &sc, priority, 0);
> >> >>>
> >> >>> + A  A  A  A  A  A  A  A  A  A  A  /*
> >> >>> + A  A  A  A  A  A  A  A  A  A  A  A * We do unmapped page reclaim once here and once
> >> >>> + A  A  A  A  A  A  A  A  A  A  A  A * below, so that we don't lose out
> >> >>> + A  A  A  A  A  A  A  A  A  A  A  A */
> >> >>> + A  A  A  A  A  A  A  A  A  A  A  reclaim_unmapped_pages(priority, zone, &sc);
> >> >>> +
> >> >>> A  A  A  A  A  A  A  A  A  A  A  A if (!zone_watermark_ok_safe(zone, order,
> >> >>> A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A high_wmark_pages(zone), 0, 0)) {
> >> >>> A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A end_zone = i;
> >> >>> @@ -2396,6 +2425,11 @@ loop_again:
> >> >>> A  A  A  A  A  A  A  A  A  A  A  A  A  A  A  A continue;
> >> >>>
> >> >>> A  A  A  A  A  A  A  A  A  A  A  A sc.nr_scanned = 0;
> >> >>> + A  A  A  A  A  A  A  A  A  A  A  /*
> >> >>> + A  A  A  A  A  A  A  A  A  A  A  A * Reclaim unmapped pages upfront, this should be
> >> >>> + A  A  A  A  A  A  A  A  A  A  A  A * really cheap
> >> >>> + A  A  A  A  A  A  A  A  A  A  A  A */
> >> >>> + A  A  A  A  A  A  A  A  A  A  A  reclaim_unmapped_pages(priority, zone, &sc);
> >> >>
> >> >> Why should we do by two phase?
> >> >> It's not a direct reclaim path. I mean it doesn't need to reclaim tighly
> >> >> If we can't reclaim enough, next allocation would wake up kswapd again
> >> >> and kswapd try it again.
> >> >>
> >> >
> >> > I am not sure I understand, the wakeup will occur only if the unmapped
> >> > pages are still above the max_unmapped_ratio. They are tunable control
> >> > points.
> >>
> >> I mean you try to reclaim twice in one path.
> >> one is when select highest zone to reclaim.
> >> one is when VM reclaim the zone.
> >>
> >> What's your intention?
> >>
> >
> > That is because some zones can be skipped, we need to ensure we go
> > through all zones, rather than selective zones (limited via search for
> > end_zone).
> 
> If kswapd is wake up by unmapped memory of some zone, we have to
> include the zone while selective victim zones to prevent miss the
> zone.
> I think it would be better than reclaiming twice
> 

That sounds like checking all zones and looping again would be enough.


BTW, it seems this doesn't work when some apps use huge shmem.
How do you plan to handle that issue?

Thanks,
-Kame



^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH 3/3] Provide control over unmapped pages (v4)
  2011-01-28  8:19                 ` Balbir Singh
@ 2011-01-28  8:17                   ` KAMEZAWA Hiroyuki
  -1 siblings, 0 replies; 47+ messages in thread
From: KAMEZAWA Hiroyuki @ 2011-01-28  8:17 UTC (permalink / raw)
  To: balbir
  Cc: Minchan Kim, linux-mm, akpm, npiggin, kvm, linux-kernel,
	kosaki.motohiro, cl

On Fri, 28 Jan 2011 13:49:28 +0530
Balbir Singh <balbir@linux.vnet.ibm.com> wrote:

> * KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> [2011-01-28 16:56:05]:
 
> > BTW, it seems this doesn't work when some apps use huge shmem.
> > How to handle the issue ?
> >
> 
> Could you elaborate further? 
> 
==
static inline unsigned long zone_unmapped_file_pages(struct zone *zone)
{
        unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED);
        unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) +
                zone_page_state(zone, NR_ACTIVE_FILE);

        /*
         * It's possible for there to be more file mapped pages than
         * accounted for by the pages on the file LRU lists because
         * tmpfs pages accounted for as ANON can also be FILE_MAPPED
         */
        return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
}
==

Did you read it?
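
To make the point concrete, here is one way this bites, with made-up
numbers (not measurements from any real system): heavily mapped shmem
inflates NR_FILE_MAPPED relative to the file LRU and pushes the
estimate to zero.
==
/* Illustration only: invented numbers, same clamp as the helper above. */
static unsigned long example_unmapped_estimate(void)
{
	unsigned long file_lru    = 100000;	/* NR_INACTIVE_FILE + NR_ACTIVE_FILE */
	unsigned long file_mapped = 300000;	/* NR_FILE_MAPPED, inflated by mapped
						 * shmem pages that sit on the anon LRU */

	/*
	 * Returns 0, so the unmapped-page control never sees the
	 * genuinely unmapped file cache that is still in the zone.
	 */
	return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
}
==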

Thanks,
-Kame


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH 3/3] Provide control over unmapped pages (v4)
  2011-01-28  7:56               ` KAMEZAWA Hiroyuki
  (?)
@ 2011-01-28  8:19                 ` Balbir Singh
  -1 siblings, 0 replies; 47+ messages in thread
From: Balbir Singh @ 2011-01-28  8:19 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Minchan Kim, linux-mm, akpm, npiggin, kvm, linux-kernel,
	kosaki.motohiro, cl

* KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> [2011-01-28 16:56:05]:

> On Fri, 28 Jan 2011 16:24:19 +0900
> Minchan Kim <minchan.kim@gmail.com> wrote:
> 
> > On Fri, Jan 28, 2011 at 3:48 PM, Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> > > * MinChan Kim <minchan.kim@gmail.com> [2011-01-28 14:44:50]:
> > >
> > >> On Fri, Jan 28, 2011 at 11:56 AM, Balbir Singh
> > >> <balbir@linux.vnet.ibm.com> wrote:
> > >> > On Thu, Jan 27, 2011 at 4:42 AM, Minchan Kim <minchan.kim@gmail.com> wrote:
> > >> > [snip]
> > >> >
> > >> >>> index 7b56473..2ac8549 100644
> > >> >>> --- a/mm/page_alloc.c
> > >> >>> +++ b/mm/page_alloc.c
> > >> >>> @@ -1660,6 +1660,9 @@ zonelist_scan:
> > >> >>>                        unsigned long mark;
> > >> >>>                        int ret;
> > >> >>>
> > >> >>> +                       if (should_reclaim_unmapped_pages(zone))
> > >> >>> +                               wakeup_kswapd(zone, order, classzone_idx);
> > >> >>> +
> > >> >>
> > >> >> Do we really need the check in fastpath?
> > >> >> There are lost of caller of alloc_pages.
> > >> >> Many of them are not related to mapped pages.
> > >> >> Could we move the check into add_to_page_cache_locked?
> > >> >
> > >> > The check is a simple check to see if the unmapped pages need
> > >> > balancing, the reason I placed this check here is to allow other
> > >> > allocations to benefit as well, if there are some unmapped pages to be
> > >> > freed. add_to_page_cache_locked (check under a critical section) is
> > >> > even worse, IMHO.
> > >>
> > >> It just moves the overhead from general into specific case(ie,
> > >> allocates page for just page cache).
> > >> Another cases(ie, allocates pages for other purpose except page cache,
> > >> ex device drivers or fs allocation for internal using) aren't
> > >> affected.
> > >> So, It would be better.
> > >>
> > >> The goal in this patch is to remove only page cache page, isn't it?
> > >> So I think we could the balance check in add_to_page_cache and trigger reclaim.
> > >> If we do so, what's the problem?
> > >>
> > >
> > > I see it as a tradeoff of when to check? add_to_page_cache or when we
> > > are want more free memory (due to allocation). It is OK to wakeup
> > > kswapd while allocating memory, somehow for this purpose (global page
> > > cache), add_to_page_cache or add_to_page_cache_locked does not seem
> > > the right place to hook into. I'd be open to comments/suggestions
> > > though from others as well.
> 
> I don't like add hook here.
> AND I don't want to run kswapd because 'kswapd' has been a sign as
> there are memory shortage. (reusing code is ok.)
> 
> How about adding new daemon ? Recently, khugepaged, ksmd works for
> managing memory. Adding one more daemon for special purpose is not
> very bad, I think. Then, you can do
>  - wake up without hook
>  - throttle its work.
>  - balance the whole system rather than zone.
>    I think per-node balance is enough...
> 
> 
> 
> 
>

Honestly, I did look at that option, but balancing via kswapd seemed
like the best choice. Creating a new thread/daemon did not make sense
because

1. The control is very loose.
2. kswapd can deal with it while balancing other things; in fact,
imagine kswapd waking up to free memory when other free memory is
easily available. Adding parallel reclaim and more zone lock
contention does not help, IMHO.
3. kswapd does not indicate memory shortage per se; please see
min_free_kbytes_sysctl_handler. kswapd exists to balance the
nodes/zones. If you tune min_free_kbytes and kswapd runs, it does not
mean the system is short of memory.

> 
> 
> 
> > >
> > >> >
> > >> >
> > >> >>
> > >> >>>                        mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
> > >> >>>                        if (zone_watermark_ok(zone, order, mark,
> > >> >>>                                    classzone_idx, alloc_flags))
> > >> >>> @@ -4167,8 +4170,12 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
> > >> >>>
> > >> >>>                zone->spanned_pages = size;
> > >> >>>                zone->present_pages = realsize;
> > >> >>> +#if defined(CONFIG_UNMAPPED_PAGE_CONTROL) || defined(CONFIG_NUMA)
> > >> >>>                zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
> > >> >>>                                                / 100;
> > >> >>> +               zone->max_unmapped_pages = (realsize*sysctl_max_unmapped_ratio)
> > >> >>> +                                               / 100;
> > >> >>> +#endif
> > >> >>>  #ifdef CONFIG_NUMA
> > >> >>>                zone->node = nid;
> > >> >>>                zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
> > >> >>> @@ -5084,6 +5091,7 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
> > >> >>>        return 0;
> > >> >>>  }
> > >> >>>
> > >> >>> +#if defined(CONFIG_UNMAPPED_PAGE_CONTROL) || defined(CONFIG_NUMA)
> > >> >>>  int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
> > >> >>>        void __user *buffer, size_t *length, loff_t *ppos)
> > >> >>>  {
> > >> >>> @@ -5100,6 +5108,23 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
> > >> >>>        return 0;
> > >> >>>  }
> > >> >>>
> > >> >>> +int sysctl_max_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
> > >> >>> +       void __user *buffer, size_t *length, loff_t *ppos)
> > >> >>> +{
> > >> >>> +       struct zone *zone;
> > >> >>> +       int rc;
> > >> >>> +
> > >> >>> +       rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
> > >> >>> +       if (rc)
> > >> >>> +               return rc;
> > >> >>> +
> > >> >>> +       for_each_zone(zone)
> > >> >>> +               zone->max_unmapped_pages = (zone->present_pages *
> > >> >>> +                               sysctl_max_unmapped_ratio) / 100;
> > >> >>> +       return 0;
> > >> >>> +}
> > >> >>> +#endif
> > >> >>> +
> > >> >>>  #ifdef CONFIG_NUMA
> > >> >>>  int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
> > >> >>>        void __user *buffer, size_t *length, loff_t *ppos)
> > >> >>> diff --git a/mm/vmscan.c b/mm/vmscan.c
> > >> >>> index 02cc82e..6377411 100644
> > >> >>> --- a/mm/vmscan.c
> > >> >>> +++ b/mm/vmscan.c
> > >> >>> @@ -159,6 +159,29 @@ static DECLARE_RWSEM(shrinker_rwsem);
> > >> >>>  #define scanning_global_lru(sc)        (1)
> > >> >>>  #endif
> > >> >>>
> > >> >>> +#if defined(CONFIG_UNMAPPED_PAGECACHE_CONTROL)
> > >> >>> +static unsigned long reclaim_unmapped_pages(int priority, struct zone *zone,
> > >> >>> +                                               struct scan_control *sc);
> > >> >>> +static int unmapped_page_control __read_mostly;
> > >> >>> +
> > >> >>> +static int __init unmapped_page_control_parm(char *str)
> > >> >>> +{
> > >> >>> +       unmapped_page_control = 1;
> > >> >>> +       /*
> > >> >>> +        * XXX: Should we tweak swappiness here?
> > >> >>> +        */
> > >> >>> +       return 1;
> > >> >>> +}
> > >> >>> +__setup("unmapped_page_control", unmapped_page_control_parm);
> > >> >>> +
> > >> >>> +#else /* !CONFIG_UNMAPPED_PAGECACHE_CONTROL */
> > >> >>> +static inline unsigned long reclaim_unmapped_pages(int priority,
> > >> >>> +                               struct zone *zone, struct scan_control *sc)
> > >> >>> +{
> > >> >>> +       return 0;
> > >> >>> +}
> > >> >>> +#endif
> > >> >>> +
> > >> >>>  static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone,
> > >> >>>                                                  struct scan_control *sc)
> > >> >>>  {
> > >> >>> @@ -2359,6 +2382,12 @@ loop_again:
> > >> >>>                                shrink_active_list(SWAP_CLUSTER_MAX, zone,
> > >> >>>                                                        &sc, priority, 0);
> > >> >>>
> > >> >>> +                       /*
> > >> >>> +                        * We do unmapped page reclaim once here and once
> > >> >>> +                        * below, so that we don't lose out
> > >> >>> +                        */
> > >> >>> +                       reclaim_unmapped_pages(priority, zone, &sc);
> > >> >>> +
> > >> >>>                        if (!zone_watermark_ok_safe(zone, order,
> > >> >>>                                        high_wmark_pages(zone), 0, 0)) {
> > >> >>>                                end_zone = i;
> > >> >>> @@ -2396,6 +2425,11 @@ loop_again:
> > >> >>>                                continue;
> > >> >>>
> > >> >>>                        sc.nr_scanned = 0;
> > >> >>> +                       /*
> > >> >>> +                        * Reclaim unmapped pages upfront, this should be
> > >> >>> +                        * really cheap
> > >> >>> +                        */
> > >> >>> +                       reclaim_unmapped_pages(priority, zone, &sc);
> > >> >>
> > >> >> Why should we do by two phase?
> > >> >> It's not a direct reclaim path. I mean it doesn't need to reclaim tighly
> > >> >> If we can't reclaim enough, next allocation would wake up kswapd again
> > >> >> and kswapd try it again.
> > >> >>
> > >> >
> > >> > I am not sure I understand, the wakeup will occur only if the unmapped
> > >> > pages are still above the max_unmapped_ratio. They are tunable control
> > >> > points.
> > >>
> > >> I mean you try to reclaim twice in one path.
> > >> one is when select highest zone to reclaim.
> > >> one is when VM reclaim the zone.
> > >>
> > >> What's your intention?
> > >>
> > >
> > > That is because some zones can be skipped, we need to ensure we go
> > > through all zones, rather than selective zones (limited via search for
> > > end_zone).
> > 
> > If kswapd is wake up by unmapped memory of some zone, we have to
> > include the zone while selective victim zones to prevent miss the
> > zone.
> > I think it would be better than reclaiming twice
> > 
> 
> That sounds checking all zones and loop again is enough.
> 
> 
> BTW, it seems this doesn't work when some apps use huge shmem.
> How to handle the issue ?
>

Could you elaborate further? 

-- 
	Three Cheers,
	Balbir

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH 3/3] Provide control over unmapped pages (v4)
  2011-01-28  7:24             ` Minchan Kim
@ 2011-01-28 11:18               ` Balbir Singh
  -1 siblings, 0 replies; 47+ messages in thread
From: Balbir Singh @ 2011-01-28 11:18 UTC (permalink / raw)
  To: Minchan Kim
  Cc: linux-mm, akpm, npiggin, kvm, linux-kernel, kosaki.motohiro, cl,
	kamezawa.hiroyu

* MinChan Kim <minchan.kim@gmail.com> [2011-01-28 16:24:19]:

> >
> > But the assumption for LRU order to change happens only if the page
> > cannot be successfully freed, which means it is in some way active..
> > and needs to be moved no?
> 
> 1. holded page by someone
> 2. mapped pages
> 3. active pages
> 
> 1 is rare so it isn't the problem.
> Of course, in case of 3, we have to activate it so no problem.
> The problem is 2.
>

2 is a problem, but given the size aspects, not a big one. Like you
said, even lumpy reclaim affects it. Maybe the reclaim code could
honour may_unmap much earlier. 

-- 
	Three Cheers,
	Balbir

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH 3/3] Provide control over unmapped pages (v4)
  2011-01-28  8:17                   ` KAMEZAWA Hiroyuki
@ 2011-01-28 12:02                     ` Balbir Singh
  -1 siblings, 0 replies; 47+ messages in thread
From: Balbir Singh @ 2011-01-28 12:02 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Minchan Kim, linux-mm, akpm, npiggin, kvm, linux-kernel,
	kosaki.motohiro, cl

* KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> [2011-01-28 17:17:44]:

> On Fri, 28 Jan 2011 13:49:28 +0530
> Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> 
> > * KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> [2011-01-28 16:56:05]:
> 
> > > BTW, it seems this doesn't work when some apps use huge shmem.
> > > How to handle the issue ?
> > >
> > 
> > Could you elaborate further? 
> > 
> ==
> static inline unsigned long zone_unmapped_file_pages(struct zone *zone)
> {
>         unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED);
>         unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) +
>                 zone_page_state(zone, NR_ACTIVE_FILE);
> 
>         /*
>          * It's possible for there to be more file mapped pages than
>          * accounted for by the pages on the file LRU lists because
>          * tmpfs pages accounted for as ANON can also be FILE_MAPPED
>          */
>         return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
> }

Yes, I did :) The word huge confused me. I am not sure if there is an
easy accounting fix for this one, though given the approximate nature
of the control, I am not sure it would matter very much. But you do
have a very good point.

-- 
	Three Cheers,
	Balbir

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH 3/3] Provide control over unmapped pages (v4)
  2011-01-28  7:56               ` KAMEZAWA Hiroyuki
@ 2011-01-28 15:20                 ` Christoph Lameter
  -1 siblings, 0 replies; 47+ messages in thread
From: Christoph Lameter @ 2011-01-28 15:20 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Minchan Kim, balbir, linux-mm, akpm, npiggin, kvm, linux-kernel,
	kosaki.motohiro

On Fri, 28 Jan 2011, KAMEZAWA Hiroyuki wrote:

> > > I see it as a tradeoff of when to check? add_to_page_cache or when we
> > > are want more free memory (due to allocation). It is OK to wakeup
> > > kswapd while allocating memory, somehow for this purpose (global page
> > > cache), add_to_page_cache or add_to_page_cache_locked does not seem
> > > the right place to hook into. I'd be open to comments/suggestions
> > > though from others as well.
>
> I don't like add hook here.
> AND I don't want to run kswapd because 'kswapd' has been a sign as
> there are memory shortage. (reusing code is ok.)
>
> How about adding new daemon ? Recently, khugepaged, ksmd works for
> managing memory. Adding one more daemon for special purpose is not
> very bad, I think. Then, you can do
>  - wake up without hook
>  - throttle its work.
>  - balance the whole system rather than zone.
>    I think per-node balance is enough...


I think we already have enough kernel daemons floating around. They are
multiplying in an amazing way. What would be useful is to map all
the memory management background stuff into a process. May call this memd
instead? Perhaps we can fold khugepaged into kswapd as well etc.

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH 3/3] Provide control over unmapped pages (v4)
  2011-01-28 15:20                 ` Christoph Lameter
@ 2011-01-30 23:58                   ` KAMEZAWA Hiroyuki
  -1 siblings, 0 replies; 47+ messages in thread
From: KAMEZAWA Hiroyuki @ 2011-01-30 23:58 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Minchan Kim, balbir, linux-mm, akpm, npiggin, kvm, linux-kernel,
	kosaki.motohiro

On Fri, 28 Jan 2011 09:20:02 -0600 (CST)
Christoph Lameter <cl@linux.com> wrote:

> On Fri, 28 Jan 2011, KAMEZAWA Hiroyuki wrote:
> 
> > > > I see it as a tradeoff of when to check? add_to_page_cache or when we
> > > > are want more free memory (due to allocation). It is OK to wakeup
> > > > kswapd while allocating memory, somehow for this purpose (global page
> > > > cache), add_to_page_cache or add_to_page_cache_locked does not seem
> > > > the right place to hook into. I'd be open to comments/suggestions
> > > > though from others as well.
> >
> > I don't like add hook here.
> > AND I don't want to run kswapd because 'kswapd' has been a sign as
> > there are memory shortage. (reusing code is ok.)
> >
> > How about adding new daemon ? Recently, khugepaged, ksmd works for
> > managing memory. Adding one more daemon for special purpose is not
> > very bad, I think. Then, you can do
> >  - wake up without hook
> >  - throttle its work.
> >  - balance the whole system rather than zone.
> >    I think per-node balance is enough...
> 
> 
> I think we already have enough kernel daemons floating around. They are
> multiplying in an amazing way. What would be useful is to map all
> the memory management background stuff into a process. May call this memd
> instead? Perhaps we can fold khugepaged into kswapd as well etc.
> 

Is making kswapd slower for this "additional" work, requested by the user
rather than by the system, a good thing? I think a workqueue works well
enough; it scales based on the workload, if using a dedicated thread is bad.
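
Just to sketch that direction (untested; do_unmapped_reclaim() is only a
placeholder for whatever ends up doing the actual shrinking), a
self-rearming delayed work item could drive it without touching kswapd:
==
#include <linux/workqueue.h>
#include <linux/mmzone.h>

static void unmapped_reclaim_workfn(struct work_struct *work);
static DECLARE_DELAYED_WORK(unmapped_reclaim_work, unmapped_reclaim_workfn);

static void unmapped_reclaim_workfn(struct work_struct *work)
{
	struct zone *zone;

	for_each_populated_zone(zone) {
		/* same trigger the patch uses for the kswapd wakeup */
		if (should_reclaim_unmapped_pages(zone))
			do_unmapped_reclaim(zone);	/* placeholder */
	}

	/* re-arm; the period throttles how hard we trim the cache */
	schedule_delayed_work(&unmapped_reclaim_work, 10 * HZ);
}

/* the first schedule_delayed_work() would happen at init time */
==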

Thanks,
-Kame





^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH 3/3] Provide control over unmapped pages (v4)
  2011-01-30 23:58                   ` KAMEZAWA Hiroyuki
@ 2011-01-31  4:37                     ` Balbir Singh
  -1 siblings, 0 replies; 47+ messages in thread
From: Balbir Singh @ 2011-01-31  4:37 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Christoph Lameter, Minchan Kim, linux-mm, akpm, npiggin, kvm,
	linux-kernel, kosaki.motohiro

* KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> [2011-01-31 08:58:53]:

> On Fri, 28 Jan 2011 09:20:02 -0600 (CST)
> Christoph Lameter <cl@linux.com> wrote:
> 
> > On Fri, 28 Jan 2011, KAMEZAWA Hiroyuki wrote:
> > 
> > > > > I see it as a tradeoff of when to check? add_to_page_cache or when we
> > > > > are want more free memory (due to allocation). It is OK to wakeup
> > > > > kswapd while allocating memory, somehow for this purpose (global page
> > > > > cache), add_to_page_cache or add_to_page_cache_locked does not seem
> > > > > the right place to hook into. I'd be open to comments/suggestions
> > > > > though from others as well.
> > >
> > > I don't like add hook here.
> > > AND I don't want to run kswapd because 'kswapd' has been a sign as
> > > there are memory shortage. (reusing code is ok.)
> > >
> > > How about adding new daemon ? Recently, khugepaged, ksmd works for
> > > managing memory. Adding one more daemon for special purpose is not
> > > very bad, I think. Then, you can do
> > >  - wake up without hook
> > >  - throttle its work.
> > >  - balance the whole system rather than zone.
> > >    I think per-node balance is enough...
> > 
> > 
> > I think we already have enough kernel daemons floating around. They are
> > multiplying in an amazing way. What would be useful is to map all
> > the memory management background stuff into a process. May call this memd
> > instead? Perhaps we can fold khugepaged into kswapd as well etc.
> > 
> 
> Making kswapd slow for whis "additional", "requested by user, not by system"
> work is good thing ? I think workqueue works enough well, it's scale based on
> workloads, if using thread is bad.
>

Making it slow is a generic statement; kswapd is supposed to do
background reclaim, and in this case it services a special request for
unmapped pages, specifically and deliberately requested by the admin
via a boot option.
 
-- 
	Three Cheers,
	Balbir

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH 3/3] Provide control over unmapped pages (v4)
  2011-01-28 11:18               ` Balbir Singh
@ 2011-02-10  5:33                 ` Minchan Kim
  -1 siblings, 0 replies; 47+ messages in thread
From: Minchan Kim @ 2011-02-10  5:33 UTC (permalink / raw)
  To: balbir
  Cc: linux-mm, akpm, npiggin, kvm, linux-kernel, kosaki.motohiro, cl,
	kamezawa.hiroyu

Sorry for the late response.

On Fri, Jan 28, 2011 at 8:18 PM, Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> * MinChan Kim <minchan.kim@gmail.com> [2011-01-28 16:24:19]:
>
>> >
>> > But the assumption for LRU order to change happens only if the page
>> > cannot be successfully freed, which means it is in some way active..
>> > and needs to be moved no?
>>
>> 1. holded page by someone
>> 2. mapped pages
>> 3. active pages
>>
>> 1 is rare so it isn't the problem.
>> Of course, in case of 3, we have to activate it so no problem.
>> The problem is 2.
>>
>
> 2 is a problem, but due to the size aspects not a big one. Like you
> said even lumpy reclaim affects it. May be the reclaim code could
> honour may_unmap much earlier.

Even if it is, it's a trade-off to get big contiguous memory. I
don't want to add a new mess. (In addition, lumpy reclaim is being
displaced by compaction as time goes by.)
What I have in mind for preventing LRU-order damage is to put the
page back into its original position instead of at the head of the
LRU. Maybe that can help both the lumpy case and your case. But
that's another story.

How about this idea?

I borrowed the idea from CFLRU[1]:
- PCFLRU (Page-Cache-First LRU)

When we allocate a new page for the page cache, we add the page at the LRU's tail.
When we map the page cache page into a page table, we rotate the page to the LRU's head.

So the inactive list ends up looking like this:

M.P : mapped page
N.P : non-mapped page

HEAD-M.P-M.P-M.P-M.P-N.P-N.P-N.P-N.P-N.P-TAIL

The admin can set a threshold window size which determines when to stop
reclaiming non-mapped pages contiguously.

I think it needs some tweaks to the page cache/page mapping functions,
but we can use kswapd/direct reclaim without change.

Also, it changes the page reclaim policy completely, but that is just
what you want, I think. A rough sketch of the insertion and rotation
points is below.

[1] http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.100.6188&rep=rep1&type=pdf
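
A very rough sketch of just the insertion and rotation points
(hypothetical helper names; it ignores zone->lru_lock, the pagevec
paths and page isolation):
==
#include <linux/mmzone.h>
#include <linux/mm_types.h>
#include <linux/list.h>

/* on add_to_page_cache(): new page-cache pages start at the LRU's tail */
static void pcflru_add_page_cache(struct zone *zone, struct page *page)
{
	list_add_tail(&page->lru, &zone->lru[LRU_INACTIVE_FILE].list);
}

/* on page_add_file_rmap(): once the page gets mapped, rotate it to the head */
static void pcflru_page_mapped(struct zone *zone, struct page *page)
{
	list_move(&page->lru, &zone->lru[LRU_INACTIVE_FILE].list);
}
==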

>
> --
>        Three Cheers,
>        Balbir
>



-- 
Kind regards,
Minchan Kim

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH 3/3] Provide control over unmapped pages (v4)
  2011-02-10  5:33                 ` Minchan Kim
@ 2011-02-10  5:41                   ` Minchan Kim
  -1 siblings, 0 replies; 47+ messages in thread
From: Minchan Kim @ 2011-02-10  5:41 UTC (permalink / raw)
  To: balbir
  Cc: linux-mm, akpm, npiggin, kvm, linux-kernel, kosaki.motohiro, cl,
	kamezawa.hiroyu

I don't know why part of the message gets dropped only when I send it
to you. Maybe it's a gmail bug.

I hope the mail goes through intact this time. :)

On Thu, Feb 10, 2011 at 2:33 PM, Minchan Kim <minchan.kim@gmail.com> wrote:
> Sorry for late response.
>
> On Fri, Jan 28, 2011 at 8:18 PM, Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
>> * MinChan Kim <minchan.kim@gmail.com> [2011-01-28 16:24:19]:
>>
>>> >
>>> > But the assumption for LRU order to change happens only if the page
>>> > cannot be successfully freed, which means it is in some way active..
>>> > and needs to be moved no?
>>>
>>> 1. holded page by someone
>>> 2. mapped pages
>>> 3. active pages
>>>
>>> 1 is rare so it isn't the problem.
>>> Of course, in case of 3, we have to activate it so no problem.
>>> The problem is 2.
>>>
>>
>> 2 is a problem, but due to the size aspects not a big one. Like you
>> said even lumpy reclaim affects it. May be the reclaim code could
>> honour may_unmap much earlier.
>
> Even if it is, it's a trade-off to get a big contiguous memory. I
> don't want to add new mess. (In addition, lumpy is weak by compaction
> as time goes by)
> What I have in mind for preventing LRU ignore is that put the page
> into original position instead of head of lru. Maybe it can help the
> situation both lumpy and your case. But it's another story.
>
> How about the idea?
>
> I borrow the idea from CFLRU[1]
> - PCFLRU(Page-Cache First LRU)
>
> When we allocates new page for page cache, we adds the page into LRU's tail.
> When we map the page cache into page table, we rotate the page into LRU's head.
>
> So, inactive list's result is following as.
>
> M.P : mapped page
> N.P : none-mapped page
>
> HEAD-M.P-M.P-M.P-M.P-N.P-N.P-N.P-N.P-N.P-TAIL
>
> Admin can set threshold window size which determines stop reclaiming
> none-mapped page contiguously.
>
> I think it needs some tweak of page cache/page mapping functions but
> we can use kswapd/direct reclaim without change.
>
> Also, it can change page reclaim policy totally but it's just what you
> want, I think.
>
> [1] http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.100.6188&rep=rep1&type=pdf
>
>>
>> --
>>        Three Cheers,
>>        Balbir
>>
>
>
>
> --
> Kind regards,
> Minchan Kim
>



-- 
Kind regards,
Minchan Kim

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH 3/3] Provide control over unmapped pages (v4)
  2011-02-10  5:41                   ` Minchan Kim
@ 2011-02-13 17:33                     ` Balbir Singh
  -1 siblings, 0 replies; 47+ messages in thread
From: Balbir Singh @ 2011-02-13 17:33 UTC (permalink / raw)
  To: Minchan Kim
  Cc: linux-mm, akpm, npiggin, kvm, linux-kernel, kosaki.motohiro, cl,
	kamezawa.hiroyu

* MinChan Kim <minchan.kim@gmail.com> [2011-02-10 14:41:44]:

> I don't know why the part of message is deleted only when I send you.
> Maybe it's gmail bug.
> 
> I hope mail sending is successful in this turn. :)
> 
> On Thu, Feb 10, 2011 at 2:33 PM, Minchan Kim <minchan.kim@gmail.com> wrote:
> > Sorry for late response.
> >
> > On Fri, Jan 28, 2011 at 8:18 PM, Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> >> * MinChan Kim <minchan.kim@gmail.com> [2011-01-28 16:24:19]:
> >>
> >>> >
> >>> > But the assumption for LRU order to change happens only if the page
> >>> > cannot be successfully freed, which means it is in some way active..
> >>> > and needs to be moved no?
> >>>
> >>> 1. holded page by someone
> >>> 2. mapped pages
> >>> 3. active pages
> >>>
> >>> 1 is rare so it isn't the problem.
> >>> Of course, in case of 3, we have to activate it so no problem.
> >>> The problem is 2.
> >>>
> >>
> >> 2 is a problem, but due to the size aspects not a big one. Like you
> >> said even lumpy reclaim affects it. May be the reclaim code could
> >> honour may_unmap much earlier.
> >
> > Even if it is, it's a trade-off to get a big contiguous memory. I
> > don't want to add new mess. (In addition, lumpy is weak by compaction
> > as time goes by)
> > What I have in mind for preventing LRU ignore is that put the page
> > into original position instead of head of lru. Maybe it can help the
> > situation both lumpy and your case. But it's another story.
> >
> > How about the idea?
> >
> > I borrow the idea from CFLRU[1]
> > - PCFLRU(Page-Cache First LRU)
> >
> > When we allocates new page for page cache, we adds the page into LRU's tail.
> > When we map the page cache into page table, we rotate the page into LRU's head.
> >
> > So, inactive list's result is following as.
> >
> > M.P : mapped page
> > N.P : none-mapped page
> >
> > HEAD-M.P-M.P-M.P-M.P-N.P-N.P-N.P-N.P-N.P-TAIL
> >
> > Admin can set threshold window size which determines stop reclaiming
> > none-mapped page contiguously.
> >
> > I think it needs some tweak of page cache/page mapping functions but
> > we can use kswapd/direct reclaim without change.
> >
> > Also, it can change page reclaim policy totally but it's just what you
> > want, I think.
> >

I am not sure how this would work. Moreover, the idea behind
min_unmapped_pages is to keep sufficient unmapped page cache around for
FS metadata, and it has been working with the existing zone reclaim
code. What you propose is a more drastic re-organization of the LRU,
and I am not sure I have the appetite for it.
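
(For context, the floor itself comes from the existing NUMA-only sysctl
vm.min_unmapped_ratio, a percentage of each zone that is turned into a
per-zone page count; roughly as sketched below, approximated from memory
rather than quoted from the source.)

/*
 * Approximate sketch of how vm.min_unmapped_ratio becomes the per-zone
 * floor that zone reclaim compares against before it will touch
 * unmapped page cache.
 */
static void recalc_min_unmapped_pages(void)
{
	struct zone *zone;

	for_each_zone(zone)
		zone->min_unmapped_pages = (zone->present_pages *
				sysctl_min_unmapped_ratio) / 100;
}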

-- 
	Three Cheers,
	Balbir

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH 3/3] Provide control over unmapped pages (v4)
  2011-02-13 17:33                     ` Balbir Singh
@ 2011-02-16 23:45                       ` Minchan Kim
  -1 siblings, 0 replies; 47+ messages in thread
From: Minchan Kim @ 2011-02-16 23:45 UTC (permalink / raw)
  To: balbir
  Cc: linux-mm, akpm, npiggin, kvm, linux-kernel, kosaki.motohiro, cl,
	kamezawa.hiroyu

On Mon, Feb 14, 2011 at 2:33 AM, Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> * MinChan Kim <minchan.kim@gmail.com> [2011-02-10 14:41:44]:
>
>> I don't know why the part of message is deleted only when I send you.
>> Maybe it's gmail bug.
>>
>> I hope mail sending is successful in this turn. :)
>>
>> On Thu, Feb 10, 2011 at 2:33 PM, Minchan Kim <minchan.kim@gmail.com> wrote:
>> > Sorry for late response.
>> >
>> > On Fri, Jan 28, 2011 at 8:18 PM, Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
>> >> * MinChan Kim <minchan.kim@gmail.com> [2011-01-28 16:24:19]:
>> >>
>> >>> >
>> >>> > But the assumption for LRU order to change happens only if the page
>> >>> > cannot be successfully freed, which means it is in some way active..
>> >>> > and needs to be moved no?
>> >>>
>> >>> 1. holded page by someone
>> >>> 2. mapped pages
>> >>> 3. active pages
>> >>>
>> >>> 1 is rare so it isn't the problem.
>> >>> Of course, in case of 3, we have to activate it so no problem.
>> >>> The problem is 2.
>> >>>
>> >>
>> >> 2 is a problem, but due to the size aspects not a big one. Like you
>> >> said even lumpy reclaim affects it. May be the reclaim code could
>> >> honour may_unmap much earlier.
>> >
>> > Even if it is, it's a trade-off to get a big contiguous memory. I
>> > don't want to add new mess. (In addition, lumpy is weak by compaction
>> > as time goes by)
>> > What I have in mind for preventing LRU ignore is that put the page
>> > into original position instead of head of lru. Maybe it can help the
>> > situation both lumpy and your case. But it's another story.
>> >
>> > How about the idea?
>> >
>> > I borrow the idea from CFLRU[1]
>> > - PCFLRU(Page-Cache First LRU)
>> >
>> > When we allocates new page for page cache, we adds the page into LRU's tail.
>> > When we map the page cache into page table, we rotate the page into LRU's head.
>> >
>> > So, inactive list's result is following as.
>> >
>> > M.P : mapped page
>> > N.P : none-mapped page
>> >
>> > HEAD-M.P-M.P-M.P-M.P-N.P-N.P-N.P-N.P-N.P-TAIL
>> >
>> > Admin can set threshold window size which determines stop reclaiming
>> > none-mapped page contiguously.
>> >
>> > I think it needs some tweak of page cache/page mapping functions but
>> > we can use kswapd/direct reclaim without change.
>> >
>> > Also, it can change page reclaim policy totally but it's just what you
>> > want, I think.
>> >
>
> I am not sure how this would work, moreover the idea behind
> min_unmapped_pages is to keep sufficient unmapped pages around for the
> FS metadata and has been working with the existing code for zone
> reclaim. What you propose is more drastic re-org of the LRU and I am
> not sure I have the apetite for it.

Yes. My suggestion could change the LRU order completely, but it can't
meet your goal, so it was a bad idea. Sorry for bothering you.

I can add a Reviewed-by to [1/3] and [2/3], but I still have doubts
about [3/3]. The LRU ordering problem I mentioned is not only your
problem; it's a general problem these days (e.g., more aggressive
compaction/lumpy reclaim), so we may need a general solution if it is a
real problem. Okay, I won't oppose your approach from now on, at least
until I can show how much harm LRU reordering does. (I still raise an
eyebrow at the implementation of [3/3], but I won't oppose it unless I
can suggest a better approach.)

Thanks.
-- 
Kind regards,
Minchan Kim

^ permalink raw reply	[flat|nested] 47+ messages in thread

end of thread, other threads:[~2011-02-16 23:45 UTC | newest]

Thread overview: 47+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-01-25  5:10 [PATCH 1/2] Refactor zone_reclaim code (v4) Balbir Singh
2011-01-25  5:10 ` Balbir Singh
2011-01-25  5:10 ` [PATCH 3/3] Provide control over unmapped pages (v4) Balbir Singh
2011-01-25  5:10   ` Balbir Singh
2011-01-26 16:57   ` Christoph Lameter
2011-01-26 16:57     ` Christoph Lameter
2011-01-26 17:43     ` Balbir Singh
2011-01-26 17:43       ` Balbir Singh
2011-01-26 23:12   ` Minchan Kim
2011-01-26 23:12     ` Minchan Kim
2011-01-28  2:56     ` Balbir Singh
2011-01-28  2:56       ` Balbir Singh
2011-01-28  5:44       ` Minchan Kim
2011-01-28  5:44         ` Minchan Kim
2011-01-28  6:48         ` Balbir Singh
2011-01-28  6:48           ` Balbir Singh
2011-01-28  6:48           ` Balbir Singh
2011-01-28  7:24           ` Minchan Kim
2011-01-28  7:24             ` Minchan Kim
2011-01-28  7:56             ` KAMEZAWA Hiroyuki
2011-01-28  7:56               ` KAMEZAWA Hiroyuki
2011-01-28  7:56               ` KAMEZAWA Hiroyuki
2011-01-28  8:19               ` Balbir Singh
2011-01-28  8:19                 ` Balbir Singh
2011-01-28  8:19                 ` Balbir Singh
2011-01-28  8:17                 ` KAMEZAWA Hiroyuki
2011-01-28  8:17                   ` KAMEZAWA Hiroyuki
2011-01-28 12:02                   ` Balbir Singh
2011-01-28 12:02                     ` Balbir Singh
2011-01-28 15:20               ` Christoph Lameter
2011-01-28 15:20                 ` Christoph Lameter
2011-01-30 23:58                 ` KAMEZAWA Hiroyuki
2011-01-30 23:58                   ` KAMEZAWA Hiroyuki
2011-01-31  4:37                   ` Balbir Singh
2011-01-31  4:37                     ` Balbir Singh
2011-01-28 11:18             ` Balbir Singh
2011-01-28 11:18               ` Balbir Singh
2011-02-10  5:33               ` Minchan Kim
2011-02-10  5:33                 ` Minchan Kim
2011-02-10  5:41                 ` Minchan Kim
2011-02-10  5:41                   ` Minchan Kim
2011-02-13 17:33                   ` Balbir Singh
2011-02-13 17:33                     ` Balbir Singh
2011-02-16 23:45                     ` Minchan Kim
2011-02-16 23:45                       ` Minchan Kim
2011-01-25  5:15 ` [PATCH 1/2] Refactor zone_reclaim code (v4) Balbir Singh
2011-01-25  5:15   ` Balbir Singh
